cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-cache-target.c (84423B)


      1/*
      2 * Copyright (C) 2012 Red Hat. All rights reserved.
      3 *
      4 * This file is released under the GPL.
      5 */
      6
      7#include "dm.h"
      8#include "dm-bio-prison-v2.h"
      9#include "dm-bio-record.h"
     10#include "dm-cache-metadata.h"
     11#include "dm-io-tracker.h"
     12
     13#include <linux/dm-io.h>
     14#include <linux/dm-kcopyd.h>
     15#include <linux/jiffies.h>
     16#include <linux/init.h>
     17#include <linux/mempool.h>
     18#include <linux/module.h>
     19#include <linux/rwsem.h>
     20#include <linux/slab.h>
     21#include <linux/vmalloc.h>
     22
     23#define DM_MSG_PREFIX "cache"
     24
     25DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
     26	"A percentage of time allocated for copying to and/or from cache");
     27
     28/*----------------------------------------------------------------*/
     29
     30/*
     31 * Glossary:
     32 *
     33 * oblock: index of an origin block
     34 * cblock: index of a cache block
     35 * promotion: movement of a block from origin to cache
     36 * demotion: movement of a block from cache to origin
     37 * migration: movement of a block between the origin and cache device,
     38 *	      either direction
     39 */
     40
     41/*----------------------------------------------------------------*/
     42
     43/*
     44 * Represents a chunk of future work.  'input' allows continuations to pass
     45 * values between themselves, typically error values.
     46 */
     47struct continuation {
     48	struct work_struct ws;
     49	blk_status_t input;
     50};
     51
     52static inline void init_continuation(struct continuation *k,
     53				     void (*fn)(struct work_struct *))
     54{
     55	INIT_WORK(&k->ws, fn);
     56	k->input = 0;
     57}
     58
     59static inline void queue_continuation(struct workqueue_struct *wq,
     60				      struct continuation *k)
     61{
     62	queue_work(wq, &k->ws);
     63}
     64
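        /*
         * Usage sketch (illustrative only - my_ctx, my_work_fn and
         * do_something are placeholders, not part of this driver): a user
         * embeds a continuation in its own context, points it at a work
         * function, and recovers the context with container_of() once the
         * work runs; k->input carries any error from the previous stage.
         * dm_cache_migration and ws_to_mg() below follow this pattern.
         *
         *	struct my_ctx {
         *		struct continuation k;
         *	};
         *
         *	static void my_work_fn(struct work_struct *ws)
         *	{
         *		struct continuation *k = container_of(ws, struct continuation, ws);
         *		struct my_ctx *ctx = container_of(k, struct my_ctx, k);
         *
         *		do_something(ctx, k->input);
         *	}
         *
         *	init_continuation(&ctx->k, my_work_fn);
         *	queue_continuation(wq, &ctx->k);
         */
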
     65/*----------------------------------------------------------------*/
     66
     67/*
     68 * The batcher collects together pieces of work that need a particular
     69 * operation to occur before they can proceed (typically a commit).
     70 */
     71struct batcher {
     72	/*
     73	 * The operation that everyone is waiting for.
     74	 */
     75	blk_status_t (*commit_op)(void *context);
     76	void *commit_context;
     77
     78	/*
     79	 * This is how bios should be issued once the commit op is complete
     80	 * (accounted_request).
     81	 */
     82	void (*issue_op)(struct bio *bio, void *context);
     83	void *issue_context;
     84
     85	/*
     86	 * Queued work gets put on here after commit.
     87	 */
     88	struct workqueue_struct *wq;
     89
     90	spinlock_t lock;
     91	struct list_head work_items;
     92	struct bio_list bios;
     93	struct work_struct commit_work;
     94
     95	bool commit_scheduled;
     96};
     97
     98static void __commit(struct work_struct *_ws)
     99{
    100	struct batcher *b = container_of(_ws, struct batcher, commit_work);
    101	blk_status_t r;
    102	struct list_head work_items;
    103	struct work_struct *ws, *tmp;
    104	struct continuation *k;
    105	struct bio *bio;
    106	struct bio_list bios;
    107
    108	INIT_LIST_HEAD(&work_items);
    109	bio_list_init(&bios);
    110
    111	/*
    112	 * We have to grab these before the commit_op to avoid a race
    113	 * condition.
    114	 */
    115	spin_lock_irq(&b->lock);
    116	list_splice_init(&b->work_items, &work_items);
    117	bio_list_merge(&bios, &b->bios);
    118	bio_list_init(&b->bios);
    119	b->commit_scheduled = false;
    120	spin_unlock_irq(&b->lock);
    121
    122	r = b->commit_op(b->commit_context);
    123
    124	list_for_each_entry_safe(ws, tmp, &work_items, entry) {
    125		k = container_of(ws, struct continuation, ws);
    126		k->input = r;
    127		INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
    128		queue_work(b->wq, ws);
    129	}
    130
    131	while ((bio = bio_list_pop(&bios))) {
    132		if (r) {
    133			bio->bi_status = r;
    134			bio_endio(bio);
    135		} else
    136			b->issue_op(bio, b->issue_context);
    137	}
    138}
    139
    140static void batcher_init(struct batcher *b,
    141			 blk_status_t (*commit_op)(void *),
    142			 void *commit_context,
    143			 void (*issue_op)(struct bio *bio, void *),
    144			 void *issue_context,
    145			 struct workqueue_struct *wq)
    146{
    147	b->commit_op = commit_op;
    148	b->commit_context = commit_context;
    149	b->issue_op = issue_op;
    150	b->issue_context = issue_context;
    151	b->wq = wq;
    152
    153	spin_lock_init(&b->lock);
    154	INIT_LIST_HEAD(&b->work_items);
    155	bio_list_init(&b->bios);
    156	INIT_WORK(&b->commit_work, __commit);
    157	b->commit_scheduled = false;
    158}
    159
    160static void async_commit(struct batcher *b)
    161{
    162	queue_work(b->wq, &b->commit_work);
    163}
    164
    165static void continue_after_commit(struct batcher *b, struct continuation *k)
    166{
    167	bool commit_scheduled;
    168
    169	spin_lock_irq(&b->lock);
    170	commit_scheduled = b->commit_scheduled;
    171	list_add_tail(&k->ws.entry, &b->work_items);
    172	spin_unlock_irq(&b->lock);
    173
    174	if (commit_scheduled)
    175		async_commit(b);
    176}
    177
    178/*
    179 * Bios are errored if commit failed.
    180 */
    181static void issue_after_commit(struct batcher *b, struct bio *bio)
    182{
     183	bool commit_scheduled;
     184
     185	spin_lock_irq(&b->lock);
     186	commit_scheduled = b->commit_scheduled;
     187	bio_list_add(&b->bios, bio);
     188	spin_unlock_irq(&b->lock);
     189
     190	if (commit_scheduled)
     191		async_commit(b);
    192}
    193
    194/*
    195 * Call this if some urgent work is waiting for the commit to complete.
    196 */
    197static void schedule_commit(struct batcher *b)
    198{
    199	bool immediate;
    200
    201	spin_lock_irq(&b->lock);
    202	immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
    203	b->commit_scheduled = true;
    204	spin_unlock_irq(&b->lock);
    205
    206	if (immediate)
    207		async_commit(b);
    208}
    209
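        /*
         * Sketch of how this target wires the batcher up.  The real
         * batcher_init() call lives in the cache constructor further down
         * the file, so the call below is a paraphrase rather than a quote:
         *
         *	batcher_init(&cache->committer,
         *		     commit_op, cache,	(commits the metadata transaction)
         *		     issue_op, cache,	(reissues the held bios afterwards)
         *		     cache->wq);
         *
         * Work and bios that must wait for the next commit are queued via
         * continue_after_commit() and issue_after_commit(); schedule_commit()
         * then kicks __commit() as soon as something is actually waiting.
         */
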
    210/*
    211 * There are a couple of places where we let a bio run, but want to do some
    212 * work before calling its endio function.  We do this by temporarily
    213 * changing the endio fn.
    214 */
    215struct dm_hook_info {
    216	bio_end_io_t *bi_end_io;
    217};
    218
    219static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
    220			bio_end_io_t *bi_end_io, void *bi_private)
    221{
    222	h->bi_end_io = bio->bi_end_io;
    223
    224	bio->bi_end_io = bi_end_io;
    225	bio->bi_private = bi_private;
    226}
    227
    228static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
    229{
    230	bio->bi_end_io = h->bi_end_io;
    231}
    232
    233/*----------------------------------------------------------------*/
    234
    235#define MIGRATION_POOL_SIZE 128
    236#define COMMIT_PERIOD HZ
    237#define MIGRATION_COUNT_WINDOW 10
    238
    239/*
    240 * The block size of the device holding cache data must be
    241 * between 32KB and 1GB.
    242 */
    243#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
    244#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
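        /*
         * With 512 byte sectors (SECTOR_SHIFT == 9) these work out to 64 and
         * 2097152 sectors respectively.
         */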
    245
    246enum cache_metadata_mode {
    247	CM_WRITE,		/* metadata may be changed */
    248	CM_READ_ONLY,		/* metadata may not be changed */
    249	CM_FAIL
    250};
    251
    252enum cache_io_mode {
    253	/*
    254	 * Data is written to cached blocks only.  These blocks are marked
    255	 * dirty.  If you lose the cache device you will lose data.
    256	 * Potential performance increase for both reads and writes.
    257	 */
    258	CM_IO_WRITEBACK,
    259
    260	/*
    261	 * Data is written to both cache and origin.  Blocks are never
     262	 * dirty.  Potential performance benefit for reads only.
    263	 */
    264	CM_IO_WRITETHROUGH,
    265
    266	/*
    267	 * A degraded mode useful for various cache coherency situations
    268	 * (eg, rolling back snapshots).  Reads and writes always go to the
    269	 * origin.  If a write goes to a cached oblock, then the cache
    270	 * block is invalidated.
    271	 */
    272	CM_IO_PASSTHROUGH
    273};
    274
    275struct cache_features {
    276	enum cache_metadata_mode mode;
    277	enum cache_io_mode io_mode;
    278	unsigned metadata_version;
    279	bool discard_passdown:1;
    280};
    281
    282struct cache_stats {
    283	atomic_t read_hit;
    284	atomic_t read_miss;
    285	atomic_t write_hit;
    286	atomic_t write_miss;
    287	atomic_t demotion;
    288	atomic_t promotion;
    289	atomic_t writeback;
    290	atomic_t copies_avoided;
    291	atomic_t cache_cell_clash;
    292	atomic_t commit_count;
    293	atomic_t discard_count;
    294};
    295
    296struct cache {
    297	struct dm_target *ti;
    298	spinlock_t lock;
    299
    300	/*
    301	 * Fields for converting from sectors to blocks.
    302	 */
    303	int sectors_per_block_shift;
    304	sector_t sectors_per_block;
    305
    306	struct dm_cache_metadata *cmd;
    307
    308	/*
    309	 * Metadata is written to this device.
    310	 */
    311	struct dm_dev *metadata_dev;
    312
    313	/*
    314	 * The slower of the two data devices.  Typically a spindle.
    315	 */
    316	struct dm_dev *origin_dev;
    317
    318	/*
    319	 * The faster of the two data devices.  Typically an SSD.
    320	 */
    321	struct dm_dev *cache_dev;
    322
    323	/*
    324	 * Size of the origin device in _complete_ blocks and native sectors.
    325	 */
    326	dm_oblock_t origin_blocks;
    327	sector_t origin_sectors;
    328
    329	/*
    330	 * Size of the cache device in blocks.
    331	 */
    332	dm_cblock_t cache_size;
    333
    334	/*
    335	 * Invalidation fields.
    336	 */
    337	spinlock_t invalidation_lock;
    338	struct list_head invalidation_requests;
    339
    340	sector_t migration_threshold;
    341	wait_queue_head_t migration_wait;
    342	atomic_t nr_allocated_migrations;
    343
    344	/*
    345	 * The number of in flight migrations that are performing
    346	 * background io. eg, promotion, writeback.
    347	 */
    348	atomic_t nr_io_migrations;
    349
    350	struct bio_list deferred_bios;
    351
    352	struct rw_semaphore quiesce_lock;
    353
    354	/*
    355	 * origin_blocks entries, discarded if set.
    356	 */
    357	dm_dblock_t discard_nr_blocks;
    358	unsigned long *discard_bitset;
    359	uint32_t discard_block_size; /* a power of 2 times sectors per block */
    360
    361	/*
    362	 * Rather than reconstructing the table line for the status we just
    363	 * save it and regurgitate.
    364	 */
    365	unsigned nr_ctr_args;
    366	const char **ctr_args;
    367
    368	struct dm_kcopyd_client *copier;
    369	struct work_struct deferred_bio_worker;
    370	struct work_struct migration_worker;
    371	struct workqueue_struct *wq;
    372	struct delayed_work waker;
    373	struct dm_bio_prison_v2 *prison;
    374
    375	/*
    376	 * cache_size entries, dirty if set
    377	 */
    378	unsigned long *dirty_bitset;
    379	atomic_t nr_dirty;
    380
    381	unsigned policy_nr_args;
    382	struct dm_cache_policy *policy;
    383
    384	/*
    385	 * Cache features such as write-through.
    386	 */
    387	struct cache_features features;
    388
    389	struct cache_stats stats;
    390
    391	bool need_tick_bio:1;
    392	bool sized:1;
    393	bool invalidate:1;
    394	bool commit_requested:1;
    395	bool loaded_mappings:1;
    396	bool loaded_discards:1;
    397
    398	struct rw_semaphore background_work_lock;
    399
    400	struct batcher committer;
    401	struct work_struct commit_ws;
    402
    403	struct dm_io_tracker tracker;
    404
    405	mempool_t migration_pool;
    406
    407	struct bio_set bs;
    408};
    409
    410struct per_bio_data {
    411	bool tick:1;
    412	unsigned req_nr:2;
    413	struct dm_bio_prison_cell_v2 *cell;
    414	struct dm_hook_info hook_info;
    415	sector_t len;
    416};
    417
    418struct dm_cache_migration {
    419	struct continuation k;
    420	struct cache *cache;
    421
    422	struct policy_work *op;
    423	struct bio *overwrite_bio;
    424	struct dm_bio_prison_cell_v2 *cell;
    425
    426	dm_cblock_t invalidate_cblock;
    427	dm_oblock_t invalidate_oblock;
    428};
    429
    430/*----------------------------------------------------------------*/
    431
    432static bool writethrough_mode(struct cache *cache)
    433{
    434	return cache->features.io_mode == CM_IO_WRITETHROUGH;
    435}
    436
    437static bool writeback_mode(struct cache *cache)
    438{
    439	return cache->features.io_mode == CM_IO_WRITEBACK;
    440}
    441
    442static inline bool passthrough_mode(struct cache *cache)
    443{
    444	return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
    445}
    446
    447/*----------------------------------------------------------------*/
    448
    449static void wake_deferred_bio_worker(struct cache *cache)
    450{
    451	queue_work(cache->wq, &cache->deferred_bio_worker);
    452}
    453
    454static void wake_migration_worker(struct cache *cache)
    455{
    456	if (passthrough_mode(cache))
    457		return;
    458
    459	queue_work(cache->wq, &cache->migration_worker);
    460}
    461
    462/*----------------------------------------------------------------*/
    463
    464static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
    465{
    466	return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO);
    467}
    468
    469static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
    470{
    471	dm_bio_prison_free_cell_v2(cache->prison, cell);
    472}
    473
    474static struct dm_cache_migration *alloc_migration(struct cache *cache)
    475{
    476	struct dm_cache_migration *mg;
    477
    478	mg = mempool_alloc(&cache->migration_pool, GFP_NOIO);
    479
    480	memset(mg, 0, sizeof(*mg));
    481
    482	mg->cache = cache;
    483	atomic_inc(&cache->nr_allocated_migrations);
    484
    485	return mg;
    486}
    487
    488static void free_migration(struct dm_cache_migration *mg)
    489{
    490	struct cache *cache = mg->cache;
    491
    492	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
    493		wake_up(&cache->migration_wait);
    494
    495	mempool_free(mg, &cache->migration_pool);
    496}
    497
    498/*----------------------------------------------------------------*/
    499
    500static inline dm_oblock_t oblock_succ(dm_oblock_t b)
    501{
    502	return to_oblock(from_oblock(b) + 1ull);
    503}
    504
    505static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
    506{
    507	key->virtual = 0;
    508	key->dev = 0;
    509	key->block_begin = from_oblock(begin);
    510	key->block_end = from_oblock(end);
    511}
    512
    513/*
     514 * We have two lock levels: level 0, which prevents WRITEs, and level 1,
     515 * which prevents *both* READs and WRITEs.
    516 */
    517#define WRITE_LOCK_LEVEL 0
    518#define READ_WRITE_LOCK_LEVEL 1
    519
    520static unsigned lock_level(struct bio *bio)
    521{
    522	return bio_data_dir(bio) == WRITE ?
    523		WRITE_LOCK_LEVEL :
    524		READ_WRITE_LOCK_LEVEL;
    525}
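
        /*
         * So a migration holding only the level 0 exclusive lock (taken in
         * mg_lock_writes() below) still lets READs through, since their shared
         * locks are taken at level 1, while WRITEs (level 0) are held back;
         * once the lock is promoted to READ_WRITE_LOCK_LEVEL in
         * mg_upgrade_lock() both are held back.
         */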
    526
    527/*----------------------------------------------------------------
    528 * Per bio data
    529 *--------------------------------------------------------------*/
    530
    531static struct per_bio_data *get_per_bio_data(struct bio *bio)
    532{
    533	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
    534	BUG_ON(!pb);
    535	return pb;
    536}
    537
    538static struct per_bio_data *init_per_bio_data(struct bio *bio)
    539{
    540	struct per_bio_data *pb = get_per_bio_data(bio);
    541
    542	pb->tick = false;
    543	pb->req_nr = dm_bio_get_target_bio_nr(bio);
    544	pb->cell = NULL;
    545	pb->len = 0;
    546
    547	return pb;
    548}
    549
    550/*----------------------------------------------------------------*/
    551
    552static void defer_bio(struct cache *cache, struct bio *bio)
    553{
    554	spin_lock_irq(&cache->lock);
    555	bio_list_add(&cache->deferred_bios, bio);
    556	spin_unlock_irq(&cache->lock);
    557
    558	wake_deferred_bio_worker(cache);
    559}
    560
    561static void defer_bios(struct cache *cache, struct bio_list *bios)
    562{
    563	spin_lock_irq(&cache->lock);
    564	bio_list_merge(&cache->deferred_bios, bios);
    565	bio_list_init(bios);
    566	spin_unlock_irq(&cache->lock);
    567
    568	wake_deferred_bio_worker(cache);
    569}
    570
    571/*----------------------------------------------------------------*/
    572
    573static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
    574{
    575	bool r;
    576	struct per_bio_data *pb;
    577	struct dm_cell_key_v2 key;
    578	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
    579	struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
    580
    581	cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
    582
    583	build_key(oblock, end, &key);
    584	r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
    585	if (!r) {
    586		/*
    587		 * Failed to get the lock.
    588		 */
    589		free_prison_cell(cache, cell_prealloc);
    590		return r;
    591	}
    592
    593	if (cell != cell_prealloc)
    594		free_prison_cell(cache, cell_prealloc);
    595
    596	pb = get_per_bio_data(bio);
    597	pb->cell = cell;
    598
    599	return r;
    600}
    601
    602/*----------------------------------------------------------------*/
    603
    604static bool is_dirty(struct cache *cache, dm_cblock_t b)
    605{
    606	return test_bit(from_cblock(b), cache->dirty_bitset);
    607}
    608
    609static void set_dirty(struct cache *cache, dm_cblock_t cblock)
    610{
    611	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
    612		atomic_inc(&cache->nr_dirty);
    613		policy_set_dirty(cache->policy, cblock);
    614	}
    615}
    616
    617/*
     618 * These two are called after migrations to force the policy and the
     619 * dirty bitset to be in sync.
    620 */
    621static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
    622{
    623	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
    624		atomic_inc(&cache->nr_dirty);
    625	policy_set_dirty(cache->policy, cblock);
    626}
    627
    628static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
    629{
    630	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
    631		if (atomic_dec_return(&cache->nr_dirty) == 0)
    632			dm_table_event(cache->ti->table);
    633	}
    634
    635	policy_clear_dirty(cache->policy, cblock);
    636}
    637
    638/*----------------------------------------------------------------*/
    639
    640static bool block_size_is_power_of_two(struct cache *cache)
    641{
    642	return cache->sectors_per_block_shift >= 0;
    643}
    644
    645static dm_block_t block_div(dm_block_t b, uint32_t n)
    646{
    647	do_div(b, n);
    648
    649	return b;
    650}
    651
    652static dm_block_t oblocks_per_dblock(struct cache *cache)
    653{
    654	dm_block_t oblocks = cache->discard_block_size;
    655
    656	if (block_size_is_power_of_two(cache))
    657		oblocks >>= cache->sectors_per_block_shift;
    658	else
    659		oblocks = block_div(oblocks, cache->sectors_per_block);
    660
    661	return oblocks;
    662}
    663
    664static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
    665{
    666	return to_dblock(block_div(from_oblock(oblock),
    667				   oblocks_per_dblock(cache)));
    668}
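
        /*
         * Worked example with illustrative numbers: for 64KB cache blocks
         * (sectors_per_block == 128) and a discard_block_size of 1024 sectors,
         * oblocks_per_dblock() is 1024 / 128 == 8, so origin block 20 lands in
         * discard block 20 / 8 == 2.
         */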
    669
    670static void set_discard(struct cache *cache, dm_dblock_t b)
    671{
    672	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
    673	atomic_inc(&cache->stats.discard_count);
    674
    675	spin_lock_irq(&cache->lock);
    676	set_bit(from_dblock(b), cache->discard_bitset);
    677	spin_unlock_irq(&cache->lock);
    678}
    679
    680static void clear_discard(struct cache *cache, dm_dblock_t b)
    681{
    682	spin_lock_irq(&cache->lock);
    683	clear_bit(from_dblock(b), cache->discard_bitset);
    684	spin_unlock_irq(&cache->lock);
    685}
    686
    687static bool is_discarded(struct cache *cache, dm_dblock_t b)
    688{
    689	int r;
    690	spin_lock_irq(&cache->lock);
    691	r = test_bit(from_dblock(b), cache->discard_bitset);
    692	spin_unlock_irq(&cache->lock);
    693
    694	return r;
    695}
    696
    697static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
    698{
    699	int r;
    700	spin_lock_irq(&cache->lock);
    701	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
    702		     cache->discard_bitset);
    703	spin_unlock_irq(&cache->lock);
    704
    705	return r;
    706}
    707
    708/*----------------------------------------------------------------
    709 * Remapping
    710 *--------------------------------------------------------------*/
    711static void remap_to_origin(struct cache *cache, struct bio *bio)
    712{
    713	bio_set_dev(bio, cache->origin_dev->bdev);
    714}
    715
    716static void remap_to_cache(struct cache *cache, struct bio *bio,
    717			   dm_cblock_t cblock)
    718{
    719	sector_t bi_sector = bio->bi_iter.bi_sector;
    720	sector_t block = from_cblock(cblock);
    721
    722	bio_set_dev(bio, cache->cache_dev->bdev);
    723	if (!block_size_is_power_of_two(cache))
    724		bio->bi_iter.bi_sector =
    725			(block * cache->sectors_per_block) +
    726			sector_div(bi_sector, cache->sectors_per_block);
    727	else
    728		bio->bi_iter.bi_sector =
    729			(block << cache->sectors_per_block_shift) |
    730			(bi_sector & (cache->sectors_per_block - 1));
    731}
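
        /*
         * Worked example with illustrative numbers: with sectors_per_block ==
         * 128 (a shift of 7), a bio at origin sector 1000 remapped to cblock 5
         * starts at cache sector (5 << 7) | (1000 & 127) == 640 + 104 == 744,
         * ie the same 104 sector offset within the new block.
         */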
    732
    733static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
    734{
    735	struct per_bio_data *pb;
    736
    737	spin_lock_irq(&cache->lock);
    738	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
    739	    bio_op(bio) != REQ_OP_DISCARD) {
    740		pb = get_per_bio_data(bio);
    741		pb->tick = true;
    742		cache->need_tick_bio = false;
    743	}
    744	spin_unlock_irq(&cache->lock);
    745}
    746
    747static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
    748					  dm_oblock_t oblock)
    749{
    750	// FIXME: check_if_tick_bio_needed() is called way too much through this interface
    751	check_if_tick_bio_needed(cache, bio);
    752	remap_to_origin(cache, bio);
    753	if (bio_data_dir(bio) == WRITE)
    754		clear_discard(cache, oblock_to_dblock(cache, oblock));
    755}
    756
    757static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
    758				 dm_oblock_t oblock, dm_cblock_t cblock)
    759{
    760	check_if_tick_bio_needed(cache, bio);
    761	remap_to_cache(cache, bio, cblock);
    762	if (bio_data_dir(bio) == WRITE) {
    763		set_dirty(cache, cblock);
    764		clear_discard(cache, oblock_to_dblock(cache, oblock));
    765	}
    766}
    767
    768static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
    769{
    770	sector_t block_nr = bio->bi_iter.bi_sector;
    771
    772	if (!block_size_is_power_of_two(cache))
    773		(void) sector_div(block_nr, cache->sectors_per_block);
    774	else
    775		block_nr >>= cache->sectors_per_block_shift;
    776
    777	return to_oblock(block_nr);
    778}
    779
    780static bool accountable_bio(struct cache *cache, struct bio *bio)
    781{
    782	return bio_op(bio) != REQ_OP_DISCARD;
    783}
    784
    785static void accounted_begin(struct cache *cache, struct bio *bio)
    786{
    787	struct per_bio_data *pb;
    788
    789	if (accountable_bio(cache, bio)) {
    790		pb = get_per_bio_data(bio);
    791		pb->len = bio_sectors(bio);
    792		dm_iot_io_begin(&cache->tracker, pb->len);
    793	}
    794}
    795
    796static void accounted_complete(struct cache *cache, struct bio *bio)
    797{
    798	struct per_bio_data *pb = get_per_bio_data(bio);
    799
    800	dm_iot_io_end(&cache->tracker, pb->len);
    801}
    802
    803static void accounted_request(struct cache *cache, struct bio *bio)
    804{
    805	accounted_begin(cache, bio);
    806	dm_submit_bio_remap(bio, NULL);
    807}
    808
    809static void issue_op(struct bio *bio, void *context)
    810{
    811	struct cache *cache = context;
    812	accounted_request(cache, bio);
    813}
    814
    815/*
    816 * When running in writethrough mode we need to send writes to clean blocks
    817 * to both the cache and origin devices.  Clone the bio and send them in parallel.
    818 */
    819static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
    820				      dm_oblock_t oblock, dm_cblock_t cblock)
    821{
    822	struct bio *origin_bio = bio_alloc_clone(cache->origin_dev->bdev, bio,
    823						 GFP_NOIO, &cache->bs);
    824
    825	BUG_ON(!origin_bio);
    826
    827	bio_chain(origin_bio, bio);
    828
    829	if (bio_data_dir(origin_bio) == WRITE)
    830		clear_discard(cache, oblock_to_dblock(cache, oblock));
    831	submit_bio(origin_bio);
    832
    833	remap_to_cache(cache, bio, cblock);
    834}
    835
    836/*----------------------------------------------------------------
    837 * Failure modes
    838 *--------------------------------------------------------------*/
    839static enum cache_metadata_mode get_cache_mode(struct cache *cache)
    840{
    841	return cache->features.mode;
    842}
    843
    844static const char *cache_device_name(struct cache *cache)
    845{
    846	return dm_table_device_name(cache->ti->table);
    847}
    848
    849static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
    850{
    851	const char *descs[] = {
    852		"write",
    853		"read-only",
    854		"fail"
    855	};
    856
    857	dm_table_event(cache->ti->table);
    858	DMINFO("%s: switching cache to %s mode",
    859	       cache_device_name(cache), descs[(int)mode]);
    860}
    861
    862static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
    863{
    864	bool needs_check;
    865	enum cache_metadata_mode old_mode = get_cache_mode(cache);
    866
    867	if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
    868		DMERR("%s: unable to read needs_check flag, setting failure mode.",
    869		      cache_device_name(cache));
    870		new_mode = CM_FAIL;
    871	}
    872
    873	if (new_mode == CM_WRITE && needs_check) {
    874		DMERR("%s: unable to switch cache to write mode until repaired.",
    875		      cache_device_name(cache));
    876		if (old_mode != new_mode)
    877			new_mode = old_mode;
    878		else
    879			new_mode = CM_READ_ONLY;
    880	}
    881
    882	/* Never move out of fail mode */
    883	if (old_mode == CM_FAIL)
    884		new_mode = CM_FAIL;
    885
    886	switch (new_mode) {
    887	case CM_FAIL:
    888	case CM_READ_ONLY:
    889		dm_cache_metadata_set_read_only(cache->cmd);
    890		break;
    891
    892	case CM_WRITE:
    893		dm_cache_metadata_set_read_write(cache->cmd);
    894		break;
    895	}
    896
    897	cache->features.mode = new_mode;
    898
    899	if (new_mode != old_mode)
    900		notify_mode_switch(cache, new_mode);
    901}
    902
    903static void abort_transaction(struct cache *cache)
    904{
    905	const char *dev_name = cache_device_name(cache);
    906
    907	if (get_cache_mode(cache) >= CM_READ_ONLY)
    908		return;
    909
    910	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
    911		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
    912		set_cache_mode(cache, CM_FAIL);
    913	}
    914
    915	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
    916	if (dm_cache_metadata_abort(cache->cmd)) {
    917		DMERR("%s: failed to abort metadata transaction", dev_name);
    918		set_cache_mode(cache, CM_FAIL);
    919	}
    920}
    921
    922static void metadata_operation_failed(struct cache *cache, const char *op, int r)
    923{
    924	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
    925		    cache_device_name(cache), op, r);
    926	abort_transaction(cache);
    927	set_cache_mode(cache, CM_READ_ONLY);
    928}
    929
    930/*----------------------------------------------------------------*/
    931
    932static void load_stats(struct cache *cache)
    933{
    934	struct dm_cache_statistics stats;
    935
    936	dm_cache_metadata_get_stats(cache->cmd, &stats);
    937	atomic_set(&cache->stats.read_hit, stats.read_hits);
    938	atomic_set(&cache->stats.read_miss, stats.read_misses);
    939	atomic_set(&cache->stats.write_hit, stats.write_hits);
    940	atomic_set(&cache->stats.write_miss, stats.write_misses);
    941}
    942
    943static void save_stats(struct cache *cache)
    944{
    945	struct dm_cache_statistics stats;
    946
    947	if (get_cache_mode(cache) >= CM_READ_ONLY)
    948		return;
    949
    950	stats.read_hits = atomic_read(&cache->stats.read_hit);
    951	stats.read_misses = atomic_read(&cache->stats.read_miss);
    952	stats.write_hits = atomic_read(&cache->stats.write_hit);
    953	stats.write_misses = atomic_read(&cache->stats.write_miss);
    954
    955	dm_cache_metadata_set_stats(cache->cmd, &stats);
    956}
    957
    958static void update_stats(struct cache_stats *stats, enum policy_operation op)
    959{
    960	switch (op) {
    961	case POLICY_PROMOTE:
    962		atomic_inc(&stats->promotion);
    963		break;
    964
    965	case POLICY_DEMOTE:
    966		atomic_inc(&stats->demotion);
    967		break;
    968
    969	case POLICY_WRITEBACK:
    970		atomic_inc(&stats->writeback);
    971		break;
    972	}
    973}
    974
    975/*----------------------------------------------------------------
    976 * Migration processing
    977 *
    978 * Migration covers moving data from the origin device to the cache, or
    979 * vice versa.
    980 *--------------------------------------------------------------*/
    981
    982static void inc_io_migrations(struct cache *cache)
    983{
    984	atomic_inc(&cache->nr_io_migrations);
    985}
    986
    987static void dec_io_migrations(struct cache *cache)
    988{
    989	atomic_dec(&cache->nr_io_migrations);
    990}
    991
    992static bool discard_or_flush(struct bio *bio)
    993{
    994	return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
    995}
    996
    997static void calc_discard_block_range(struct cache *cache, struct bio *bio,
    998				     dm_dblock_t *b, dm_dblock_t *e)
    999{
   1000	sector_t sb = bio->bi_iter.bi_sector;
   1001	sector_t se = bio_end_sector(bio);
   1002
   1003	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
   1004
   1005	if (se - sb < cache->discard_block_size)
   1006		*e = *b;
   1007	else
   1008		*e = to_dblock(block_div(se, cache->discard_block_size));
   1009}
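
        /*
         * Worked example with illustrative numbers: with discard_block_size ==
         * 128, a discard spanning sectors [100, 1000) gives b == 1 (100 rounded
         * up) and e == 7 (1000 rounded down), so only discard blocks 1-6, the
         * ones fully covered by the bio, get marked in process_discard_bio().
         */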
   1010
   1011/*----------------------------------------------------------------*/
   1012
   1013static void prevent_background_work(struct cache *cache)
   1014{
   1015	lockdep_off();
   1016	down_write(&cache->background_work_lock);
   1017	lockdep_on();
   1018}
   1019
   1020static void allow_background_work(struct cache *cache)
   1021{
   1022	lockdep_off();
   1023	up_write(&cache->background_work_lock);
   1024	lockdep_on();
   1025}
   1026
   1027static bool background_work_begin(struct cache *cache)
   1028{
   1029	bool r;
   1030
   1031	lockdep_off();
   1032	r = down_read_trylock(&cache->background_work_lock);
   1033	lockdep_on();
   1034
   1035	return r;
   1036}
   1037
   1038static void background_work_end(struct cache *cache)
   1039{
   1040	lockdep_off();
   1041	up_read(&cache->background_work_lock);
   1042	lockdep_on();
   1043}
   1044
   1045/*----------------------------------------------------------------*/
   1046
   1047static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
   1048{
   1049	return (bio_data_dir(bio) == WRITE) &&
   1050		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
   1051}
   1052
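        /*
         * In writeback mode an "optimisable" bio can be used as an overwrite:
         * because it rewrites the whole block (or the block is discarded
         * anyway), a promotion can write the bio's data straight into the
         * cache block instead of copying from the origin first - see
         * overwrite() and mg_copy() below.
         */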
   1053static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
   1054{
   1055	return writeback_mode(cache) &&
   1056		(is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
   1057}
   1058
   1059static void quiesce(struct dm_cache_migration *mg,
   1060		    void (*continuation)(struct work_struct *))
   1061{
   1062	init_continuation(&mg->k, continuation);
   1063	dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
   1064}
   1065
   1066static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
   1067{
   1068	struct continuation *k = container_of(ws, struct continuation, ws);
   1069	return container_of(k, struct dm_cache_migration, k);
   1070}
   1071
   1072static void copy_complete(int read_err, unsigned long write_err, void *context)
   1073{
   1074	struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
   1075
   1076	if (read_err || write_err)
   1077		mg->k.input = BLK_STS_IOERR;
   1078
   1079	queue_continuation(mg->cache->wq, &mg->k);
   1080}
   1081
   1082static void copy(struct dm_cache_migration *mg, bool promote)
   1083{
   1084	struct dm_io_region o_region, c_region;
   1085	struct cache *cache = mg->cache;
   1086
   1087	o_region.bdev = cache->origin_dev->bdev;
   1088	o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
   1089	o_region.count = cache->sectors_per_block;
   1090
   1091	c_region.bdev = cache->cache_dev->bdev;
   1092	c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
   1093	c_region.count = cache->sectors_per_block;
   1094
   1095	if (promote)
   1096		dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
   1097	else
   1098		dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
   1099}
   1100
   1101static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
   1102{
   1103	struct per_bio_data *pb = get_per_bio_data(bio);
   1104
   1105	if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
   1106		free_prison_cell(cache, pb->cell);
   1107	pb->cell = NULL;
   1108}
   1109
   1110static void overwrite_endio(struct bio *bio)
   1111{
   1112	struct dm_cache_migration *mg = bio->bi_private;
   1113	struct cache *cache = mg->cache;
   1114	struct per_bio_data *pb = get_per_bio_data(bio);
   1115
   1116	dm_unhook_bio(&pb->hook_info, bio);
   1117
   1118	if (bio->bi_status)
   1119		mg->k.input = bio->bi_status;
   1120
   1121	queue_continuation(cache->wq, &mg->k);
   1122}
   1123
   1124static void overwrite(struct dm_cache_migration *mg,
   1125		      void (*continuation)(struct work_struct *))
   1126{
   1127	struct bio *bio = mg->overwrite_bio;
   1128	struct per_bio_data *pb = get_per_bio_data(bio);
   1129
   1130	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
   1131
   1132	/*
    1133	 * The overwrite bio is part of the copy operation, so it does not
    1134	 * set/clear the discard or dirty flags.
   1135	 */
   1136	if (mg->op->op == POLICY_PROMOTE)
   1137		remap_to_cache(mg->cache, bio, mg->op->cblock);
   1138	else
   1139		remap_to_origin(mg->cache, bio);
   1140
   1141	init_continuation(&mg->k, continuation);
   1142	accounted_request(mg->cache, bio);
   1143}
   1144
   1145/*
   1146 * Migration steps:
   1147 *
   1148 * 1) exclusive lock preventing WRITEs
   1149 * 2) quiesce
   1150 * 3) copy or issue overwrite bio
   1151 * 4) upgrade to exclusive lock preventing READs and WRITEs
   1152 * 5) quiesce
   1153 * 6) update metadata and commit
   1154 * 7) unlock
   1155 */
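        /*
         * In the code below these steps map onto mg_lock_writes() (1),
         * quiesce() (2), mg_copy()/mg_full_copy()/overwrite() (3),
         * mg_upgrade_lock() (4 and 5), mg_update_metadata() plus the batcher
         * commit (6), and finally mg_complete() (7).
         */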
   1156static void mg_complete(struct dm_cache_migration *mg, bool success)
   1157{
   1158	struct bio_list bios;
   1159	struct cache *cache = mg->cache;
   1160	struct policy_work *op = mg->op;
   1161	dm_cblock_t cblock = op->cblock;
   1162
   1163	if (success)
   1164		update_stats(&cache->stats, op->op);
   1165
   1166	switch (op->op) {
   1167	case POLICY_PROMOTE:
   1168		clear_discard(cache, oblock_to_dblock(cache, op->oblock));
   1169		policy_complete_background_work(cache->policy, op, success);
   1170
   1171		if (mg->overwrite_bio) {
   1172			if (success)
   1173				force_set_dirty(cache, cblock);
   1174			else if (mg->k.input)
   1175				mg->overwrite_bio->bi_status = mg->k.input;
   1176			else
   1177				mg->overwrite_bio->bi_status = BLK_STS_IOERR;
   1178			bio_endio(mg->overwrite_bio);
   1179		} else {
   1180			if (success)
   1181				force_clear_dirty(cache, cblock);
   1182			dec_io_migrations(cache);
   1183		}
   1184		break;
   1185
   1186	case POLICY_DEMOTE:
   1187		/*
   1188		 * We clear dirty here to update the nr_dirty counter.
   1189		 */
   1190		if (success)
   1191			force_clear_dirty(cache, cblock);
   1192		policy_complete_background_work(cache->policy, op, success);
   1193		dec_io_migrations(cache);
   1194		break;
   1195
   1196	case POLICY_WRITEBACK:
   1197		if (success)
   1198			force_clear_dirty(cache, cblock);
   1199		policy_complete_background_work(cache->policy, op, success);
   1200		dec_io_migrations(cache);
   1201		break;
   1202	}
   1203
   1204	bio_list_init(&bios);
   1205	if (mg->cell) {
   1206		if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
   1207			free_prison_cell(cache, mg->cell);
   1208	}
   1209
   1210	free_migration(mg);
   1211	defer_bios(cache, &bios);
   1212	wake_migration_worker(cache);
   1213
   1214	background_work_end(cache);
   1215}
   1216
   1217static void mg_success(struct work_struct *ws)
   1218{
   1219	struct dm_cache_migration *mg = ws_to_mg(ws);
   1220	mg_complete(mg, mg->k.input == 0);
   1221}
   1222
   1223static void mg_update_metadata(struct work_struct *ws)
   1224{
   1225	int r;
   1226	struct dm_cache_migration *mg = ws_to_mg(ws);
   1227	struct cache *cache = mg->cache;
   1228	struct policy_work *op = mg->op;
   1229
   1230	switch (op->op) {
   1231	case POLICY_PROMOTE:
   1232		r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
   1233		if (r) {
   1234			DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
   1235				    cache_device_name(cache));
   1236			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
   1237
   1238			mg_complete(mg, false);
   1239			return;
   1240		}
   1241		mg_complete(mg, true);
   1242		break;
   1243
   1244	case POLICY_DEMOTE:
   1245		r = dm_cache_remove_mapping(cache->cmd, op->cblock);
   1246		if (r) {
   1247			DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
   1248				    cache_device_name(cache));
   1249			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
   1250
   1251			mg_complete(mg, false);
   1252			return;
   1253		}
   1254
   1255		/*
   1256		 * It would be nice if we only had to commit when a REQ_FLUSH
   1257		 * comes through.  But there's one scenario that we have to
   1258		 * look out for:
   1259		 *
   1260		 * - vblock x in a cache block
    1261		 * - demotion occurs
    1262		 * - cache block gets reallocated and overwritten
   1263		 * - crash
   1264		 *
   1265		 * When we recover, because there was no commit the cache will
   1266		 * rollback to having the data for vblock x in the cache block.
   1267		 * But the cache block has since been overwritten, so it'll end
   1268		 * up pointing to data that was never in 'x' during the history
   1269		 * of the device.
   1270		 *
   1271		 * To avoid this issue we require a commit as part of the
   1272		 * demotion operation.
   1273		 */
   1274		init_continuation(&mg->k, mg_success);
   1275		continue_after_commit(&cache->committer, &mg->k);
   1276		schedule_commit(&cache->committer);
   1277		break;
   1278
   1279	case POLICY_WRITEBACK:
   1280		mg_complete(mg, true);
   1281		break;
   1282	}
   1283}
   1284
   1285static void mg_update_metadata_after_copy(struct work_struct *ws)
   1286{
   1287	struct dm_cache_migration *mg = ws_to_mg(ws);
   1288
   1289	/*
   1290	 * Did the copy succeed?
   1291	 */
   1292	if (mg->k.input)
   1293		mg_complete(mg, false);
   1294	else
   1295		mg_update_metadata(ws);
   1296}
   1297
   1298static void mg_upgrade_lock(struct work_struct *ws)
   1299{
   1300	int r;
   1301	struct dm_cache_migration *mg = ws_to_mg(ws);
   1302
   1303	/*
   1304	 * Did the copy succeed?
   1305	 */
   1306	if (mg->k.input)
   1307		mg_complete(mg, false);
   1308
   1309	else {
   1310		/*
   1311		 * Now we want the lock to prevent both reads and writes.
   1312		 */
   1313		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
   1314					    READ_WRITE_LOCK_LEVEL);
   1315		if (r < 0)
   1316			mg_complete(mg, false);
   1317
   1318		else if (r)
   1319			quiesce(mg, mg_update_metadata);
   1320
   1321		else
   1322			mg_update_metadata(ws);
   1323	}
   1324}
   1325
   1326static void mg_full_copy(struct work_struct *ws)
   1327{
   1328	struct dm_cache_migration *mg = ws_to_mg(ws);
   1329	struct cache *cache = mg->cache;
   1330	struct policy_work *op = mg->op;
   1331	bool is_policy_promote = (op->op == POLICY_PROMOTE);
   1332
   1333	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
   1334	    is_discarded_oblock(cache, op->oblock)) {
   1335		mg_upgrade_lock(ws);
   1336		return;
   1337	}
   1338
   1339	init_continuation(&mg->k, mg_upgrade_lock);
   1340	copy(mg, is_policy_promote);
   1341}
   1342
   1343static void mg_copy(struct work_struct *ws)
   1344{
   1345	struct dm_cache_migration *mg = ws_to_mg(ws);
   1346
   1347	if (mg->overwrite_bio) {
   1348		/*
   1349		 * No exclusive lock was held when we last checked if the bio
   1350		 * was optimisable.  So we have to check again in case things
   1351		 * have changed (eg, the block may no longer be discarded).
   1352		 */
   1353		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
   1354			/*
    1355			 * Fall back to a real full copy after doing some tidying up.
   1356			 */
   1357			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
    1358			BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
   1359			mg->overwrite_bio = NULL;
   1360			inc_io_migrations(mg->cache);
   1361			mg_full_copy(ws);
   1362			return;
   1363		}
   1364
   1365		/*
    1366		 * It's safe to do this here, even though it's new data,
    1367		 * because all IO has been locked out of the block.
    1368		 *
    1369		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL,
    1370		 * so we're _not_ using mg_upgrade_lock() as the continuation.
   1371		 */
   1372		overwrite(mg, mg_update_metadata_after_copy);
   1373
   1374	} else
   1375		mg_full_copy(ws);
   1376}
   1377
   1378static int mg_lock_writes(struct dm_cache_migration *mg)
   1379{
   1380	int r;
   1381	struct dm_cell_key_v2 key;
   1382	struct cache *cache = mg->cache;
   1383	struct dm_bio_prison_cell_v2 *prealloc;
   1384
   1385	prealloc = alloc_prison_cell(cache);
   1386
   1387	/*
   1388	 * Prevent writes to the block, but allow reads to continue.
   1389	 * Unless we're using an overwrite bio, in which case we lock
   1390	 * everything.
   1391	 */
   1392	build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
   1393	r = dm_cell_lock_v2(cache->prison, &key,
   1394			    mg->overwrite_bio ?  READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
   1395			    prealloc, &mg->cell);
   1396	if (r < 0) {
   1397		free_prison_cell(cache, prealloc);
   1398		mg_complete(mg, false);
   1399		return r;
   1400	}
   1401
   1402	if (mg->cell != prealloc)
   1403		free_prison_cell(cache, prealloc);
   1404
   1405	if (r == 0)
   1406		mg_copy(&mg->k.ws);
   1407	else
   1408		quiesce(mg, mg_copy);
   1409
   1410	return 0;
   1411}
   1412
   1413static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
   1414{
   1415	struct dm_cache_migration *mg;
   1416
   1417	if (!background_work_begin(cache)) {
   1418		policy_complete_background_work(cache->policy, op, false);
   1419		return -EPERM;
   1420	}
   1421
   1422	mg = alloc_migration(cache);
   1423
   1424	mg->op = op;
   1425	mg->overwrite_bio = bio;
   1426
   1427	if (!bio)
   1428		inc_io_migrations(cache);
   1429
   1430	return mg_lock_writes(mg);
   1431}
   1432
   1433/*----------------------------------------------------------------
   1434 * invalidation processing
   1435 *--------------------------------------------------------------*/
   1436
   1437static void invalidate_complete(struct dm_cache_migration *mg, bool success)
   1438{
   1439	struct bio_list bios;
   1440	struct cache *cache = mg->cache;
   1441
   1442	bio_list_init(&bios);
   1443	if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
   1444		free_prison_cell(cache, mg->cell);
   1445
   1446	if (!success && mg->overwrite_bio)
   1447		bio_io_error(mg->overwrite_bio);
   1448
   1449	free_migration(mg);
   1450	defer_bios(cache, &bios);
   1451
   1452	background_work_end(cache);
   1453}
   1454
   1455static void invalidate_completed(struct work_struct *ws)
   1456{
   1457	struct dm_cache_migration *mg = ws_to_mg(ws);
   1458	invalidate_complete(mg, !mg->k.input);
   1459}
   1460
   1461static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
   1462{
   1463	int r = policy_invalidate_mapping(cache->policy, cblock);
   1464	if (!r) {
   1465		r = dm_cache_remove_mapping(cache->cmd, cblock);
   1466		if (r) {
   1467			DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
   1468				    cache_device_name(cache));
   1469			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
   1470		}
   1471
   1472	} else if (r == -ENODATA) {
   1473		/*
   1474		 * Harmless, already unmapped.
   1475		 */
   1476		r = 0;
   1477
   1478	} else
   1479		DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
   1480
   1481	return r;
   1482}
   1483
   1484static void invalidate_remove(struct work_struct *ws)
   1485{
   1486	int r;
   1487	struct dm_cache_migration *mg = ws_to_mg(ws);
   1488	struct cache *cache = mg->cache;
   1489
   1490	r = invalidate_cblock(cache, mg->invalidate_cblock);
   1491	if (r) {
   1492		invalidate_complete(mg, false);
   1493		return;
   1494	}
   1495
   1496	init_continuation(&mg->k, invalidate_completed);
   1497	continue_after_commit(&cache->committer, &mg->k);
   1498	remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
   1499	mg->overwrite_bio = NULL;
   1500	schedule_commit(&cache->committer);
   1501}
   1502
   1503static int invalidate_lock(struct dm_cache_migration *mg)
   1504{
   1505	int r;
   1506	struct dm_cell_key_v2 key;
   1507	struct cache *cache = mg->cache;
   1508	struct dm_bio_prison_cell_v2 *prealloc;
   1509
   1510	prealloc = alloc_prison_cell(cache);
   1511
   1512	build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
   1513	r = dm_cell_lock_v2(cache->prison, &key,
   1514			    READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
   1515	if (r < 0) {
   1516		free_prison_cell(cache, prealloc);
   1517		invalidate_complete(mg, false);
   1518		return r;
   1519	}
   1520
   1521	if (mg->cell != prealloc)
   1522		free_prison_cell(cache, prealloc);
   1523
   1524	if (r)
   1525		quiesce(mg, invalidate_remove);
   1526
   1527	else {
   1528		/*
   1529		 * We can't call invalidate_remove() directly here because we
   1530		 * might still be in request context.
   1531		 */
   1532		init_continuation(&mg->k, invalidate_remove);
   1533		queue_work(cache->wq, &mg->k.ws);
   1534	}
   1535
   1536	return 0;
   1537}
   1538
   1539static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
   1540			    dm_oblock_t oblock, struct bio *bio)
   1541{
   1542	struct dm_cache_migration *mg;
   1543
   1544	if (!background_work_begin(cache))
   1545		return -EPERM;
   1546
   1547	mg = alloc_migration(cache);
   1548
   1549	mg->overwrite_bio = bio;
   1550	mg->invalidate_cblock = cblock;
   1551	mg->invalidate_oblock = oblock;
   1552
   1553	return invalidate_lock(mg);
   1554}
   1555
   1556/*----------------------------------------------------------------
   1557 * bio processing
   1558 *--------------------------------------------------------------*/
   1559
   1560enum busy {
   1561	IDLE,
   1562	BUSY
   1563};
   1564
   1565static enum busy spare_migration_bandwidth(struct cache *cache)
   1566{
   1567	bool idle = dm_iot_idle_for(&cache->tracker, HZ);
   1568	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
   1569		cache->sectors_per_block;
   1570
   1571	if (idle && current_volume <= cache->migration_threshold)
   1572		return IDLE;
   1573	else
   1574		return BUSY;
   1575}
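
        /*
         * Illustrative numbers: with 128 sector cache blocks and a
         * migration_threshold of 2048 sectors, this returns IDLE only while the
         * device has seen no accounted io for a second (HZ jiffies) and at most
         * 15 io migrations are already in flight ((15 + 1) * 128 == 2048).
         */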
   1576
   1577static void inc_hit_counter(struct cache *cache, struct bio *bio)
   1578{
   1579	atomic_inc(bio_data_dir(bio) == READ ?
   1580		   &cache->stats.read_hit : &cache->stats.write_hit);
   1581}
   1582
   1583static void inc_miss_counter(struct cache *cache, struct bio *bio)
   1584{
   1585	atomic_inc(bio_data_dir(bio) == READ ?
   1586		   &cache->stats.read_miss : &cache->stats.write_miss);
   1587}
   1588
   1589/*----------------------------------------------------------------*/
   1590
   1591static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
   1592		   bool *commit_needed)
   1593{
   1594	int r, data_dir;
   1595	bool rb, background_queued;
   1596	dm_cblock_t cblock;
   1597
   1598	*commit_needed = false;
   1599
   1600	rb = bio_detain_shared(cache, block, bio);
   1601	if (!rb) {
   1602		/*
   1603		 * An exclusive lock is held for this block, so we have to
   1604		 * wait.  We set the commit_needed flag so the current
   1605		 * transaction will be committed asap, allowing this lock
   1606		 * to be dropped.
   1607		 */
   1608		*commit_needed = true;
   1609		return DM_MAPIO_SUBMITTED;
   1610	}
   1611
   1612	data_dir = bio_data_dir(bio);
   1613
   1614	if (optimisable_bio(cache, bio, block)) {
   1615		struct policy_work *op = NULL;
   1616
   1617		r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
   1618		if (unlikely(r && r != -ENOENT)) {
   1619			DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
   1620				    cache_device_name(cache), r);
   1621			bio_io_error(bio);
   1622			return DM_MAPIO_SUBMITTED;
   1623		}
   1624
   1625		if (r == -ENOENT && op) {
   1626			bio_drop_shared_lock(cache, bio);
   1627			BUG_ON(op->op != POLICY_PROMOTE);
   1628			mg_start(cache, op, bio);
   1629			return DM_MAPIO_SUBMITTED;
   1630		}
   1631	} else {
   1632		r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
   1633		if (unlikely(r && r != -ENOENT)) {
   1634			DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
   1635				    cache_device_name(cache), r);
   1636			bio_io_error(bio);
   1637			return DM_MAPIO_SUBMITTED;
   1638		}
   1639
   1640		if (background_queued)
   1641			wake_migration_worker(cache);
   1642	}
   1643
   1644	if (r == -ENOENT) {
   1645		struct per_bio_data *pb = get_per_bio_data(bio);
   1646
   1647		/*
   1648		 * Miss.
   1649		 */
   1650		inc_miss_counter(cache, bio);
   1651		if (pb->req_nr == 0) {
   1652			accounted_begin(cache, bio);
   1653			remap_to_origin_clear_discard(cache, bio, block);
   1654		} else {
   1655			/*
   1656			 * This is a duplicate writethrough io that is no
   1657			 * longer needed because the block has been demoted.
   1658			 */
   1659			bio_endio(bio);
   1660			return DM_MAPIO_SUBMITTED;
   1661		}
   1662	} else {
   1663		/*
   1664		 * Hit.
   1665		 */
   1666		inc_hit_counter(cache, bio);
   1667
   1668		/*
   1669		 * Passthrough always maps to the origin, invalidating any
   1670		 * cache blocks that are written to.
   1671		 */
   1672		if (passthrough_mode(cache)) {
   1673			if (bio_data_dir(bio) == WRITE) {
   1674				bio_drop_shared_lock(cache, bio);
   1675				atomic_inc(&cache->stats.demotion);
   1676				invalidate_start(cache, cblock, block, bio);
   1677			} else
   1678				remap_to_origin_clear_discard(cache, bio, block);
   1679		} else {
   1680			if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
   1681			    !is_dirty(cache, cblock)) {
   1682				remap_to_origin_and_cache(cache, bio, block, cblock);
   1683				accounted_begin(cache, bio);
   1684			} else
   1685				remap_to_cache_dirty(cache, bio, block, cblock);
   1686		}
   1687	}
   1688
   1689	/*
   1690	 * dm core turns FUA requests into a separate payload and FLUSH req.
   1691	 */
   1692	if (bio->bi_opf & REQ_FUA) {
   1693		/*
   1694		 * issue_after_commit will call accounted_begin a second time.  So
   1695		 * we call accounted_complete() to avoid double accounting.
   1696		 */
   1697		accounted_complete(cache, bio);
   1698		issue_after_commit(&cache->committer, bio);
   1699		*commit_needed = true;
   1700		return DM_MAPIO_SUBMITTED;
   1701	}
   1702
   1703	return DM_MAPIO_REMAPPED;
   1704}
   1705
   1706static bool process_bio(struct cache *cache, struct bio *bio)
   1707{
   1708	bool commit_needed;
   1709
   1710	if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
   1711		dm_submit_bio_remap(bio, NULL);
   1712
   1713	return commit_needed;
   1714}
   1715
   1716/*
   1717 * A non-zero return indicates read_only or fail_io mode.
   1718 */
   1719static int commit(struct cache *cache, bool clean_shutdown)
   1720{
   1721	int r;
   1722
   1723	if (get_cache_mode(cache) >= CM_READ_ONLY)
   1724		return -EINVAL;
   1725
   1726	atomic_inc(&cache->stats.commit_count);
   1727	r = dm_cache_commit(cache->cmd, clean_shutdown);
   1728	if (r)
   1729		metadata_operation_failed(cache, "dm_cache_commit", r);
   1730
   1731	return r;
   1732}
   1733
   1734/*
   1735 * Used by the batcher.
   1736 */
   1737static blk_status_t commit_op(void *context)
   1738{
   1739	struct cache *cache = context;
   1740
   1741	if (dm_cache_changed_this_transaction(cache->cmd))
   1742		return errno_to_blk_status(commit(cache, false));
   1743
   1744	return 0;
   1745}
   1746
   1747/*----------------------------------------------------------------*/
   1748
   1749static bool process_flush_bio(struct cache *cache, struct bio *bio)
   1750{
   1751	struct per_bio_data *pb = get_per_bio_data(bio);
   1752
   1753	if (!pb->req_nr)
   1754		remap_to_origin(cache, bio);
   1755	else
   1756		remap_to_cache(cache, bio, 0);
   1757
   1758	issue_after_commit(&cache->committer, bio);
   1759	return true;
   1760}
   1761
   1762static bool process_discard_bio(struct cache *cache, struct bio *bio)
   1763{
   1764	dm_dblock_t b, e;
   1765
   1766	// FIXME: do we need to lock the region?  Or can we just assume the
    1767	// user won't be so foolish as to issue discard concurrently with
   1768	// other IO?
   1769	calc_discard_block_range(cache, bio, &b, &e);
   1770	while (b != e) {
   1771		set_discard(cache, b);
   1772		b = to_dblock(from_dblock(b) + 1);
   1773	}
   1774
   1775	if (cache->features.discard_passdown) {
   1776		remap_to_origin(cache, bio);
   1777		dm_submit_bio_remap(bio, NULL);
   1778	} else
   1779		bio_endio(bio);
   1780
   1781	return false;
   1782}
   1783
   1784static void process_deferred_bios(struct work_struct *ws)
   1785{
   1786	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
   1787
   1788	bool commit_needed = false;
   1789	struct bio_list bios;
   1790	struct bio *bio;
   1791
   1792	bio_list_init(&bios);
   1793
   1794	spin_lock_irq(&cache->lock);
   1795	bio_list_merge(&bios, &cache->deferred_bios);
   1796	bio_list_init(&cache->deferred_bios);
   1797	spin_unlock_irq(&cache->lock);
   1798
   1799	while ((bio = bio_list_pop(&bios))) {
   1800		if (bio->bi_opf & REQ_PREFLUSH)
   1801			commit_needed = process_flush_bio(cache, bio) || commit_needed;
   1802
   1803		else if (bio_op(bio) == REQ_OP_DISCARD)
   1804			commit_needed = process_discard_bio(cache, bio) || commit_needed;
   1805
   1806		else
   1807			commit_needed = process_bio(cache, bio) || commit_needed;
   1808	}
   1809
   1810	if (commit_needed)
   1811		schedule_commit(&cache->committer);
   1812}
   1813
   1814/*----------------------------------------------------------------
   1815 * Main worker loop
   1816 *--------------------------------------------------------------*/
   1817
   1818static void requeue_deferred_bios(struct cache *cache)
   1819{
   1820	struct bio *bio;
   1821	struct bio_list bios;
   1822
   1823	bio_list_init(&bios);
   1824	bio_list_merge(&bios, &cache->deferred_bios);
   1825	bio_list_init(&cache->deferred_bios);
   1826
   1827	while ((bio = bio_list_pop(&bios))) {
   1828		bio->bi_status = BLK_STS_DM_REQUEUE;
   1829		bio_endio(bio);
   1830	}
   1831}
   1832
   1833/*
   1834 * We want to commit periodically so that not too much
   1835 * unwritten metadata builds up.
   1836 */
   1837static void do_waker(struct work_struct *ws)
   1838{
   1839	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
   1840
   1841	policy_tick(cache->policy, true);
   1842	wake_migration_worker(cache);
   1843	schedule_commit(&cache->committer);
   1844	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
   1845}
   1846
   1847static void check_migrations(struct work_struct *ws)
   1848{
   1849	int r;
   1850	struct policy_work *op;
   1851	struct cache *cache = container_of(ws, struct cache, migration_worker);
   1852	enum busy b;
   1853
   1854	for (;;) {
   1855		b = spare_migration_bandwidth(cache);
   1856
   1857		r = policy_get_background_work(cache->policy, b == IDLE, &op);
   1858		if (r == -ENODATA)
   1859			break;
   1860
   1861		if (r) {
   1862			DMERR_LIMIT("%s: policy_background_work failed",
   1863				    cache_device_name(cache));
   1864			break;
   1865		}
   1866
   1867		r = mg_start(cache, op, NULL);
   1868		if (r)
   1869			break;
   1870	}
   1871}
   1872
   1873/*----------------------------------------------------------------
   1874 * Target methods
   1875 *--------------------------------------------------------------*/
   1876
   1877/*
   1878 * This function gets called on the error paths of the constructor, so we
   1879 * have to cope with a partially initialised struct.
   1880 */
   1881static void destroy(struct cache *cache)
   1882{
   1883	unsigned i;
   1884
   1885	mempool_exit(&cache->migration_pool);
   1886
   1887	if (cache->prison)
   1888		dm_bio_prison_destroy_v2(cache->prison);
   1889
   1890	if (cache->wq)
   1891		destroy_workqueue(cache->wq);
   1892
   1893	if (cache->dirty_bitset)
   1894		free_bitset(cache->dirty_bitset);
   1895
   1896	if (cache->discard_bitset)
   1897		free_bitset(cache->discard_bitset);
   1898
   1899	if (cache->copier)
   1900		dm_kcopyd_client_destroy(cache->copier);
   1901
   1902	if (cache->cmd)
   1903		dm_cache_metadata_close(cache->cmd);
   1904
   1905	if (cache->metadata_dev)
   1906		dm_put_device(cache->ti, cache->metadata_dev);
   1907
   1908	if (cache->origin_dev)
   1909		dm_put_device(cache->ti, cache->origin_dev);
   1910
   1911	if (cache->cache_dev)
   1912		dm_put_device(cache->ti, cache->cache_dev);
   1913
   1914	if (cache->policy)
   1915		dm_cache_policy_destroy(cache->policy);
   1916
   1917	for (i = 0; i < cache->nr_ctr_args ; i++)
   1918		kfree(cache->ctr_args[i]);
   1919	kfree(cache->ctr_args);
   1920
   1921	bioset_exit(&cache->bs);
   1922
   1923	kfree(cache);
   1924}
   1925
   1926static void cache_dtr(struct dm_target *ti)
   1927{
   1928	struct cache *cache = ti->private;
   1929
   1930	destroy(cache);
   1931}
   1932
   1933static sector_t get_dev_size(struct dm_dev *dev)
   1934{
   1935	return bdev_nr_sectors(dev->bdev);
   1936}
   1937
   1938/*----------------------------------------------------------------*/
   1939
   1940/*
   1941 * Construct a cache device mapping.
   1942 *
   1943 * cache <metadata dev> <cache dev> <origin dev> <block size>
   1944 *       <#feature args> [<feature arg>]*
   1945 *       <policy> <#policy args> [<policy arg>]*
   1946 *
   1947 * metadata dev    : fast device holding the persistent metadata
   1948 * cache dev	   : fast device holding cached data blocks
   1949 * origin dev	   : slow device holding original data blocks
   1950 * block size	   : cache unit size in sectors
   1951 *
   1952 * #feature args   : number of feature arguments passed
    1953 * feature args	   : writeback, writethrough, passthrough, metadata2, no_discard_passdown
   1954 *
   1955 * policy	   : the replacement policy to use
   1956 * #policy args    : an even number of policy arguments corresponding
   1957 *		     to key/value pairs passed to the policy
   1958 * policy args	   : key/value pairs passed to the policy
   1959 *		     E.g. 'sequential_threshold 1024'
   1960 *		     See cache-policies.txt for details.
   1961 *
   1962 * Optional feature arguments are:
   1963 *   writethrough  : write through caching that prohibits cache block
   1964 *		     content from being different from origin block content.
   1965 *		     Without this argument, the default behaviour is to write
   1966 *		     back cache block contents later for performance reasons,
   1967 *		     so they may differ from the corresponding origin blocks.
   1968 */
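/*
 * Illustrative example (hypothetical device names and sizes) of how the
 * argument layout above maps onto a concrete table line loaded with
 * dmsetup: '512' is the cache block size in sectors, '1 writeback' is a
 * single feature argument, and 'smq 0' selects the smq policy with no
 * policy key/value pairs.
 *
 *   dmsetup create my-cache --table \
 *     '0 41943040 cache /dev/fast-meta /dev/fast-data /dev/slow 512 1 writeback smq 0'
 */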
   1969struct cache_args {
   1970	struct dm_target *ti;
   1971
   1972	struct dm_dev *metadata_dev;
   1973
   1974	struct dm_dev *cache_dev;
   1975	sector_t cache_sectors;
   1976
   1977	struct dm_dev *origin_dev;
   1978	sector_t origin_sectors;
   1979
   1980	uint32_t block_size;
   1981
   1982	const char *policy_name;
   1983	int policy_argc;
   1984	const char **policy_argv;
   1985
   1986	struct cache_features features;
   1987};
   1988
   1989static void destroy_cache_args(struct cache_args *ca)
   1990{
   1991	if (ca->metadata_dev)
   1992		dm_put_device(ca->ti, ca->metadata_dev);
   1993
   1994	if (ca->cache_dev)
   1995		dm_put_device(ca->ti, ca->cache_dev);
   1996
   1997	if (ca->origin_dev)
   1998		dm_put_device(ca->ti, ca->origin_dev);
   1999
   2000	kfree(ca);
   2001}
   2002
   2003static bool at_least_one_arg(struct dm_arg_set *as, char **error)
   2004{
   2005	if (!as->argc) {
   2006		*error = "Insufficient args";
   2007		return false;
   2008	}
   2009
   2010	return true;
   2011}
   2012
   2013static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
   2014			      char **error)
   2015{
   2016	int r;
   2017	sector_t metadata_dev_size;
   2018
   2019	if (!at_least_one_arg(as, error))
   2020		return -EINVAL;
   2021
   2022	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
   2023			  &ca->metadata_dev);
   2024	if (r) {
   2025		*error = "Error opening metadata device";
   2026		return r;
   2027	}
   2028
   2029	metadata_dev_size = get_dev_size(ca->metadata_dev);
   2030	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
   2031		DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
    2032		       ca->metadata_dev->bdev, DM_CACHE_METADATA_MAX_SECTORS_WARNING);
   2033
   2034	return 0;
   2035}
   2036
   2037static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
   2038			   char **error)
   2039{
   2040	int r;
   2041
   2042	if (!at_least_one_arg(as, error))
   2043		return -EINVAL;
   2044
   2045	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
   2046			  &ca->cache_dev);
   2047	if (r) {
   2048		*error = "Error opening cache device";
   2049		return r;
   2050	}
   2051	ca->cache_sectors = get_dev_size(ca->cache_dev);
   2052
   2053	return 0;
   2054}
   2055
   2056static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
   2057			    char **error)
   2058{
   2059	int r;
   2060
   2061	if (!at_least_one_arg(as, error))
   2062		return -EINVAL;
   2063
   2064	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
   2065			  &ca->origin_dev);
   2066	if (r) {
   2067		*error = "Error opening origin device";
   2068		return r;
   2069	}
   2070
   2071	ca->origin_sectors = get_dev_size(ca->origin_dev);
   2072	if (ca->ti->len > ca->origin_sectors) {
   2073		*error = "Device size larger than cached device";
   2074		return -EINVAL;
   2075	}
   2076
   2077	return 0;
   2078}
   2079
   2080static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
   2081			    char **error)
   2082{
   2083	unsigned long block_size;
   2084
   2085	if (!at_least_one_arg(as, error))
   2086		return -EINVAL;
   2087
   2088	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
   2089	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
   2090	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
   2091	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
   2092		*error = "Invalid data block size";
   2093		return -EINVAL;
   2094	}
   2095
   2096	if (block_size > ca->cache_sectors) {
   2097		*error = "Data block size is larger than the cache device";
   2098		return -EINVAL;
   2099	}
   2100
   2101	ca->block_size = block_size;
   2102
   2103	return 0;
   2104}
   2105
   2106static void init_features(struct cache_features *cf)
   2107{
   2108	cf->mode = CM_WRITE;
   2109	cf->io_mode = CM_IO_WRITEBACK;
   2110	cf->metadata_version = 1;
   2111	cf->discard_passdown = true;
   2112}
   2113
   2114static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
   2115			  char **error)
   2116{
   2117	static const struct dm_arg _args[] = {
   2118		{0, 3, "Invalid number of cache feature arguments"},
   2119	};
   2120
   2121	int r, mode_ctr = 0;
   2122	unsigned argc;
   2123	const char *arg;
   2124	struct cache_features *cf = &ca->features;
   2125
   2126	init_features(cf);
   2127
   2128	r = dm_read_arg_group(_args, as, &argc, error);
   2129	if (r)
   2130		return -EINVAL;
   2131
   2132	while (argc--) {
   2133		arg = dm_shift_arg(as);
   2134
   2135		if (!strcasecmp(arg, "writeback")) {
   2136			cf->io_mode = CM_IO_WRITEBACK;
   2137			mode_ctr++;
   2138		}
   2139
   2140		else if (!strcasecmp(arg, "writethrough")) {
   2141			cf->io_mode = CM_IO_WRITETHROUGH;
   2142			mode_ctr++;
   2143		}
   2144
   2145		else if (!strcasecmp(arg, "passthrough")) {
   2146			cf->io_mode = CM_IO_PASSTHROUGH;
   2147			mode_ctr++;
   2148		}
   2149
   2150		else if (!strcasecmp(arg, "metadata2"))
   2151			cf->metadata_version = 2;
   2152
   2153		else if (!strcasecmp(arg, "no_discard_passdown"))
   2154			cf->discard_passdown = false;
   2155
   2156		else {
   2157			*error = "Unrecognised cache feature requested";
   2158			return -EINVAL;
   2159		}
   2160	}
   2161
   2162	if (mode_ctr > 1) {
   2163		*error = "Duplicate cache io_mode features requested";
   2164		return -EINVAL;
   2165	}
   2166
   2167	return 0;
   2168}
   2169
   2170static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
   2171			char **error)
   2172{
   2173	static const struct dm_arg _args[] = {
   2174		{0, 1024, "Invalid number of policy arguments"},
   2175	};
   2176
   2177	int r;
   2178
   2179	if (!at_least_one_arg(as, error))
   2180		return -EINVAL;
   2181
   2182	ca->policy_name = dm_shift_arg(as);
   2183
   2184	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
   2185	if (r)
   2186		return -EINVAL;
   2187
   2188	ca->policy_argv = (const char **)as->argv;
   2189	dm_consume_args(as, ca->policy_argc);
   2190
   2191	return 0;
   2192}
   2193
   2194static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
   2195			    char **error)
   2196{
   2197	int r;
   2198	struct dm_arg_set as;
   2199
   2200	as.argc = argc;
   2201	as.argv = argv;
   2202
   2203	r = parse_metadata_dev(ca, &as, error);
   2204	if (r)
   2205		return r;
   2206
   2207	r = parse_cache_dev(ca, &as, error);
   2208	if (r)
   2209		return r;
   2210
   2211	r = parse_origin_dev(ca, &as, error);
   2212	if (r)
   2213		return r;
   2214
   2215	r = parse_block_size(ca, &as, error);
   2216	if (r)
   2217		return r;
   2218
   2219	r = parse_features(ca, &as, error);
   2220	if (r)
   2221		return r;
   2222
   2223	r = parse_policy(ca, &as, error);
   2224	if (r)
   2225		return r;
   2226
   2227	return 0;
   2228}
   2229
   2230/*----------------------------------------------------------------*/
   2231
   2232static struct kmem_cache *migration_cache;
   2233
   2234#define NOT_CORE_OPTION 1
   2235
   2236static int process_config_option(struct cache *cache, const char *key, const char *value)
   2237{
   2238	unsigned long tmp;
   2239
   2240	if (!strcasecmp(key, "migration_threshold")) {
   2241		if (kstrtoul(value, 10, &tmp))
   2242			return -EINVAL;
   2243
   2244		cache->migration_threshold = tmp;
   2245		return 0;
   2246	}
   2247
   2248	return NOT_CORE_OPTION;
   2249}
   2250
   2251static int set_config_value(struct cache *cache, const char *key, const char *value)
   2252{
   2253	int r = process_config_option(cache, key, value);
   2254
   2255	if (r == NOT_CORE_OPTION)
   2256		r = policy_set_config_value(cache->policy, key, value);
   2257
   2258	if (r)
   2259		DMWARN("bad config value for %s: %s", key, value);
   2260
   2261	return r;
   2262}
   2263
   2264static int set_config_values(struct cache *cache, int argc, const char **argv)
   2265{
   2266	int r = 0;
   2267
   2268	if (argc & 1) {
   2269		DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
   2270		return -EINVAL;
   2271	}
   2272
   2273	while (argc) {
   2274		r = set_config_value(cache, argv[0], argv[1]);
   2275		if (r)
   2276			break;
   2277
   2278		argc -= 2;
   2279		argv += 2;
   2280	}
   2281
   2282	return r;
   2283}
   2284
   2285static int create_cache_policy(struct cache *cache, struct cache_args *ca,
   2286			       char **error)
   2287{
   2288	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
   2289							   cache->cache_size,
   2290							   cache->origin_sectors,
   2291							   cache->sectors_per_block);
   2292	if (IS_ERR(p)) {
   2293		*error = "Error creating cache's policy";
   2294		return PTR_ERR(p);
   2295	}
   2296	cache->policy = p;
   2297	BUG_ON(!cache->policy);
   2298
   2299	return 0;
   2300}
   2301
   2302/*
    2303 * We want the discard block size to be at least the cache block size
    2304 * and to have no more than 2^14 discard blocks across the origin.
   2305 */
   2306#define MAX_DISCARD_BLOCKS (1 << 14)
   2307
   2308static bool too_many_discard_blocks(sector_t discard_block_size,
   2309				    sector_t origin_size)
   2310{
   2311	(void) sector_div(origin_size, discard_block_size);
   2312
   2313	return origin_size > MAX_DISCARD_BLOCKS;
   2314}
   2315
   2316static sector_t calculate_discard_block_size(sector_t cache_block_size,
   2317					     sector_t origin_size)
   2318{
   2319	sector_t discard_block_size = cache_block_size;
   2320
   2321	if (origin_size)
   2322		while (too_many_discard_blocks(discard_block_size, origin_size))
   2323			discard_block_size *= 2;
   2324
   2325	return discard_block_size;
   2326}
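
/*
 * Worked example of the sizing above (illustrative numbers only): with a
 * cache block size of 64 sectors and an origin of 2^21 sectors, 2^21 / 64
 * = 32768 discard blocks, which exceeds MAX_DISCARD_BLOCKS (16384), so the
 * discard block size is doubled once to 128 sectors, giving 16384 blocks.
 */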
   2327
   2328static void set_cache_size(struct cache *cache, dm_cblock_t size)
   2329{
   2330	dm_block_t nr_blocks = from_cblock(size);
   2331
   2332	if (nr_blocks > (1 << 20) && cache->cache_size != size)
   2333		DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
   2334			     "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
   2335			     "Please consider increasing the cache block size to reduce the overall cache block count.",
   2336			     (unsigned long long) nr_blocks);
   2337
   2338	cache->cache_size = size;
   2339}
   2340
   2341#define DEFAULT_MIGRATION_THRESHOLD 2048
   2342
   2343static int cache_create(struct cache_args *ca, struct cache **result)
   2344{
   2345	int r = 0;
   2346	char **error = &ca->ti->error;
   2347	struct cache *cache;
   2348	struct dm_target *ti = ca->ti;
   2349	dm_block_t origin_blocks;
   2350	struct dm_cache_metadata *cmd;
   2351	bool may_format = ca->features.mode == CM_WRITE;
   2352
   2353	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
   2354	if (!cache)
   2355		return -ENOMEM;
   2356
   2357	cache->ti = ca->ti;
   2358	ti->private = cache;
   2359	ti->accounts_remapped_io = true;
   2360	ti->num_flush_bios = 2;
   2361	ti->flush_supported = true;
   2362
   2363	ti->num_discard_bios = 1;
   2364	ti->discards_supported = true;
   2365
   2366	ti->per_io_data_size = sizeof(struct per_bio_data);
   2367
   2368	cache->features = ca->features;
   2369	if (writethrough_mode(cache)) {
   2370		/* Create bioset for writethrough bios issued to origin */
   2371		r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
   2372		if (r)
   2373			goto bad;
   2374	}
   2375
   2376	cache->metadata_dev = ca->metadata_dev;
   2377	cache->origin_dev = ca->origin_dev;
   2378	cache->cache_dev = ca->cache_dev;
   2379
   2380	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
   2381
   2382	origin_blocks = cache->origin_sectors = ca->origin_sectors;
   2383	origin_blocks = block_div(origin_blocks, ca->block_size);
   2384	cache->origin_blocks = to_oblock(origin_blocks);
   2385
   2386	cache->sectors_per_block = ca->block_size;
   2387	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
   2388		r = -EINVAL;
   2389		goto bad;
   2390	}
   2391
   2392	if (ca->block_size & (ca->block_size - 1)) {
   2393		dm_block_t cache_size = ca->cache_sectors;
   2394
   2395		cache->sectors_per_block_shift = -1;
   2396		cache_size = block_div(cache_size, ca->block_size);
   2397		set_cache_size(cache, to_cblock(cache_size));
   2398	} else {
   2399		cache->sectors_per_block_shift = __ffs(ca->block_size);
   2400		set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
   2401	}
   2402
   2403	r = create_cache_policy(cache, ca, error);
   2404	if (r)
   2405		goto bad;
   2406
   2407	cache->policy_nr_args = ca->policy_argc;
   2408	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
   2409
   2410	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
   2411	if (r) {
   2412		*error = "Error setting cache policy's config values";
   2413		goto bad;
   2414	}
   2415
   2416	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
   2417				     ca->block_size, may_format,
   2418				     dm_cache_policy_get_hint_size(cache->policy),
   2419				     ca->features.metadata_version);
   2420	if (IS_ERR(cmd)) {
   2421		*error = "Error creating metadata object";
   2422		r = PTR_ERR(cmd);
   2423		goto bad;
   2424	}
   2425	cache->cmd = cmd;
   2426	set_cache_mode(cache, CM_WRITE);
   2427	if (get_cache_mode(cache) != CM_WRITE) {
   2428		*error = "Unable to get write access to metadata, please check/repair metadata.";
   2429		r = -EINVAL;
   2430		goto bad;
   2431	}
   2432
   2433	if (passthrough_mode(cache)) {
   2434		bool all_clean;
   2435
   2436		r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
   2437		if (r) {
   2438			*error = "dm_cache_metadata_all_clean() failed";
   2439			goto bad;
   2440		}
   2441
   2442		if (!all_clean) {
   2443			*error = "Cannot enter passthrough mode unless all blocks are clean";
   2444			r = -EINVAL;
   2445			goto bad;
   2446		}
   2447
   2448		policy_allow_migrations(cache->policy, false);
   2449	}
   2450
   2451	spin_lock_init(&cache->lock);
   2452	bio_list_init(&cache->deferred_bios);
   2453	atomic_set(&cache->nr_allocated_migrations, 0);
   2454	atomic_set(&cache->nr_io_migrations, 0);
   2455	init_waitqueue_head(&cache->migration_wait);
   2456
   2457	r = -ENOMEM;
   2458	atomic_set(&cache->nr_dirty, 0);
   2459	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
   2460	if (!cache->dirty_bitset) {
   2461		*error = "could not allocate dirty bitset";
   2462		goto bad;
   2463	}
   2464	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
   2465
   2466	cache->discard_block_size =
   2467		calculate_discard_block_size(cache->sectors_per_block,
   2468					     cache->origin_sectors);
   2469	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
   2470							      cache->discard_block_size));
   2471	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
   2472	if (!cache->discard_bitset) {
   2473		*error = "could not allocate discard bitset";
   2474		goto bad;
   2475	}
   2476	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
   2477
   2478	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
   2479	if (IS_ERR(cache->copier)) {
   2480		*error = "could not create kcopyd client";
   2481		r = PTR_ERR(cache->copier);
   2482		goto bad;
   2483	}
   2484
   2485	cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
   2486	if (!cache->wq) {
   2487		*error = "could not create workqueue for metadata object";
   2488		goto bad;
   2489	}
   2490	INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
   2491	INIT_WORK(&cache->migration_worker, check_migrations);
   2492	INIT_DELAYED_WORK(&cache->waker, do_waker);
   2493
   2494	cache->prison = dm_bio_prison_create_v2(cache->wq);
   2495	if (!cache->prison) {
   2496		*error = "could not create bio prison";
   2497		goto bad;
   2498	}
   2499
   2500	r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
   2501				   migration_cache);
   2502	if (r) {
   2503		*error = "Error creating cache's migration mempool";
   2504		goto bad;
   2505	}
   2506
   2507	cache->need_tick_bio = true;
   2508	cache->sized = false;
   2509	cache->invalidate = false;
   2510	cache->commit_requested = false;
   2511	cache->loaded_mappings = false;
   2512	cache->loaded_discards = false;
   2513
   2514	load_stats(cache);
   2515
   2516	atomic_set(&cache->stats.demotion, 0);
   2517	atomic_set(&cache->stats.promotion, 0);
   2518	atomic_set(&cache->stats.copies_avoided, 0);
   2519	atomic_set(&cache->stats.cache_cell_clash, 0);
   2520	atomic_set(&cache->stats.commit_count, 0);
   2521	atomic_set(&cache->stats.discard_count, 0);
   2522
   2523	spin_lock_init(&cache->invalidation_lock);
   2524	INIT_LIST_HEAD(&cache->invalidation_requests);
   2525
   2526	batcher_init(&cache->committer, commit_op, cache,
   2527		     issue_op, cache, cache->wq);
   2528	dm_iot_init(&cache->tracker);
   2529
   2530	init_rwsem(&cache->background_work_lock);
   2531	prevent_background_work(cache);
   2532
   2533	*result = cache;
   2534	return 0;
   2535bad:
   2536	destroy(cache);
   2537	return r;
   2538}
   2539
   2540static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
   2541{
   2542	unsigned i;
   2543	const char **copy;
   2544
   2545	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
   2546	if (!copy)
   2547		return -ENOMEM;
   2548	for (i = 0; i < argc; i++) {
   2549		copy[i] = kstrdup(argv[i], GFP_KERNEL);
   2550		if (!copy[i]) {
   2551			while (i--)
   2552				kfree(copy[i]);
   2553			kfree(copy);
   2554			return -ENOMEM;
   2555		}
   2556	}
   2557
   2558	cache->nr_ctr_args = argc;
   2559	cache->ctr_args = copy;
   2560
   2561	return 0;
   2562}
   2563
   2564static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
   2565{
   2566	int r = -EINVAL;
   2567	struct cache_args *ca;
   2568	struct cache *cache = NULL;
   2569
   2570	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
   2571	if (!ca) {
   2572		ti->error = "Error allocating memory for cache";
   2573		return -ENOMEM;
   2574	}
   2575	ca->ti = ti;
   2576
   2577	r = parse_cache_args(ca, argc, argv, &ti->error);
   2578	if (r)
   2579		goto out;
   2580
   2581	r = cache_create(ca, &cache);
   2582	if (r)
   2583		goto out;
   2584
   2585	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
   2586	if (r) {
   2587		destroy(cache);
   2588		goto out;
   2589	}
   2590
   2591	ti->private = cache;
   2592out:
   2593	destroy_cache_args(ca);
   2594	return r;
   2595}
   2596
   2597/*----------------------------------------------------------------*/
   2598
   2599static int cache_map(struct dm_target *ti, struct bio *bio)
   2600{
   2601	struct cache *cache = ti->private;
   2602
   2603	int r;
   2604	bool commit_needed;
   2605	dm_oblock_t block = get_bio_block(cache, bio);
   2606
   2607	init_per_bio_data(bio);
   2608	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
   2609		/*
   2610		 * This can only occur if the io goes to a partial block at
   2611		 * the end of the origin device.  We don't cache these.
   2612		 * Just remap to the origin and carry on.
   2613		 */
   2614		remap_to_origin(cache, bio);
   2615		accounted_begin(cache, bio);
   2616		return DM_MAPIO_REMAPPED;
   2617	}
   2618
   2619	if (discard_or_flush(bio)) {
   2620		defer_bio(cache, bio);
   2621		return DM_MAPIO_SUBMITTED;
   2622	}
   2623
   2624	r = map_bio(cache, bio, block, &commit_needed);
   2625	if (commit_needed)
   2626		schedule_commit(&cache->committer);
   2627
   2628	return r;
   2629}
   2630
   2631static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
   2632{
   2633	struct cache *cache = ti->private;
   2634	unsigned long flags;
   2635	struct per_bio_data *pb = get_per_bio_data(bio);
   2636
   2637	if (pb->tick) {
   2638		policy_tick(cache->policy, false);
   2639
   2640		spin_lock_irqsave(&cache->lock, flags);
   2641		cache->need_tick_bio = true;
   2642		spin_unlock_irqrestore(&cache->lock, flags);
   2643	}
   2644
   2645	bio_drop_shared_lock(cache, bio);
   2646	accounted_complete(cache, bio);
   2647
   2648	return DM_ENDIO_DONE;
   2649}
   2650
   2651static int write_dirty_bitset(struct cache *cache)
   2652{
   2653	int r;
   2654
   2655	if (get_cache_mode(cache) >= CM_READ_ONLY)
   2656		return -EINVAL;
   2657
   2658	r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
   2659	if (r)
   2660		metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
   2661
   2662	return r;
   2663}
   2664
   2665static int write_discard_bitset(struct cache *cache)
   2666{
   2667	unsigned i, r;
   2668
   2669	if (get_cache_mode(cache) >= CM_READ_ONLY)
   2670		return -EINVAL;
   2671
   2672	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
   2673					   cache->discard_nr_blocks);
   2674	if (r) {
   2675		DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
   2676		metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
   2677		return r;
   2678	}
   2679
   2680	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
   2681		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
   2682					 is_discarded(cache, to_dblock(i)));
   2683		if (r) {
   2684			metadata_operation_failed(cache, "dm_cache_set_discard", r);
   2685			return r;
   2686		}
   2687	}
   2688
   2689	return 0;
   2690}
   2691
   2692static int write_hints(struct cache *cache)
   2693{
   2694	int r;
   2695
   2696	if (get_cache_mode(cache) >= CM_READ_ONLY)
   2697		return -EINVAL;
   2698
   2699	r = dm_cache_write_hints(cache->cmd, cache->policy);
   2700	if (r) {
   2701		metadata_operation_failed(cache, "dm_cache_write_hints", r);
   2702		return r;
   2703	}
   2704
   2705	return 0;
   2706}
   2707
   2708/*
   2709 * returns true on success
   2710 */
   2711static bool sync_metadata(struct cache *cache)
   2712{
   2713	int r1, r2, r3, r4;
   2714
   2715	r1 = write_dirty_bitset(cache);
   2716	if (r1)
   2717		DMERR("%s: could not write dirty bitset", cache_device_name(cache));
   2718
   2719	r2 = write_discard_bitset(cache);
   2720	if (r2)
   2721		DMERR("%s: could not write discard bitset", cache_device_name(cache));
   2722
   2723	save_stats(cache);
   2724
   2725	r3 = write_hints(cache);
   2726	if (r3)
   2727		DMERR("%s: could not write hints", cache_device_name(cache));
   2728
   2729	/*
   2730	 * If writing the above metadata failed, we still commit, but don't
   2731	 * set the clean shutdown flag.  This will effectively force every
   2732	 * dirty bit to be set on reload.
   2733	 */
   2734	r4 = commit(cache, !r1 && !r2 && !r3);
   2735	if (r4)
   2736		DMERR("%s: could not write cache metadata", cache_device_name(cache));
   2737
   2738	return !r1 && !r2 && !r3 && !r4;
   2739}
   2740
   2741static void cache_postsuspend(struct dm_target *ti)
   2742{
   2743	struct cache *cache = ti->private;
   2744
   2745	prevent_background_work(cache);
   2746	BUG_ON(atomic_read(&cache->nr_io_migrations));
   2747
   2748	cancel_delayed_work_sync(&cache->waker);
   2749	drain_workqueue(cache->wq);
   2750	WARN_ON(cache->tracker.in_flight);
   2751
   2752	/*
   2753	 * If it's a flush suspend there won't be any deferred bios, so this
   2754	 * call is harmless.
   2755	 */
   2756	requeue_deferred_bios(cache);
   2757
   2758	if (get_cache_mode(cache) == CM_WRITE)
   2759		(void) sync_metadata(cache);
   2760}
   2761
   2762static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
   2763			bool dirty, uint32_t hint, bool hint_valid)
   2764{
   2765	struct cache *cache = context;
   2766
   2767	if (dirty) {
   2768		set_bit(from_cblock(cblock), cache->dirty_bitset);
   2769		atomic_inc(&cache->nr_dirty);
   2770	} else
   2771		clear_bit(from_cblock(cblock), cache->dirty_bitset);
   2772
   2773	return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
   2774}
   2775
   2776/*
    2777 * The discard block size in the on-disk metadata is not
    2778 * necessarily the same as the one we're currently using.  So we have to
   2779 * be careful to only set the discarded attribute if we know it
   2780 * covers a complete block of the new size.
   2781 */
   2782struct discard_load_info {
   2783	struct cache *cache;
   2784
   2785	/*
   2786	 * These blocks are sized using the on disk dblock size, rather
   2787	 * than the current one.
   2788	 */
   2789	dm_block_t block_size;
   2790	dm_block_t discard_begin, discard_end;
   2791};
   2792
   2793static void discard_load_info_init(struct cache *cache,
   2794				   struct discard_load_info *li)
   2795{
   2796	li->cache = cache;
   2797	li->discard_begin = li->discard_end = 0;
   2798}
   2799
   2800static void set_discard_range(struct discard_load_info *li)
   2801{
   2802	sector_t b, e;
   2803
   2804	if (li->discard_begin == li->discard_end)
   2805		return;
   2806
   2807	/*
   2808	 * Convert to sectors.
   2809	 */
   2810	b = li->discard_begin * li->block_size;
   2811	e = li->discard_end * li->block_size;
   2812
   2813	/*
   2814	 * Then convert back to the current dblock size.
   2815	 */
   2816	b = dm_sector_div_up(b, li->cache->discard_block_size);
   2817	sector_div(e, li->cache->discard_block_size);
   2818
   2819	/*
   2820	 * The origin may have shrunk, so we need to check we're still in
   2821	 * bounds.
   2822	 */
   2823	if (e > from_dblock(li->cache->discard_nr_blocks))
   2824		e = from_dblock(li->cache->discard_nr_blocks);
   2825
   2826	for (; b < e; b++)
   2827		set_discard(li->cache, to_dblock(b));
   2828}
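
/*
 * Worked example of the conversion in set_discard_range() (illustrative
 * numbers only): with an on-disk dblock size of 128 sectors, a loaded
 * range of dblocks [10, 14) covers sectors [1280, 1792).  If the current
 * discard_block_size is 512 sectors, b = ceil(1280 / 512) = 3 and
 * e = floor(1792 / 512) = 3, so no current dblock is completely covered
 * and nothing is marked discarded, which is exactly the caution described
 * above.
 */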
   2829
   2830static int load_discard(void *context, sector_t discard_block_size,
   2831			dm_dblock_t dblock, bool discard)
   2832{
   2833	struct discard_load_info *li = context;
   2834
   2835	li->block_size = discard_block_size;
   2836
   2837	if (discard) {
   2838		if (from_dblock(dblock) == li->discard_end)
   2839			/*
   2840			 * We're already in a discard range, just extend it.
   2841			 */
   2842			li->discard_end = li->discard_end + 1ULL;
   2843
   2844		else {
   2845			/*
   2846			 * Emit the old range and start a new one.
   2847			 */
   2848			set_discard_range(li);
   2849			li->discard_begin = from_dblock(dblock);
   2850			li->discard_end = li->discard_begin + 1ULL;
   2851		}
   2852	} else {
   2853		set_discard_range(li);
   2854		li->discard_begin = li->discard_end = 0;
   2855	}
   2856
   2857	return 0;
   2858}
   2859
   2860static dm_cblock_t get_cache_dev_size(struct cache *cache)
   2861{
   2862	sector_t size = get_dev_size(cache->cache_dev);
   2863	(void) sector_div(size, cache->sectors_per_block);
   2864	return to_cblock(size);
   2865}
   2866
   2867static bool can_resize(struct cache *cache, dm_cblock_t new_size)
   2868{
   2869	if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
   2870		if (cache->sized) {
   2871			DMERR("%s: unable to extend cache due to missing cache table reload",
   2872			      cache_device_name(cache));
   2873			return false;
   2874		}
   2875	}
   2876
   2877	/*
   2878	 * We can't drop a dirty block when shrinking the cache.
   2879	 */
   2880	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
    2881		if (is_dirty(cache, new_size)) {
    2882			DMERR("%s: unable to shrink cache; cache block %llu is dirty",
    2883			      cache_device_name(cache),
    2884			      (unsigned long long) from_cblock(new_size));
    2885			return false;
    2886		}
    2887		new_size = to_cblock(from_cblock(new_size) + 1);
   2888	}
   2889
   2890	return true;
   2891}
   2892
   2893static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
   2894{
   2895	int r;
   2896
   2897	r = dm_cache_resize(cache->cmd, new_size);
   2898	if (r) {
   2899		DMERR("%s: could not resize cache metadata", cache_device_name(cache));
   2900		metadata_operation_failed(cache, "dm_cache_resize", r);
   2901		return r;
   2902	}
   2903
   2904	set_cache_size(cache, new_size);
   2905
   2906	return 0;
   2907}
   2908
   2909static int cache_preresume(struct dm_target *ti)
   2910{
   2911	int r = 0;
   2912	struct cache *cache = ti->private;
   2913	dm_cblock_t csize = get_cache_dev_size(cache);
   2914
   2915	/*
   2916	 * Check to see if the cache has resized.
   2917	 */
   2918	if (!cache->sized) {
   2919		r = resize_cache_dev(cache, csize);
   2920		if (r)
   2921			return r;
   2922
   2923		cache->sized = true;
   2924
   2925	} else if (csize != cache->cache_size) {
   2926		if (!can_resize(cache, csize))
   2927			return -EINVAL;
   2928
   2929		r = resize_cache_dev(cache, csize);
   2930		if (r)
   2931			return r;
   2932	}
   2933
   2934	if (!cache->loaded_mappings) {
   2935		r = dm_cache_load_mappings(cache->cmd, cache->policy,
   2936					   load_mapping, cache);
   2937		if (r) {
   2938			DMERR("%s: could not load cache mappings", cache_device_name(cache));
   2939			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
   2940			return r;
   2941		}
   2942
   2943		cache->loaded_mappings = true;
   2944	}
   2945
   2946	if (!cache->loaded_discards) {
   2947		struct discard_load_info li;
   2948
   2949		/*
   2950		 * The discard bitset could have been resized, or the
   2951		 * discard block size changed.  To be safe we start by
   2952		 * setting every dblock to not discarded.
   2953		 */
   2954		clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
   2955
   2956		discard_load_info_init(cache, &li);
   2957		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
   2958		if (r) {
   2959			DMERR("%s: could not load origin discards", cache_device_name(cache));
   2960			metadata_operation_failed(cache, "dm_cache_load_discards", r);
   2961			return r;
   2962		}
   2963		set_discard_range(&li);
   2964
   2965		cache->loaded_discards = true;
   2966	}
   2967
   2968	return r;
   2969}
   2970
   2971static void cache_resume(struct dm_target *ti)
   2972{
   2973	struct cache *cache = ti->private;
   2974
   2975	cache->need_tick_bio = true;
   2976	allow_background_work(cache);
   2977	do_waker(&cache->waker.work);
   2978}
   2979
   2980static void emit_flags(struct cache *cache, char *result,
   2981		       unsigned maxlen, ssize_t *sz_ptr)
   2982{
   2983	ssize_t sz = *sz_ptr;
   2984	struct cache_features *cf = &cache->features;
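	/*
	 * The trailing '+ 1' in the count below accounts for the io mode
	 * word, which is always emitted; metadata2 and no_discard_passdown
	 * each contribute one more word when set.
	 */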
   2985	unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1;
   2986
   2987	DMEMIT("%u ", count);
   2988
   2989	if (cf->metadata_version == 2)
   2990		DMEMIT("metadata2 ");
   2991
   2992	if (writethrough_mode(cache))
   2993		DMEMIT("writethrough ");
   2994
   2995	else if (passthrough_mode(cache))
   2996		DMEMIT("passthrough ");
   2997
   2998	else if (writeback_mode(cache))
   2999		DMEMIT("writeback ");
   3000
   3001	else {
   3002		DMEMIT("unknown ");
   3003		DMERR("%s: internal error: unknown io mode: %d",
   3004		      cache_device_name(cache), (int) cf->io_mode);
   3005	}
   3006
   3007	if (!cf->discard_passdown)
   3008		DMEMIT("no_discard_passdown ");
   3009
   3010	*sz_ptr = sz;
   3011}
   3012
   3013/*
   3014 * Status format:
   3015 *
   3016 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
   3017 * <cache block size> <#used cache blocks>/<#total cache blocks>
   3018 * <#read hits> <#read misses> <#write hits> <#write misses>
   3019 * <#demotions> <#promotions> <#dirty>
   3020 * <#features> <features>*
   3021 * <#core args> <core args>
   3022 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
   3023 */
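/*
 * Illustrative STATUSTYPE_INFO output (hypothetical numbers, shown wrapped
 * but emitted as a single line), matching the field order documented above:
 *
 *   8 27/2048 512 129/4096 586 23 1021 74 16 16 7 1 writeback
 *       2 migration_threshold 2048 smq <#policy args> <policy args>* rw -
 *
 * i.e. 8-sector metadata blocks with 27 of 2048 used, 512-sector cache
 * blocks with 129 of 4096 resident, the hit/miss/demotion/promotion/dirty
 * counters, one feature flag (writeback), the two core args
 * (migration_threshold), the policy section, writable metadata (rw) and
 * no needs_check flag.
 */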
   3024static void cache_status(struct dm_target *ti, status_type_t type,
   3025			 unsigned status_flags, char *result, unsigned maxlen)
   3026{
   3027	int r = 0;
   3028	unsigned i;
   3029	ssize_t sz = 0;
   3030	dm_block_t nr_free_blocks_metadata = 0;
   3031	dm_block_t nr_blocks_metadata = 0;
   3032	char buf[BDEVNAME_SIZE];
   3033	struct cache *cache = ti->private;
   3034	dm_cblock_t residency;
   3035	bool needs_check;
   3036
   3037	switch (type) {
   3038	case STATUSTYPE_INFO:
   3039		if (get_cache_mode(cache) == CM_FAIL) {
   3040			DMEMIT("Fail");
   3041			break;
   3042		}
   3043
   3044		/* Commit to ensure statistics aren't out-of-date */
   3045		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
   3046			(void) commit(cache, false);
   3047
   3048		r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
   3049		if (r) {
   3050			DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
   3051			      cache_device_name(cache), r);
   3052			goto err;
   3053		}
   3054
   3055		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
   3056		if (r) {
   3057			DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
   3058			      cache_device_name(cache), r);
   3059			goto err;
   3060		}
   3061
   3062		residency = policy_residency(cache->policy);
   3063
   3064		DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
   3065		       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
   3066		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
   3067		       (unsigned long long)nr_blocks_metadata,
   3068		       (unsigned long long)cache->sectors_per_block,
   3069		       (unsigned long long) from_cblock(residency),
   3070		       (unsigned long long) from_cblock(cache->cache_size),
   3071		       (unsigned) atomic_read(&cache->stats.read_hit),
   3072		       (unsigned) atomic_read(&cache->stats.read_miss),
   3073		       (unsigned) atomic_read(&cache->stats.write_hit),
   3074		       (unsigned) atomic_read(&cache->stats.write_miss),
   3075		       (unsigned) atomic_read(&cache->stats.demotion),
   3076		       (unsigned) atomic_read(&cache->stats.promotion),
   3077		       (unsigned long) atomic_read(&cache->nr_dirty));
   3078
   3079		emit_flags(cache, result, maxlen, &sz);
   3080
   3081		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
   3082
   3083		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
   3084		if (sz < maxlen) {
   3085			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
   3086			if (r)
   3087				DMERR("%s: policy_emit_config_values returned %d",
   3088				      cache_device_name(cache), r);
   3089		}
   3090
   3091		if (get_cache_mode(cache) == CM_READ_ONLY)
   3092			DMEMIT("ro ");
   3093		else
   3094			DMEMIT("rw ");
   3095
   3096		r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
   3097
   3098		if (r || needs_check)
   3099			DMEMIT("needs_check ");
   3100		else
   3101			DMEMIT("- ");
   3102
   3103		break;
   3104
   3105	case STATUSTYPE_TABLE:
   3106		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
   3107		DMEMIT("%s ", buf);
   3108		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
   3109		DMEMIT("%s ", buf);
   3110		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
   3111		DMEMIT("%s", buf);
   3112
   3113		for (i = 0; i < cache->nr_ctr_args - 1; i++)
   3114			DMEMIT(" %s", cache->ctr_args[i]);
   3115		if (cache->nr_ctr_args)
   3116			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
   3117		break;
   3118
   3119	case STATUSTYPE_IMA:
   3120		DMEMIT_TARGET_NAME_VERSION(ti->type);
   3121		if (get_cache_mode(cache) == CM_FAIL)
   3122			DMEMIT(",metadata_mode=fail");
   3123		else if (get_cache_mode(cache) == CM_READ_ONLY)
   3124			DMEMIT(",metadata_mode=ro");
   3125		else
   3126			DMEMIT(",metadata_mode=rw");
   3127
   3128		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
   3129		DMEMIT(",cache_metadata_device=%s", buf);
   3130		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
   3131		DMEMIT(",cache_device=%s", buf);
   3132		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
   3133		DMEMIT(",cache_origin_device=%s", buf);
   3134		DMEMIT(",writethrough=%c", writethrough_mode(cache) ? 'y' : 'n');
   3135		DMEMIT(",writeback=%c", writeback_mode(cache) ? 'y' : 'n');
   3136		DMEMIT(",passthrough=%c", passthrough_mode(cache) ? 'y' : 'n');
   3137		DMEMIT(",metadata2=%c", cache->features.metadata_version == 2 ? 'y' : 'n');
   3138		DMEMIT(",no_discard_passdown=%c", cache->features.discard_passdown ? 'n' : 'y');
   3139		DMEMIT(";");
   3140		break;
   3141	}
   3142
   3143	return;
   3144
   3145err:
   3146	DMEMIT("Error");
   3147}
   3148
   3149/*
    3150 * Defines a range of cblocks: begin to (end - 1) are in the range; end is
   3151 * the one-past-the-end value.
   3152 */
   3153struct cblock_range {
   3154	dm_cblock_t begin;
   3155	dm_cblock_t end;
   3156};
   3157
   3158/*
   3159 * A cache block range can take two forms:
   3160 *
   3161 * i) A single cblock, eg. '3456'
   3162 * ii) A begin and end cblock with a dash between, eg. 123-234
   3163 */
   3164static int parse_cblock_range(struct cache *cache, const char *str,
   3165			      struct cblock_range *result)
   3166{
   3167	char dummy;
   3168	uint64_t b, e;
   3169	int r;
   3170
   3171	/*
   3172	 * Try and parse form (ii) first.
   3173	 */
   3174	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
   3175	if (r < 0)
   3176		return r;
   3177
   3178	if (r == 2) {
   3179		result->begin = to_cblock(b);
   3180		result->end = to_cblock(e);
   3181		return 0;
   3182	}
   3183
   3184	/*
   3185	 * That didn't work, try form (i).
   3186	 */
   3187	r = sscanf(str, "%llu%c", &b, &dummy);
   3188	if (r < 0)
   3189		return r;
   3190
   3191	if (r == 1) {
   3192		result->begin = to_cblock(b);
   3193		result->end = to_cblock(from_cblock(result->begin) + 1u);
   3194		return 0;
   3195	}
   3196
   3197	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
   3198	return -EINVAL;
   3199}
   3200
   3201static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
   3202{
   3203	uint64_t b = from_cblock(range->begin);
   3204	uint64_t e = from_cblock(range->end);
   3205	uint64_t n = from_cblock(cache->cache_size);
   3206
   3207	if (b >= n) {
   3208		DMERR("%s: begin cblock out of range: %llu >= %llu",
   3209		      cache_device_name(cache), b, n);
   3210		return -EINVAL;
   3211	}
   3212
   3213	if (e > n) {
   3214		DMERR("%s: end cblock out of range: %llu > %llu",
   3215		      cache_device_name(cache), e, n);
   3216		return -EINVAL;
   3217	}
   3218
   3219	if (b >= e) {
   3220		DMERR("%s: invalid cblock range: %llu >= %llu",
   3221		      cache_device_name(cache), b, e);
   3222		return -EINVAL;
   3223	}
   3224
   3225	return 0;
   3226}
   3227
   3228static inline dm_cblock_t cblock_succ(dm_cblock_t b)
   3229{
   3230	return to_cblock(from_cblock(b) + 1);
   3231}
   3232
   3233static int request_invalidation(struct cache *cache, struct cblock_range *range)
   3234{
   3235	int r = 0;
   3236
   3237	/*
   3238	 * We don't need to do any locking here because we know we're in
    3239	 * passthrough mode.  There is potential for a race between an
    3240	 * invalidation triggered by an io and an invalidation message.  This
    3241	 * is harmless; we needn't worry if the policy call fails.
   3242	 */
   3243	while (range->begin != range->end) {
   3244		r = invalidate_cblock(cache, range->begin);
   3245		if (r)
   3246			return r;
   3247
   3248		range->begin = cblock_succ(range->begin);
   3249	}
   3250
   3251	cache->commit_requested = true;
   3252	return r;
   3253}
   3254
   3255static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
   3256					      const char **cblock_ranges)
   3257{
   3258	int r = 0;
   3259	unsigned i;
   3260	struct cblock_range range;
   3261
   3262	if (!passthrough_mode(cache)) {
   3263		DMERR("%s: cache has to be in passthrough mode for invalidation",
   3264		      cache_device_name(cache));
   3265		return -EPERM;
   3266	}
   3267
   3268	for (i = 0; i < count; i++) {
   3269		r = parse_cblock_range(cache, cblock_ranges[i], &range);
   3270		if (r)
   3271			break;
   3272
   3273		r = validate_cblock_range(cache, &range);
   3274		if (r)
   3275			break;
   3276
   3277		/*
   3278		 * Pass begin and end origin blocks to the worker and wake it.
   3279		 */
   3280		r = request_invalidation(cache, &range);
   3281		if (r)
   3282			break;
   3283	}
   3284
   3285	return r;
   3286}
   3287
   3288/*
   3289 * Supports
   3290 *	"<key> <value>"
   3291 * and
    3292 *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
   3293 *
   3294 * The key migration_threshold is supported by the cache target core.
   3295 */
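/*
 * Example message invocations (hypothetical device name), a sketch of the
 * two forms accepted above:
 *
 *   dmsetup message my-cache 0 migration_threshold 4096
 *   dmsetup message my-cache 0 invalidate_cblocks 3456 7000-8000
 *
 * The second form requires the cache to be in passthrough mode, see
 * process_invalidate_cblocks_message().
 */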
   3296static int cache_message(struct dm_target *ti, unsigned argc, char **argv,
   3297			 char *result, unsigned maxlen)
   3298{
   3299	struct cache *cache = ti->private;
   3300
   3301	if (!argc)
   3302		return -EINVAL;
   3303
   3304	if (get_cache_mode(cache) >= CM_READ_ONLY) {
   3305		DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
   3306		      cache_device_name(cache));
   3307		return -EOPNOTSUPP;
   3308	}
   3309
   3310	if (!strcasecmp(argv[0], "invalidate_cblocks"))
   3311		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
   3312
   3313	if (argc != 2)
   3314		return -EINVAL;
   3315
   3316	return set_config_value(cache, argv[0], argv[1]);
   3317}
   3318
   3319static int cache_iterate_devices(struct dm_target *ti,
   3320				 iterate_devices_callout_fn fn, void *data)
   3321{
   3322	int r = 0;
   3323	struct cache *cache = ti->private;
   3324
   3325	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
   3326	if (!r)
   3327		r = fn(ti, cache->origin_dev, 0, ti->len, data);
   3328
   3329	return r;
   3330}
   3331
   3332/*
   3333 * If discard_passdown was enabled verify that the origin device
   3334 * supports discards.  Disable discard_passdown if not.
   3335 */
   3336static void disable_passdown_if_not_supported(struct cache *cache)
   3337{
   3338	struct block_device *origin_bdev = cache->origin_dev->bdev;
   3339	struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
   3340	const char *reason = NULL;
   3341
   3342	if (!cache->features.discard_passdown)
   3343		return;
   3344
   3345	if (!bdev_max_discard_sectors(origin_bdev))
   3346		reason = "discard unsupported";
   3347
   3348	else if (origin_limits->max_discard_sectors < cache->sectors_per_block)
   3349		reason = "max discard sectors smaller than a block";
   3350
   3351	if (reason) {
   3352		DMWARN("Origin device (%pg) %s: Disabling discard passdown.",
   3353		       origin_bdev, reason);
   3354		cache->features.discard_passdown = false;
   3355	}
   3356}
   3357
   3358static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
   3359{
   3360	struct block_device *origin_bdev = cache->origin_dev->bdev;
   3361	struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
   3362
   3363	if (!cache->features.discard_passdown) {
    3364		/* No passdown is done, so set our own virtual limits. */
   3365		limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
   3366						    cache->origin_sectors);
   3367		limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
   3368		return;
   3369	}
   3370
   3371	/*
   3372	 * cache_iterate_devices() is stacking both origin and fast device limits
   3373	 * but discards aren't passed to fast device, so inherit origin's limits.
   3374	 */
   3375	limits->max_discard_sectors = origin_limits->max_discard_sectors;
   3376	limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
   3377	limits->discard_granularity = origin_limits->discard_granularity;
   3378	limits->discard_alignment = origin_limits->discard_alignment;
   3379	limits->discard_misaligned = origin_limits->discard_misaligned;
   3380}
   3381
   3382static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
   3383{
   3384	struct cache *cache = ti->private;
   3385	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
   3386
   3387	/*
   3388	 * If the system-determined stacked limits are compatible with the
   3389	 * cache's blocksize (io_opt is a factor) do not override them.
   3390	 */
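	/*
	 * E.g. with 512-sector cache blocks (illustrative figures): a
	 * stacked io_opt of 2048 sectors (1MiB) is a multiple of the block
	 * size and is left alone, whereas an io_opt of 768 sectors would
	 * reset io_min/io_opt to the 256KiB block size.
	 */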
   3391	if (io_opt_sectors < cache->sectors_per_block ||
   3392	    do_div(io_opt_sectors, cache->sectors_per_block)) {
   3393		blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
   3394		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
   3395	}
   3396
   3397	disable_passdown_if_not_supported(cache);
   3398	set_discard_limits(cache, limits);
   3399}
   3400
   3401/*----------------------------------------------------------------*/
   3402
   3403static struct target_type cache_target = {
   3404	.name = "cache",
   3405	.version = {2, 2, 0},
   3406	.module = THIS_MODULE,
   3407	.ctr = cache_ctr,
   3408	.dtr = cache_dtr,
   3409	.map = cache_map,
   3410	.end_io = cache_end_io,
   3411	.postsuspend = cache_postsuspend,
   3412	.preresume = cache_preresume,
   3413	.resume = cache_resume,
   3414	.status = cache_status,
   3415	.message = cache_message,
   3416	.iterate_devices = cache_iterate_devices,
   3417	.io_hints = cache_io_hints,
   3418};
   3419
   3420static int __init dm_cache_init(void)
   3421{
   3422	int r;
   3423
   3424	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
   3425	if (!migration_cache)
   3426		return -ENOMEM;
   3427
   3428	r = dm_register_target(&cache_target);
   3429	if (r) {
   3430		DMERR("cache target registration failed: %d", r);
   3431		kmem_cache_destroy(migration_cache);
   3432		return r;
   3433	}
   3434
   3435	return 0;
   3436}
   3437
   3438static void __exit dm_cache_exit(void)
   3439{
   3440	dm_unregister_target(&cache_target);
   3441	kmem_cache_destroy(migration_cache);
   3442}
   3443
   3444module_init(dm_cache_init);
   3445module_exit(dm_cache_exit);
   3446
   3447MODULE_DESCRIPTION(DM_NAME " cache target");
   3448MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
   3449MODULE_LICENSE("GPL");