cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-clone-target.c (55694B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
      4 */
      5
      6#include <linux/mm.h>
      7#include <linux/bio.h>
      8#include <linux/err.h>
      9#include <linux/hash.h>
     10#include <linux/list.h>
     11#include <linux/log2.h>
     12#include <linux/init.h>
     13#include <linux/slab.h>
     14#include <linux/wait.h>
     15#include <linux/dm-io.h>
     16#include <linux/mutex.h>
     17#include <linux/atomic.h>
     18#include <linux/bitops.h>
     19#include <linux/blkdev.h>
     20#include <linux/kdev_t.h>
     21#include <linux/kernel.h>
     22#include <linux/module.h>
     23#include <linux/jiffies.h>
     24#include <linux/mempool.h>
     25#include <linux/spinlock.h>
     26#include <linux/blk_types.h>
     27#include <linux/dm-kcopyd.h>
     28#include <linux/workqueue.h>
     29#include <linux/backing-dev.h>
     30#include <linux/device-mapper.h>
     31
     32#include "dm.h"
     33#include "dm-clone-metadata.h"
     34
     35#define DM_MSG_PREFIX "clone"
     36
     37/*
     38 * Minimum and maximum allowed region sizes
     39 */
     40#define MIN_REGION_SIZE (1 << 3)  /* 4KB */
     41#define MAX_REGION_SIZE (1 << 21) /* 1GB */
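        /*
         * Region sizes are expressed in 512-byte sectors, so 1 << 3 = 8
         * sectors is 4KB and 1 << 21 = 2097152 sectors is 1GB.
         */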
     42
     43#define MIN_HYDRATIONS 256 /* Size of hydration mempool */
     44#define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */
     45#define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */
     46
     47#define COMMIT_PERIOD HZ /* 1 sec */
     48
     49/*
     50 * Hydration hash table size: 1 << HASH_TABLE_BITS
     51 */
     52#define HASH_TABLE_BITS 15
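        /*
         * With HASH_TABLE_BITS == 15 the table has 1 << 15 = 32768 buckets,
         * each protected by its own spinlock; regions are mapped to buckets
         * with hash_long() in get_hash_table_bucket() below.
         */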
     53
     54DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle,
     55	"A percentage of time allocated for hydrating regions");
     56
     57/* Slab cache for struct dm_clone_region_hydration */
     58static struct kmem_cache *_hydration_cache;
     59
     60/* dm-clone metadata modes */
     61enum clone_metadata_mode {
     62	CM_WRITE,		/* metadata may be changed */
     63	CM_READ_ONLY,		/* metadata may not be changed */
     64	CM_FAIL,		/* all metadata I/O fails */
     65};
     66
     67struct hash_table_bucket;
     68
     69struct clone {
     70	struct dm_target *ti;
     71
     72	struct dm_dev *metadata_dev;
     73	struct dm_dev *dest_dev;
     74	struct dm_dev *source_dev;
     75
     76	unsigned long nr_regions;
     77	sector_t region_size;
     78	unsigned int region_shift;
     79
     80	/*
     81	 * A metadata commit and the actions taken in case it fails should run
     82	 * as a single atomic step.
     83	 */
     84	struct mutex commit_lock;
     85
     86	struct dm_clone_metadata *cmd;
     87
     88	/* Region hydration hash table */
     89	struct hash_table_bucket *ht;
     90
     91	atomic_t ios_in_flight;
     92
     93	wait_queue_head_t hydration_stopped;
     94
     95	mempool_t hydration_pool;
     96
     97	unsigned long last_commit_jiffies;
     98
     99	/*
    100	 * We defer incoming WRITE bios for regions that are not hydrated,
    101	 * until after these regions have been hydrated.
    102	 *
    103	 * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the
    104	 * metadata have been committed.
    105	 */
    106	spinlock_t lock;
    107	struct bio_list deferred_bios;
    108	struct bio_list deferred_discard_bios;
    109	struct bio_list deferred_flush_bios;
    110	struct bio_list deferred_flush_completions;
    111
    112	/* Maximum number of regions being copied during background hydration. */
    113	unsigned int hydration_threshold;
    114
    115	/* Number of regions to batch together during background hydration. */
    116	unsigned int hydration_batch_size;
    117
    118	/* Which region to hydrate next */
    119	unsigned long hydration_offset;
    120
    121	atomic_t hydrations_in_flight;
    122
    123	/*
    124	 * Save a copy of the table line rather than reconstructing it for the
    125	 * status.
    126	 */
    127	unsigned int nr_ctr_args;
    128	const char **ctr_args;
    129
    130	struct workqueue_struct *wq;
    131	struct work_struct worker;
    132	struct delayed_work waker;
    133
    134	struct dm_kcopyd_client *kcopyd_client;
    135
    136	enum clone_metadata_mode mode;
    137	unsigned long flags;
    138};
    139
    140/*
    141 * dm-clone flags
    142 */
    143#define DM_CLONE_DISCARD_PASSDOWN 0
    144#define DM_CLONE_HYDRATION_ENABLED 1
    145#define DM_CLONE_HYDRATION_SUSPENDED 2
    146
    147/*---------------------------------------------------------------------------*/
    148
    149/*
    150 * Metadata failure handling.
    151 */
    152static enum clone_metadata_mode get_clone_mode(struct clone *clone)
    153{
    154	return READ_ONCE(clone->mode);
    155}
    156
    157static const char *clone_device_name(struct clone *clone)
    158{
    159	return dm_table_device_name(clone->ti->table);
    160}
    161
    162static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
    163{
    164	static const char * const descs[] = {
    165		"read-write",
    166		"read-only",
    167		"fail"
    168	};
    169
    170	enum clone_metadata_mode old_mode = get_clone_mode(clone);
    171
    172	/* Never move out of fail mode */
    173	if (old_mode == CM_FAIL)
    174		new_mode = CM_FAIL;
    175
    176	switch (new_mode) {
    177	case CM_FAIL:
    178	case CM_READ_ONLY:
    179		dm_clone_metadata_set_read_only(clone->cmd);
    180		break;
    181
    182	case CM_WRITE:
    183		dm_clone_metadata_set_read_write(clone->cmd);
    184		break;
    185	}
    186
    187	WRITE_ONCE(clone->mode, new_mode);
    188
    189	if (new_mode != old_mode) {
    190		dm_table_event(clone->ti->table);
    191		DMINFO("%s: Switching to %s mode", clone_device_name(clone),
    192		       descs[(int)new_mode]);
    193	}
    194}
    195
    196static void __abort_transaction(struct clone *clone)
    197{
    198	const char *dev_name = clone_device_name(clone);
    199
    200	if (get_clone_mode(clone) >= CM_READ_ONLY)
    201		return;
    202
    203	DMERR("%s: Aborting current metadata transaction", dev_name);
    204	if (dm_clone_metadata_abort(clone->cmd)) {
    205		DMERR("%s: Failed to abort metadata transaction", dev_name);
    206		__set_clone_mode(clone, CM_FAIL);
    207	}
    208}
    209
    210static void __reload_in_core_bitset(struct clone *clone)
    211{
    212	const char *dev_name = clone_device_name(clone);
    213
    214	if (get_clone_mode(clone) == CM_FAIL)
    215		return;
    216
    217	/* Reload the on-disk bitset */
    218	DMINFO("%s: Reloading on-disk bitmap", dev_name);
    219	if (dm_clone_reload_in_core_bitset(clone->cmd)) {
    220		DMERR("%s: Failed to reload on-disk bitmap", dev_name);
    221		__set_clone_mode(clone, CM_FAIL);
    222	}
    223}
    224
    225static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
    226{
    227	DMERR("%s: Metadata operation `%s' failed: error = %d",
    228	      clone_device_name(clone), op, r);
    229
    230	__abort_transaction(clone);
    231	__set_clone_mode(clone, CM_READ_ONLY);
    232
    233	/*
    234	 * dm_clone_reload_in_core_bitset() may run concurrently with either
    235	 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but
    236	 * it's safe as we have already set the metadata to read-only mode.
    237	 */
    238	__reload_in_core_bitset(clone);
    239}
    240
    241/*---------------------------------------------------------------------------*/
    242
    243/* Wake up anyone waiting for region hydrations to stop */
    244static inline void wakeup_hydration_waiters(struct clone *clone)
    245{
    246	wake_up_all(&clone->hydration_stopped);
    247}
    248
    249static inline void wake_worker(struct clone *clone)
    250{
    251	queue_work(clone->wq, &clone->worker);
    252}
    253
    254/*---------------------------------------------------------------------------*/
    255
    256/*
    257 * bio helper functions.
    258 */
    259static inline void remap_to_source(struct clone *clone, struct bio *bio)
    260{
    261	bio_set_dev(bio, clone->source_dev->bdev);
    262}
    263
    264static inline void remap_to_dest(struct clone *clone, struct bio *bio)
    265{
    266	bio_set_dev(bio, clone->dest_dev->bdev);
    267}
    268
    269static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
    270{
    271	return op_is_flush(bio->bi_opf) &&
    272		dm_clone_changed_this_transaction(clone->cmd);
    273}
    274
    275/* Get the address of the region in sectors */
    276static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
    277{
    278	return ((sector_t)region_nr << clone->region_shift);
    279}
    280
    281/* Get the region number of the bio */
    282static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
    283{
    284	return (bio->bi_iter.bi_sector >> clone->region_shift);
    285}
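        /*
         * As a worked example (illustrative numbers): with an 8-sector (4KB)
         * region size, region_shift is 3, so a bio starting at sector 20
         * belongs to region 20 >> 3 = 2, and region 2 starts at sector
         * 2 << 3 = 16.
         */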
    286
    287/* Get the region range covered by the bio */
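        /*
         * Note that the start region is rounded up and the end region is
         * rounded down, so only regions fully covered by the bio are counted;
         * regions the bio covers only partially (e.g. a misaligned discard)
         * are left out.
         */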
    288static void bio_region_range(struct clone *clone, struct bio *bio,
    289			     unsigned long *rs, unsigned long *nr_regions)
    290{
    291	unsigned long end;
    292
    293	*rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
    294	end = bio_end_sector(bio) >> clone->region_shift;
    295
    296	if (*rs >= end)
    297		*nr_regions = 0;
    298	else
    299		*nr_regions = end - *rs;
    300}
    301
    302/* Check whether a bio overwrites a region */
    303static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
    304{
    305	return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
    306}
    307
    308static void fail_bios(struct bio_list *bios, blk_status_t status)
    309{
    310	struct bio *bio;
    311
    312	while ((bio = bio_list_pop(bios))) {
    313		bio->bi_status = status;
    314		bio_endio(bio);
    315	}
    316}
    317
    318static void submit_bios(struct bio_list *bios)
    319{
    320	struct bio *bio;
    321	struct blk_plug plug;
    322
    323	blk_start_plug(&plug);
    324
    325	while ((bio = bio_list_pop(bios)))
    326		submit_bio_noacct(bio);
    327
    328	blk_finish_plug(&plug);
    329}
    330
    331/*
    332 * Submit bio to the underlying device.
    333 *
    334 * If the bio triggers a commit, delay it, until after the metadata have been
    335 * committed.
    336 *
    337 * NOTE: The bio remapping must be performed by the caller.
    338 */
    339static void issue_bio(struct clone *clone, struct bio *bio)
    340{
    341	if (!bio_triggers_commit(clone, bio)) {
    342		submit_bio_noacct(bio);
    343		return;
    344	}
    345
    346	/*
    347	 * If the metadata mode is RO or FAIL we won't be able to commit the
    348	 * metadata, so we complete the bio with an error.
    349	 */
    350	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
    351		bio_io_error(bio);
    352		return;
    353	}
    354
    355	/*
    356	 * Batch together any bios that trigger commits and then issue a single
    357	 * commit for them in process_deferred_flush_bios().
    358	 */
    359	spin_lock_irq(&clone->lock);
    360	bio_list_add(&clone->deferred_flush_bios, bio);
    361	spin_unlock_irq(&clone->lock);
    362
    363	wake_worker(clone);
    364}
    365
    366/*
    367 * Remap bio to the destination device and submit it.
    368 *
    369 * If the bio triggers a commit, delay it, until after the metadata have been
    370 * committed.
    371 */
    372static void remap_and_issue(struct clone *clone, struct bio *bio)
    373{
    374	remap_to_dest(clone, bio);
    375	issue_bio(clone, bio);
    376}
    377
    378/*
    379 * Issue bios that have been deferred until after their region has finished
    380 * hydrating.
    381 *
    382 * We delegate the bio submission to the worker thread, so this is safe to call
    383 * from interrupt context.
    384 */
    385static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
    386{
    387	struct bio *bio;
    388	unsigned long flags;
    389	struct bio_list flush_bios = BIO_EMPTY_LIST;
    390	struct bio_list normal_bios = BIO_EMPTY_LIST;
    391
    392	if (bio_list_empty(bios))
    393		return;
    394
    395	while ((bio = bio_list_pop(bios))) {
    396		if (bio_triggers_commit(clone, bio))
    397			bio_list_add(&flush_bios, bio);
    398		else
    399			bio_list_add(&normal_bios, bio);
    400	}
    401
    402	spin_lock_irqsave(&clone->lock, flags);
    403	bio_list_merge(&clone->deferred_bios, &normal_bios);
    404	bio_list_merge(&clone->deferred_flush_bios, &flush_bios);
    405	spin_unlock_irqrestore(&clone->lock, flags);
    406
    407	wake_worker(clone);
    408}
    409
    410static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
    411{
    412	unsigned long flags;
    413
    414	/*
    415	 * If the bio has the REQ_FUA flag set we must commit the metadata
    416	 * before signaling its completion.
    417	 *
    418	 * complete_overwrite_bio() is only called by hydration_complete(),
    419	 * after having successfully updated the metadata. This means we don't
    420	 * need to call dm_clone_changed_this_transaction() to check if the
    421	 * metadata has changed and thus we can avoid taking the metadata spin
    422	 * lock.
    423	 */
    424	if (!(bio->bi_opf & REQ_FUA)) {
    425		bio_endio(bio);
    426		return;
    427	}
    428
    429	/*
    430	 * If the metadata mode is RO or FAIL we won't be able to commit the
    431	 * metadata, so we complete the bio with an error.
    432	 */
    433	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
    434		bio_io_error(bio);
    435		return;
    436	}
    437
    438	/*
    439	 * Batch together any bios that trigger commits and then issue a single
    440	 * commit for them in process_deferred_flush_bios().
    441	 */
    442	spin_lock_irqsave(&clone->lock, flags);
    443	bio_list_add(&clone->deferred_flush_completions, bio);
    444	spin_unlock_irqrestore(&clone->lock, flags);
    445
    446	wake_worker(clone);
    447}
    448
    449static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
    450{
    451	bio->bi_iter.bi_sector = sector;
    452	bio->bi_iter.bi_size = to_bytes(len);
    453}
    454
    455static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
    456{
    457	unsigned long rs, nr_regions;
    458
    459	/*
    460	 * If the destination device supports discards, remap and trim the
    461	 * discard bio and pass it down. Otherwise complete the bio
    462	 * immediately.
    463	 */
    464	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
    465		remap_to_dest(clone, bio);
    466		bio_region_range(clone, bio, &rs, &nr_regions);
    467		trim_bio(bio, region_to_sector(clone, rs),
    468			 nr_regions << clone->region_shift);
    469		submit_bio_noacct(bio);
    470	} else
    471		bio_endio(bio);
    472}
    473
    474static void process_discard_bio(struct clone *clone, struct bio *bio)
    475{
    476	unsigned long rs, nr_regions;
    477
    478	bio_region_range(clone, bio, &rs, &nr_regions);
    479	if (!nr_regions) {
    480		bio_endio(bio);
    481		return;
    482	}
    483
    484	if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs ||
    485		    (rs + nr_regions) > clone->nr_regions)) {
    486		DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)",
    487		      clone_device_name(clone), rs, nr_regions,
    488		      clone->nr_regions,
    489		      (unsigned long long)bio->bi_iter.bi_sector,
    490		      bio_sectors(bio));
    491		bio_endio(bio);
    492		return;
    493	}
    494
    495	/*
    496	 * The covered regions are already hydrated so we just need to pass
    497	 * down the discard.
    498	 */
    499	if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) {
    500		complete_discard_bio(clone, bio, true);
    501		return;
    502	}
    503
    504	/*
    505	 * If the metadata mode is RO or FAIL we won't be able to update the
    506	 * metadata for the regions covered by the discard so we just ignore
    507	 * it.
    508	 */
    509	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
    510		bio_endio(bio);
    511		return;
    512	}
    513
    514	/*
    515	 * Defer discard processing.
    516	 */
    517	spin_lock_irq(&clone->lock);
    518	bio_list_add(&clone->deferred_discard_bios, bio);
    519	spin_unlock_irq(&clone->lock);
    520
    521	wake_worker(clone);
    522}
    523
    524/*---------------------------------------------------------------------------*/
    525
    526/*
    527 * dm-clone region hydrations.
    528 */
    529struct dm_clone_region_hydration {
    530	struct clone *clone;
    531	unsigned long region_nr;
    532
    533	struct bio *overwrite_bio;
    534	bio_end_io_t *overwrite_bio_end_io;
    535
    536	struct bio_list deferred_bios;
    537
    538	blk_status_t status;
    539
    540	/* Used by hydration batching */
    541	struct list_head list;
    542
    543	/* Used by hydration hash table */
    544	struct hlist_node h;
    545};
    546
    547/*
    548 * Hydration hash table implementation.
    549 *
    550 * Ideally we would like to use list_bl, which uses bit spin locks and employs
    551 * the least significant bit of the list head to lock the corresponding bucket,
    552 * reducing the memory overhead for the locks. But, currently, list_bl and bit
    553 * spin locks don't support IRQ safe versions. Since we have to take the lock
    554 * in both process and interrupt context, we must fall back to using regular
    555 * spin locks; one per hash table bucket.
    556 */
    557struct hash_table_bucket {
    558	struct hlist_head head;
    559
    560	/* Spinlock protecting the bucket */
    561	spinlock_t lock;
    562};
    563
    564#define bucket_lock_irqsave(bucket, flags) \
    565	spin_lock_irqsave(&(bucket)->lock, flags)
    566
    567#define bucket_unlock_irqrestore(bucket, flags) \
    568	spin_unlock_irqrestore(&(bucket)->lock, flags)
    569
    570#define bucket_lock_irq(bucket) \
    571	spin_lock_irq(&(bucket)->lock)
    572
    573#define bucket_unlock_irq(bucket) \
    574	spin_unlock_irq(&(bucket)->lock)
    575
    576static int hash_table_init(struct clone *clone)
    577{
    578	unsigned int i, sz;
    579	struct hash_table_bucket *bucket;
    580
    581	sz = 1 << HASH_TABLE_BITS;
    582
    583	clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL);
    584	if (!clone->ht)
    585		return -ENOMEM;
    586
    587	for (i = 0; i < sz; i++) {
    588		bucket = clone->ht + i;
    589
    590		INIT_HLIST_HEAD(&bucket->head);
    591		spin_lock_init(&bucket->lock);
    592	}
    593
    594	return 0;
    595}
    596
    597static void hash_table_exit(struct clone *clone)
    598{
    599	kvfree(clone->ht);
    600}
    601
    602static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
    603						       unsigned long region_nr)
    604{
    605	return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)];
    606}
    607
    608/*
    609 * Search hash table for a hydration with hd->region_nr == region_nr
    610 *
    611 * NOTE: Must be called with the bucket lock held
    612 */
    613static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
    614						     unsigned long region_nr)
    615{
    616	struct dm_clone_region_hydration *hd;
    617
    618	hlist_for_each_entry(hd, &bucket->head, h) {
    619		if (hd->region_nr == region_nr)
    620			return hd;
    621	}
    622
    623	return NULL;
    624}
    625
    626/*
    627 * Insert a hydration into the hash table.
    628 *
    629 * NOTE: Must be called with the bucket lock held.
    630 */
    631static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
    632					     struct dm_clone_region_hydration *hd)
    633{
    634	hlist_add_head(&hd->h, &bucket->head);
    635}
    636
    637/*
    638 * This function inserts a hydration into the hash table, unless someone else
    639 * managed to insert a hydration for the same region first. In the latter case
    640 * it returns the existing hydration descriptor for this region.
    641 *
    642 * NOTE: Must be called with the hydration hash table lock held.
    643 */
    644static struct dm_clone_region_hydration *
    645__find_or_insert_region_hydration(struct hash_table_bucket *bucket,
    646				  struct dm_clone_region_hydration *hd)
    647{
    648	struct dm_clone_region_hydration *hd2;
    649
    650	hd2 = __hash_find(bucket, hd->region_nr);
    651	if (hd2)
    652		return hd2;
    653
    654	__insert_region_hydration(bucket, hd);
    655
    656	return hd;
    657}
    658
    659/*---------------------------------------------------------------------------*/
    660
    661/* Allocate a hydration */
    662static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
    663{
    664	struct dm_clone_region_hydration *hd;
    665
    666	/*
    667	 * Allocate a hydration from the hydration mempool.
    668	 * This might block but it can't fail.
    669	 */
    670	hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
    671	hd->clone = clone;
    672
    673	return hd;
    674}
    675
    676static inline void free_hydration(struct dm_clone_region_hydration *hd)
    677{
    678	mempool_free(hd, &hd->clone->hydration_pool);
    679}
    680
    681/* Initialize a hydration */
    682static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
    683{
    684	hd->region_nr = region_nr;
    685	hd->overwrite_bio = NULL;
    686	bio_list_init(&hd->deferred_bios);
    687	hd->status = 0;
    688
    689	INIT_LIST_HEAD(&hd->list);
    690	INIT_HLIST_NODE(&hd->h);
    691}
    692
    693/*---------------------------------------------------------------------------*/
    694
    695/*
    696 * Update dm-clone's metadata after a region has finished hydrating and remove
    697 * hydration from the hash table.
    698 */
    699static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
    700{
    701	int r = 0;
    702	unsigned long flags;
    703	struct hash_table_bucket *bucket;
    704	struct clone *clone = hd->clone;
    705
    706	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
    707		r = -EPERM;
    708
    709	/* Update the metadata */
    710	if (likely(!r) && hd->status == BLK_STS_OK)
    711		r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);
    712
    713	bucket = get_hash_table_bucket(clone, hd->region_nr);
    714
    715	/* Remove hydration from hash table */
    716	bucket_lock_irqsave(bucket, flags);
    717	hlist_del(&hd->h);
    718	bucket_unlock_irqrestore(bucket, flags);
    719
    720	return r;
    721}
    722
    723/*
    724 * Complete a region's hydration:
    725 *
    726 *	1. Update dm-clone's metadata.
    727 *	2. Remove hydration from hash table.
    728 *	3. Complete overwrite bio.
    729 *	4. Issue deferred bios.
    730 *	5. If this was the last hydration, wake up anyone waiting for
    731 *	   hydrations to finish.
    732 */
    733static void hydration_complete(struct dm_clone_region_hydration *hd)
    734{
    735	int r;
    736	blk_status_t status;
    737	struct clone *clone = hd->clone;
    738
    739	r = hydration_update_metadata(hd);
    740
    741	if (hd->status == BLK_STS_OK && likely(!r)) {
    742		if (hd->overwrite_bio)
    743			complete_overwrite_bio(clone, hd->overwrite_bio);
    744
    745		issue_deferred_bios(clone, &hd->deferred_bios);
    746	} else {
    747		status = r ? BLK_STS_IOERR : hd->status;
    748
    749		if (hd->overwrite_bio)
    750			bio_list_add(&hd->deferred_bios, hd->overwrite_bio);
    751
    752		fail_bios(&hd->deferred_bios, status);
    753	}
    754
    755	free_hydration(hd);
    756
    757	if (atomic_dec_and_test(&clone->hydrations_in_flight))
    758		wakeup_hydration_waiters(clone);
    759}
    760
    761static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
    762{
    763	blk_status_t status;
    764
    765	struct dm_clone_region_hydration *tmp, *hd = context;
    766	struct clone *clone = hd->clone;
    767
    768	LIST_HEAD(batched_hydrations);
    769
    770	if (read_err || write_err) {
    771		DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
    772		status = BLK_STS_IOERR;
    773	} else {
    774		status = BLK_STS_OK;
    775	}
    776	list_splice_tail(&hd->list, &batched_hydrations);
    777
    778	hd->status = status;
    779	hydration_complete(hd);
    780
    781	/* Complete batched hydrations */
    782	list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) {
    783		hd->status = status;
    784		hydration_complete(hd);
    785	}
    786
    787	/* Continue background hydration, if there is no I/O in-flight */
    788	if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
    789	    !atomic_read(&clone->ios_in_flight))
    790		wake_worker(clone);
    791}
    792
    793static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions)
    794{
    795	unsigned long region_start, region_end;
    796	sector_t tail_size, region_size, total_size;
    797	struct dm_io_region from, to;
    798	struct clone *clone = hd->clone;
    799
    800	if (WARN_ON(!nr_regions))
    801		return;
    802
    803	region_size = clone->region_size;
    804	region_start = hd->region_nr;
    805	region_end = region_start + nr_regions - 1;
    806
    807	total_size = region_to_sector(clone, nr_regions - 1);
    808
    809	if (region_end == clone->nr_regions - 1) {
    810		/*
    811		 * The last region of the target might be smaller than
    812		 * region_size.
    813		 */
    814		tail_size = clone->ti->len & (region_size - 1);
    815		if (!tail_size)
    816			tail_size = region_size;
    817	} else {
    818		tail_size = region_size;
    819	}
    820
    821	total_size += tail_size;
    822
    823	from.bdev = clone->source_dev->bdev;
    824	from.sector = region_to_sector(clone, region_start);
    825	from.count = total_size;
    826
    827	to.bdev = clone->dest_dev->bdev;
    828	to.sector = from.sector;
    829	to.count = from.count;
    830
    831	/* Issue copy */
    832	atomic_add(nr_regions, &clone->hydrations_in_flight);
    833	dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0,
    834		       hydration_kcopyd_callback, hd);
    835}
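        /*
         * The copy issued above spans (nr_regions - 1) full regions plus the
         * (possibly shorter) tail region, and uses the same sector offset on
         * the source and destination devices, since dm-clone keeps a 1:1
         * linear mapping between them.
         */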
    836
    837static void overwrite_endio(struct bio *bio)
    838{
    839	struct dm_clone_region_hydration *hd = bio->bi_private;
    840
    841	bio->bi_end_io = hd->overwrite_bio_end_io;
    842	hd->status = bio->bi_status;
    843
    844	hydration_complete(hd);
    845}
    846
    847static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
    848{
    849	/*
    850	 * We don't need to save and restore bio->bi_private because device
    851	 * mapper core generates a new bio for us to use, with clean
    852	 * bi_private.
    853	 */
    854	hd->overwrite_bio = bio;
    855	hd->overwrite_bio_end_io = bio->bi_end_io;
    856
    857	bio->bi_end_io = overwrite_endio;
    858	bio->bi_private = hd;
    859
    860	atomic_inc(&hd->clone->hydrations_in_flight);
    861	submit_bio_noacct(bio);
    862}
    863
    864/*
    865 * Hydrate bio's region.
    866 *
    867 * This function starts the hydration of the bio's region and puts the bio in
     868 * the list of deferred bios for this region. If, by the time this function
     869 * is called, the region has already finished hydrating, the bio is instead
     870 * submitted to the destination device.
    871 *
    872 * NOTE: The bio remapping must be performed by the caller.
    873 */
    874static void hydrate_bio_region(struct clone *clone, struct bio *bio)
    875{
    876	unsigned long region_nr;
    877	struct hash_table_bucket *bucket;
    878	struct dm_clone_region_hydration *hd, *hd2;
    879
    880	region_nr = bio_to_region(clone, bio);
    881	bucket = get_hash_table_bucket(clone, region_nr);
    882
    883	bucket_lock_irq(bucket);
    884
    885	hd = __hash_find(bucket, region_nr);
    886	if (hd) {
    887		/* Someone else is hydrating the region */
    888		bio_list_add(&hd->deferred_bios, bio);
    889		bucket_unlock_irq(bucket);
    890		return;
    891	}
    892
    893	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
    894		/* The region has been hydrated */
    895		bucket_unlock_irq(bucket);
    896		issue_bio(clone, bio);
    897		return;
    898	}
    899
    900	/*
    901	 * We must allocate a hydration descriptor and start the hydration of
    902	 * the corresponding region.
    903	 */
    904	bucket_unlock_irq(bucket);
    905
    906	hd = alloc_hydration(clone);
    907	hydration_init(hd, region_nr);
    908
    909	bucket_lock_irq(bucket);
    910
    911	/* Check if the region has been hydrated in the meantime. */
    912	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
    913		bucket_unlock_irq(bucket);
    914		free_hydration(hd);
    915		issue_bio(clone, bio);
    916		return;
    917	}
    918
    919	hd2 = __find_or_insert_region_hydration(bucket, hd);
    920	if (hd2 != hd) {
    921		/* Someone else started the region's hydration. */
    922		bio_list_add(&hd2->deferred_bios, bio);
    923		bucket_unlock_irq(bucket);
    924		free_hydration(hd);
    925		return;
    926	}
    927
    928	/*
    929	 * If the metadata mode is RO or FAIL then there is no point starting a
    930	 * hydration, since we will not be able to update the metadata when the
    931	 * hydration finishes.
    932	 */
    933	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
    934		hlist_del(&hd->h);
    935		bucket_unlock_irq(bucket);
    936		free_hydration(hd);
    937		bio_io_error(bio);
    938		return;
    939	}
    940
    941	/*
    942	 * Start region hydration.
    943	 *
    944	 * If a bio overwrites a region, i.e., its size is equal to the
    945	 * region's size, then we don't need to copy the region from the source
    946	 * to the destination device.
    947	 */
    948	if (is_overwrite_bio(clone, bio)) {
    949		bucket_unlock_irq(bucket);
    950		hydration_overwrite(hd, bio);
    951	} else {
    952		bio_list_add(&hd->deferred_bios, bio);
    953		bucket_unlock_irq(bucket);
    954		hydration_copy(hd, 1);
    955	}
    956}
    957
    958/*---------------------------------------------------------------------------*/
    959
    960/*
    961 * Background hydrations.
    962 */
    963
    964/*
    965 * Batch region hydrations.
    966 *
    967 * To better utilize device bandwidth we batch together the hydration of
    968 * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which
    969 * is good for small, random write performance (because of the overwriting of
    970 * un-hydrated regions) and at the same time issue big copy requests to kcopyd
    971 * to achieve high hydration bandwidth.
    972 */
    973struct batch_info {
    974	struct dm_clone_region_hydration *head;
    975	unsigned int nr_batched_regions;
    976};
    977
    978static void __batch_hydration(struct batch_info *batch,
    979			      struct dm_clone_region_hydration *hd)
    980{
    981	struct clone *clone = hd->clone;
    982	unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);
    983
    984	if (batch->head) {
    985		/* Try to extend the current batch */
    986		if (batch->nr_batched_regions < max_batch_size &&
    987		    (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
    988			list_add_tail(&hd->list, &batch->head->list);
    989			batch->nr_batched_regions++;
    990			hd = NULL;
    991		}
    992
    993		/* Check if we should issue the current batch */
    994		if (batch->nr_batched_regions >= max_batch_size || hd) {
    995			hydration_copy(batch->head, batch->nr_batched_regions);
    996			batch->head = NULL;
    997			batch->nr_batched_regions = 0;
    998		}
    999	}
   1000
   1001	if (!hd)
   1002		return;
   1003
   1004	/* We treat max batch sizes of zero and one equivalently */
   1005	if (max_batch_size <= 1) {
   1006		hydration_copy(hd, 1);
   1007		return;
   1008	}
   1009
   1010	/* Start a new batch */
   1011	BUG_ON(!list_empty(&hd->list));
   1012	batch->head = hd;
   1013	batch->nr_batched_regions = 1;
   1014}
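        /*
         * As an example (hypothetical value): with hydration_batch_size set
         * to 32, up to 32 adjacent unhydrated regions are chained onto the
         * batch head's list and later issued as a single kcopyd copy via
         * hydration_copy().
         */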
   1015
   1016static unsigned long __start_next_hydration(struct clone *clone,
   1017					    unsigned long offset,
   1018					    struct batch_info *batch)
   1019{
   1020	struct hash_table_bucket *bucket;
   1021	struct dm_clone_region_hydration *hd;
   1022	unsigned long nr_regions = clone->nr_regions;
   1023
   1024	hd = alloc_hydration(clone);
   1025
   1026	/* Try to find a region to hydrate. */
   1027	do {
   1028		offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
   1029		if (offset == nr_regions)
   1030			break;
   1031
   1032		bucket = get_hash_table_bucket(clone, offset);
   1033		bucket_lock_irq(bucket);
   1034
   1035		if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
   1036		    !__hash_find(bucket, offset)) {
   1037			hydration_init(hd, offset);
   1038			__insert_region_hydration(bucket, hd);
   1039			bucket_unlock_irq(bucket);
   1040
   1041			/* Batch hydration */
   1042			__batch_hydration(batch, hd);
   1043
   1044			return (offset + 1);
   1045		}
   1046
   1047		bucket_unlock_irq(bucket);
   1048
   1049	} while (++offset < nr_regions);
   1050
   1051	if (hd)
   1052		free_hydration(hd);
   1053
   1054	return offset;
   1055}
   1056
   1057/*
   1058 * This function searches for regions that still reside in the source device
   1059 * and starts their hydration.
   1060 */
   1061static void do_hydration(struct clone *clone)
   1062{
   1063	unsigned int current_volume;
   1064	unsigned long offset, nr_regions = clone->nr_regions;
   1065
   1066	struct batch_info batch = {
   1067		.head = NULL,
   1068		.nr_batched_regions = 0,
   1069	};
   1070
   1071	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
   1072		return;
   1073
   1074	if (dm_clone_is_hydration_done(clone->cmd))
   1075		return;
   1076
   1077	/*
   1078	 * Avoid race with device suspension.
   1079	 */
   1080	atomic_inc(&clone->hydrations_in_flight);
   1081
   1082	/*
   1083	 * Make sure atomic_inc() is ordered before test_bit(), otherwise we
   1084	 * might race with clone_postsuspend() and start a region hydration
   1085	 * after the target has been suspended.
   1086	 *
   1087	 * This is paired with the smp_mb__after_atomic() in
   1088	 * clone_postsuspend().
   1089	 */
   1090	smp_mb__after_atomic();
   1091
   1092	offset = clone->hydration_offset;
   1093	while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
   1094	       !atomic_read(&clone->ios_in_flight) &&
   1095	       test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
   1096	       offset < nr_regions) {
   1097		current_volume = atomic_read(&clone->hydrations_in_flight);
   1098		current_volume += batch.nr_batched_regions;
   1099
   1100		if (current_volume > READ_ONCE(clone->hydration_threshold))
   1101			break;
   1102
   1103		offset = __start_next_hydration(clone, offset, &batch);
   1104	}
   1105
   1106	if (batch.head)
   1107		hydration_copy(batch.head, batch.nr_batched_regions);
   1108
   1109	if (offset >= nr_regions)
   1110		offset = 0;
   1111
   1112	clone->hydration_offset = offset;
   1113
   1114	if (atomic_dec_and_test(&clone->hydrations_in_flight))
   1115		wakeup_hydration_waiters(clone);
   1116}
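        /*
         * Background hydration therefore yields to user I/O: the loop above
         * stops as soon as ios_in_flight becomes non-zero, and is restarted
         * later by do_waker() or by hydration_kcopyd_callback() once the
         * in-flight copies complete.
         */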
   1117
   1118/*---------------------------------------------------------------------------*/
   1119
   1120static bool need_commit_due_to_time(struct clone *clone)
   1121{
   1122	return !time_in_range(jiffies, clone->last_commit_jiffies,
   1123			      clone->last_commit_jiffies + COMMIT_PERIOD);
   1124}
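        /*
         * COMMIT_PERIOD is HZ jiffies, i.e. roughly one second, so time-based
         * commits happen at most about once per second; flush/FUA bios can
         * force a commit sooner via process_deferred_flush_bios().
         */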
   1125
   1126/*
   1127 * A non-zero return indicates read-only or fail mode.
   1128 */
   1129static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
   1130{
   1131	int r = 0;
   1132
   1133	if (dest_dev_flushed)
   1134		*dest_dev_flushed = false;
   1135
   1136	mutex_lock(&clone->commit_lock);
   1137
   1138	if (!dm_clone_changed_this_transaction(clone->cmd))
   1139		goto out;
   1140
   1141	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
   1142		r = -EPERM;
   1143		goto out;
   1144	}
   1145
   1146	r = dm_clone_metadata_pre_commit(clone->cmd);
   1147	if (unlikely(r)) {
   1148		__metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
   1149		goto out;
   1150	}
   1151
   1152	r = blkdev_issue_flush(clone->dest_dev->bdev);
   1153	if (unlikely(r)) {
   1154		__metadata_operation_failed(clone, "flush destination device", r);
   1155		goto out;
   1156	}
   1157
   1158	if (dest_dev_flushed)
   1159		*dest_dev_flushed = true;
   1160
   1161	r = dm_clone_metadata_commit(clone->cmd);
   1162	if (unlikely(r)) {
   1163		__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
   1164		goto out;
   1165	}
   1166
   1167	if (dm_clone_is_hydration_done(clone->cmd))
   1168		dm_table_event(clone->ti->table);
   1169out:
   1170	mutex_unlock(&clone->commit_lock);
   1171
   1172	return r;
   1173}
   1174
   1175static void process_deferred_discards(struct clone *clone)
   1176{
   1177	int r = -EPERM;
   1178	struct bio *bio;
   1179	struct blk_plug plug;
   1180	unsigned long rs, nr_regions;
   1181	struct bio_list discards = BIO_EMPTY_LIST;
   1182
   1183	spin_lock_irq(&clone->lock);
   1184	bio_list_merge(&discards, &clone->deferred_discard_bios);
   1185	bio_list_init(&clone->deferred_discard_bios);
   1186	spin_unlock_irq(&clone->lock);
   1187
   1188	if (bio_list_empty(&discards))
   1189		return;
   1190
   1191	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
   1192		goto out;
   1193
   1194	/* Update the metadata */
   1195	bio_list_for_each(bio, &discards) {
   1196		bio_region_range(clone, bio, &rs, &nr_regions);
   1197		/*
   1198		 * A discard request might cover regions that have been already
   1199		 * hydrated. There is no need to update the metadata for these
   1200		 * regions.
   1201		 */
   1202		r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions);
   1203		if (unlikely(r))
   1204			break;
   1205	}
   1206out:
   1207	blk_start_plug(&plug);
   1208	while ((bio = bio_list_pop(&discards)))
   1209		complete_discard_bio(clone, bio, r == 0);
   1210	blk_finish_plug(&plug);
   1211}
   1212
   1213static void process_deferred_bios(struct clone *clone)
   1214{
   1215	struct bio_list bios = BIO_EMPTY_LIST;
   1216
   1217	spin_lock_irq(&clone->lock);
   1218	bio_list_merge(&bios, &clone->deferred_bios);
   1219	bio_list_init(&clone->deferred_bios);
   1220	spin_unlock_irq(&clone->lock);
   1221
   1222	if (bio_list_empty(&bios))
   1223		return;
   1224
   1225	submit_bios(&bios);
   1226}
   1227
   1228static void process_deferred_flush_bios(struct clone *clone)
   1229{
   1230	struct bio *bio;
   1231	bool dest_dev_flushed;
   1232	struct bio_list bios = BIO_EMPTY_LIST;
   1233	struct bio_list bio_completions = BIO_EMPTY_LIST;
   1234
   1235	/*
   1236	 * If there are any deferred flush bios, we must commit the metadata
   1237	 * before issuing them or signaling their completion.
   1238	 */
   1239	spin_lock_irq(&clone->lock);
   1240	bio_list_merge(&bios, &clone->deferred_flush_bios);
   1241	bio_list_init(&clone->deferred_flush_bios);
   1242
   1243	bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
   1244	bio_list_init(&clone->deferred_flush_completions);
   1245	spin_unlock_irq(&clone->lock);
   1246
   1247	if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
   1248	    !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
   1249		return;
   1250
   1251	if (commit_metadata(clone, &dest_dev_flushed)) {
   1252		bio_list_merge(&bios, &bio_completions);
   1253
   1254		while ((bio = bio_list_pop(&bios)))
   1255			bio_io_error(bio);
   1256
   1257		return;
   1258	}
   1259
   1260	clone->last_commit_jiffies = jiffies;
   1261
   1262	while ((bio = bio_list_pop(&bio_completions)))
   1263		bio_endio(bio);
   1264
   1265	while ((bio = bio_list_pop(&bios))) {
   1266		if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
   1267			/* We just flushed the destination device as part of
   1268			 * the metadata commit, so there is no reason to send
   1269			 * another flush.
   1270			 */
   1271			bio_endio(bio);
   1272		} else {
   1273			submit_bio_noacct(bio);
   1274		}
   1275	}
   1276}
   1277
   1278static void do_worker(struct work_struct *work)
   1279{
   1280	struct clone *clone = container_of(work, typeof(*clone), worker);
   1281
   1282	process_deferred_bios(clone);
   1283	process_deferred_discards(clone);
   1284
   1285	/*
   1286	 * process_deferred_flush_bios():
   1287	 *
   1288	 *   - Commit metadata
   1289	 *
   1290	 *   - Process deferred REQ_FUA completions
   1291	 *
   1292	 *   - Process deferred REQ_PREFLUSH bios
   1293	 */
   1294	process_deferred_flush_bios(clone);
   1295
   1296	/* Background hydration */
   1297	do_hydration(clone);
   1298}
   1299
   1300/*
   1301 * Commit periodically so that not too much unwritten data builds up.
   1302 *
   1303 * Also, restart background hydration, if it has been stopped by in-flight I/O.
   1304 */
   1305static void do_waker(struct work_struct *work)
   1306{
   1307	struct clone *clone = container_of(to_delayed_work(work), struct clone, waker);
   1308
   1309	wake_worker(clone);
   1310	queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD);
   1311}
   1312
   1313/*---------------------------------------------------------------------------*/
   1314
   1315/*
   1316 * Target methods
   1317 */
   1318static int clone_map(struct dm_target *ti, struct bio *bio)
   1319{
   1320	struct clone *clone = ti->private;
   1321	unsigned long region_nr;
   1322
   1323	atomic_inc(&clone->ios_in_flight);
   1324
   1325	if (unlikely(get_clone_mode(clone) == CM_FAIL))
   1326		return DM_MAPIO_KILL;
   1327
   1328	/*
   1329	 * REQ_PREFLUSH bios carry no data:
   1330	 *
   1331	 * - Commit metadata, if changed
   1332	 *
   1333	 * - Pass down to destination device
   1334	 */
   1335	if (bio->bi_opf & REQ_PREFLUSH) {
   1336		remap_and_issue(clone, bio);
   1337		return DM_MAPIO_SUBMITTED;
   1338	}
   1339
   1340	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
   1341
   1342	/*
   1343	 * dm-clone interprets discards and performs a fast hydration of the
   1344	 * discarded regions, i.e., we skip the copy from the source device and
   1345	 * just mark the regions as hydrated.
   1346	 */
   1347	if (bio_op(bio) == REQ_OP_DISCARD) {
   1348		process_discard_bio(clone, bio);
   1349		return DM_MAPIO_SUBMITTED;
   1350	}
   1351
   1352	/*
   1353	 * If the bio's region is hydrated, redirect it to the destination
   1354	 * device.
   1355	 *
   1356	 * If the region is not hydrated and the bio is a READ, redirect it to
   1357	 * the source device.
   1358	 *
   1359	 * Else, defer WRITE bio until after its region has been hydrated and
   1360	 * start the region's hydration immediately.
   1361	 */
   1362	region_nr = bio_to_region(clone, bio);
   1363	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
   1364		remap_and_issue(clone, bio);
   1365		return DM_MAPIO_SUBMITTED;
   1366	} else if (bio_data_dir(bio) == READ) {
   1367		remap_to_source(clone, bio);
   1368		return DM_MAPIO_REMAPPED;
   1369	}
   1370
   1371	remap_to_dest(clone, bio);
   1372	hydrate_bio_region(clone, bio);
   1373
   1374	return DM_MAPIO_SUBMITTED;
   1375}
   1376
   1377static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error)
   1378{
   1379	struct clone *clone = ti->private;
   1380
   1381	atomic_dec(&clone->ios_in_flight);
   1382
   1383	return DM_ENDIO_DONE;
   1384}
   1385
   1386static void emit_flags(struct clone *clone, char *result, unsigned int maxlen,
   1387		       ssize_t *sz_ptr)
   1388{
   1389	ssize_t sz = *sz_ptr;
   1390	unsigned int count;
   1391
   1392	count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
   1393	count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
   1394
   1395	DMEMIT("%u ", count);
   1396
   1397	if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
   1398		DMEMIT("no_hydration ");
   1399
   1400	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
   1401		DMEMIT("no_discard_passdown ");
   1402
   1403	*sz_ptr = sz;
   1404}
   1405
   1406static void emit_core_args(struct clone *clone, char *result,
   1407			   unsigned int maxlen, ssize_t *sz_ptr)
   1408{
   1409	ssize_t sz = *sz_ptr;
   1410	unsigned int count = 4;
   1411
   1412	DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count,
   1413	       READ_ONCE(clone->hydration_threshold),
   1414	       READ_ONCE(clone->hydration_batch_size));
   1415
   1416	*sz_ptr = sz;
   1417}
   1418
   1419/*
   1420 * Status format:
   1421 *
   1422 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
   1423 * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions>
   1424 * <#features> <features>* <#core args> <core args>* <clone metadata mode>
   1425 */
   1426static void clone_status(struct dm_target *ti, status_type_t type,
   1427			 unsigned int status_flags, char *result,
   1428			 unsigned int maxlen)
   1429{
   1430	int r;
   1431	unsigned int i;
   1432	ssize_t sz = 0;
   1433	dm_block_t nr_free_metadata_blocks = 0;
   1434	dm_block_t nr_metadata_blocks = 0;
   1435	char buf[BDEVNAME_SIZE];
   1436	struct clone *clone = ti->private;
   1437
   1438	switch (type) {
   1439	case STATUSTYPE_INFO:
   1440		if (get_clone_mode(clone) == CM_FAIL) {
   1441			DMEMIT("Fail");
   1442			break;
   1443		}
   1444
   1445		/* Commit to ensure statistics aren't out-of-date */
   1446		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
   1447			(void) commit_metadata(clone, NULL);
   1448
   1449		r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
   1450
   1451		if (r) {
   1452			DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
   1453			      clone_device_name(clone), r);
   1454			goto error;
   1455		}
   1456
   1457		r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);
   1458
   1459		if (r) {
   1460			DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
   1461			      clone_device_name(clone), r);
   1462			goto error;
   1463		}
   1464
   1465		DMEMIT("%u %llu/%llu %llu %u/%lu %u ",
   1466		       DM_CLONE_METADATA_BLOCK_SIZE,
   1467		       (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
   1468		       (unsigned long long)nr_metadata_blocks,
   1469		       (unsigned long long)clone->region_size,
   1470		       dm_clone_nr_of_hydrated_regions(clone->cmd),
   1471		       clone->nr_regions,
   1472		       atomic_read(&clone->hydrations_in_flight));
   1473
   1474		emit_flags(clone, result, maxlen, &sz);
   1475		emit_core_args(clone, result, maxlen, &sz);
   1476
   1477		switch (get_clone_mode(clone)) {
   1478		case CM_WRITE:
   1479			DMEMIT("rw");
   1480			break;
   1481		case CM_READ_ONLY:
   1482			DMEMIT("ro");
   1483			break;
   1484		case CM_FAIL:
   1485			DMEMIT("Fail");
   1486		}
   1487
   1488		break;
   1489
   1490	case STATUSTYPE_TABLE:
   1491		format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
   1492		DMEMIT("%s ", buf);
   1493
   1494		format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
   1495		DMEMIT("%s ", buf);
   1496
   1497		format_dev_t(buf, clone->source_dev->bdev->bd_dev);
   1498		DMEMIT("%s", buf);
   1499
   1500		for (i = 0; i < clone->nr_ctr_args; i++)
   1501			DMEMIT(" %s", clone->ctr_args[i]);
   1502		break;
   1503
   1504	case STATUSTYPE_IMA:
   1505		*result = '\0';
   1506		break;
   1507	}
   1508
   1509	return;
   1510
   1511error:
   1512	DMEMIT("Error");
   1513}
   1514
   1515static sector_t get_dev_size(struct dm_dev *dev)
   1516{
   1517	return bdev_nr_sectors(dev->bdev);
   1518}
   1519
   1520/*---------------------------------------------------------------------------*/
   1521
   1522/*
   1523 * Construct a clone device mapping:
   1524 *
   1525 * clone <metadata dev> <destination dev> <source dev> <region size>
   1526 *	[<#feature args> [<feature arg>]* [<#core args> [key value]*]]
   1527 *
   1528 * metadata dev: Fast device holding the persistent metadata
   1529 * destination dev: The destination device, which will become a clone of the
   1530 *                  source device
   1531 * source dev: The read-only source device that gets cloned
   1532 * region size: dm-clone unit size in sectors
   1533 *
   1534 * #feature args: Number of feature arguments passed
   1535 * feature args: E.g. no_hydration, no_discard_passdown
   1536 *
   1537 * #core arguments: An even number of core arguments
   1538 * core arguments: Key/value pairs for tuning the core
   1539 *		   E.g. 'hydration_threshold 256'
   1540 */
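        /*
         * A hypothetical table line for the above (device names and sizes are
         * illustrative):
         *
         *   0 1048576000 clone /dev/nvme0n1p1 /dev/sdb /dev/sda 8 1 no_hydration
         *
         * i.e. clone the first 500GB of /dev/sda onto /dev/sdb using 4KB
         * regions, with the persistent metadata on /dev/nvme0n1p1 and
         * background hydration disabled.
         */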
   1541static int parse_feature_args(struct dm_arg_set *as, struct clone *clone)
   1542{
   1543	int r;
   1544	unsigned int argc;
   1545	const char *arg_name;
   1546	struct dm_target *ti = clone->ti;
   1547
   1548	const struct dm_arg args = {
   1549		.min = 0,
   1550		.max = 2,
   1551		.error = "Invalid number of feature arguments"
   1552	};
   1553
   1554	/* No feature arguments supplied */
   1555	if (!as->argc)
   1556		return 0;
   1557
   1558	r = dm_read_arg_group(&args, as, &argc, &ti->error);
   1559	if (r)
   1560		return r;
   1561
   1562	while (argc) {
   1563		arg_name = dm_shift_arg(as);
   1564		argc--;
   1565
   1566		if (!strcasecmp(arg_name, "no_hydration")) {
   1567			__clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
   1568		} else if (!strcasecmp(arg_name, "no_discard_passdown")) {
   1569			__clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
   1570		} else {
   1571			ti->error = "Invalid feature argument";
   1572			return -EINVAL;
   1573		}
   1574	}
   1575
   1576	return 0;
   1577}
   1578
   1579static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
   1580{
   1581	int r;
   1582	unsigned int argc;
   1583	unsigned int value;
   1584	const char *arg_name;
   1585	struct dm_target *ti = clone->ti;
   1586
   1587	const struct dm_arg args = {
   1588		.min = 0,
   1589		.max = 4,
   1590		.error = "Invalid number of core arguments"
   1591	};
   1592
   1593	/* Initialize core arguments */
   1594	clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
   1595	clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;
   1596
   1597	/* No core arguments supplied */
   1598	if (!as->argc)
   1599		return 0;
   1600
   1601	r = dm_read_arg_group(&args, as, &argc, &ti->error);
   1602	if (r)
   1603		return r;
   1604
   1605	if (argc & 1) {
   1606		ti->error = "Number of core arguments must be even";
   1607		return -EINVAL;
   1608	}
   1609
   1610	while (argc) {
   1611		arg_name = dm_shift_arg(as);
   1612		argc -= 2;
   1613
   1614		if (!strcasecmp(arg_name, "hydration_threshold")) {
   1615			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
   1616				ti->error = "Invalid value for argument `hydration_threshold'";
   1617				return -EINVAL;
   1618			}
   1619			clone->hydration_threshold = value;
   1620		} else if (!strcasecmp(arg_name, "hydration_batch_size")) {
   1621			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
   1622				ti->error = "Invalid value for argument `hydration_batch_size'";
   1623				return -EINVAL;
   1624			}
   1625			clone->hydration_batch_size = value;
   1626		} else {
   1627			ti->error = "Invalid core argument";
   1628			return -EINVAL;
   1629		}
   1630	}
   1631
   1632	return 0;
   1633}
   1634
   1635static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error)
   1636{
   1637	int r;
   1638	unsigned int region_size;
   1639	struct dm_arg arg;
   1640
   1641	arg.min = MIN_REGION_SIZE;
   1642	arg.max = MAX_REGION_SIZE;
   1643	arg.error = "Invalid region size";
   1644
   1645	r = dm_read_arg(&arg, as, &region_size, error);
   1646	if (r)
   1647		return r;
   1648
   1649	/* Check region size is a power of 2 */
   1650	if (!is_power_of_2(region_size)) {
   1651		*error = "Region size is not a power of 2";
   1652		return -EINVAL;
   1653	}
   1654
   1655	/* Validate the region size against the device logical block size */
   1656	if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) ||
   1657	    region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) {
   1658		*error = "Region size is not a multiple of device logical block size";
   1659		return -EINVAL;
   1660	}
   1661
   1662	clone->region_size = region_size;
   1663
   1664	return 0;
   1665}
   1666
   1667static int validate_nr_regions(unsigned long n, char **error)
   1668{
   1669	/*
   1670	 * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us
   1671	 * further to 2^31 regions.
   1672	 */
   1673	if (n > (1UL << 31)) {
   1674		*error = "Too many regions. Consider increasing the region size";
   1675		return -EINVAL;
   1676	}
   1677
   1678	return 0;
   1679}
   1680
   1681static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error)
   1682{
   1683	int r;
   1684	sector_t metadata_dev_size;
   1685
   1686	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
   1687			  &clone->metadata_dev);
   1688	if (r) {
   1689		*error = "Error opening metadata device";
   1690		return r;
   1691	}
   1692
   1693	metadata_dev_size = get_dev_size(clone->metadata_dev);
   1694	if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
   1695		DMWARN("Metadata device %pg is larger than %u sectors: excess space will not be used.",
   1696		       clone->metadata_dev->bdev, DM_CLONE_METADATA_MAX_SECTORS);
   1697
   1698	return 0;
   1699}
   1700
   1701static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error)
   1702{
   1703	int r;
   1704	sector_t dest_dev_size;
   1705
   1706	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
   1707			  &clone->dest_dev);
   1708	if (r) {
   1709		*error = "Error opening destination device";
   1710		return r;
   1711	}
   1712
   1713	dest_dev_size = get_dev_size(clone->dest_dev);
   1714	if (dest_dev_size < clone->ti->len) {
   1715		dm_put_device(clone->ti, clone->dest_dev);
   1716		*error = "Device size larger than destination device";
   1717		return -EINVAL;
   1718	}
   1719
   1720	return 0;
   1721}
   1722
   1723static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error)
   1724{
   1725	int r;
   1726	sector_t source_dev_size;
   1727
   1728	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
   1729			  &clone->source_dev);
   1730	if (r) {
   1731		*error = "Error opening source device";
   1732		return r;
   1733	}
   1734
   1735	source_dev_size = get_dev_size(clone->source_dev);
   1736	if (source_dev_size < clone->ti->len) {
   1737		dm_put_device(clone->ti, clone->source_dev);
   1738		*error = "Device size larger than source device";
   1739		return -EINVAL;
   1740	}
   1741
   1742	return 0;
   1743}
   1744
   1745static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error)
   1746{
   1747	unsigned int i;
   1748	const char **copy;
   1749
   1750	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
   1751	if (!copy)
   1752		goto error;
   1753
   1754	for (i = 0; i < argc; i++) {
   1755		copy[i] = kstrdup(argv[i], GFP_KERNEL);
   1756
   1757		if (!copy[i]) {
   1758			while (i--)
   1759				kfree(copy[i]);
   1760			kfree(copy);
   1761			goto error;
   1762		}
   1763	}
   1764
   1765	clone->nr_ctr_args = argc;
   1766	clone->ctr_args = copy;
   1767	return 0;
   1768
   1769error:
   1770	*error = "Failed to allocate memory for table line";
   1771	return -ENOMEM;
   1772}
   1773
   1774static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
   1775{
   1776	int r;
   1777	sector_t nr_regions;
   1778	struct clone *clone;
   1779	struct dm_arg_set as;
   1780
   1781	if (argc < 4) {
   1782		ti->error = "Invalid number of arguments";
   1783		return -EINVAL;
   1784	}
   1785
   1786	as.argc = argc;
   1787	as.argv = argv;
   1788
   1789	clone = kzalloc(sizeof(*clone), GFP_KERNEL);
   1790	if (!clone) {
   1791		ti->error = "Failed to allocate clone structure";
   1792		return -ENOMEM;
   1793	}
   1794
   1795	clone->ti = ti;
   1796
   1797	/* Initialize dm-clone flags */
   1798	__set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
   1799	__set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
   1800	__set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
   1801
   1802	r = parse_metadata_dev(clone, &as, &ti->error);
   1803	if (r)
   1804		goto out_with_clone;
   1805
   1806	r = parse_dest_dev(clone, &as, &ti->error);
   1807	if (r)
   1808		goto out_with_meta_dev;
   1809
   1810	r = parse_source_dev(clone, &as, &ti->error);
   1811	if (r)
   1812		goto out_with_dest_dev;
   1813
   1814	r = parse_region_size(clone, &as, &ti->error);
   1815	if (r)
   1816		goto out_with_source_dev;
   1817
   1818	clone->region_shift = __ffs(clone->region_size);
   1819	nr_regions = dm_sector_div_up(ti->len, clone->region_size);
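        	/*
        	 * Illustrative numbers: for a 10 GiB target (ti->len == 20971520
        	 * sectors) and an 8 sector (4 KiB) region size, region_shift is 3
        	 * and nr_regions is 2621440. region_size must already be a power of
        	 * two here (presumably enforced by parse_region_size() above) for
        	 * __ffs() to yield a usable shift.
        	 */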
   1820
   1821	/* Check for overflow */
   1822	if (nr_regions != (unsigned long)nr_regions) {
   1823		ti->error = "Too many regions. Consider increasing the region size";
   1824		r = -EOVERFLOW;
   1825		goto out_with_source_dev;
   1826	}
   1827
   1828	clone->nr_regions = nr_regions;
   1829
   1830	r = validate_nr_regions(clone->nr_regions, &ti->error);
   1831	if (r)
   1832		goto out_with_source_dev;
   1833
   1834	r = dm_set_target_max_io_len(ti, clone->region_size);
   1835	if (r) {
   1836		ti->error = "Failed to set max io len";
   1837		goto out_with_source_dev;
   1838	}
   1839
   1840	r = parse_feature_args(&as, clone);
   1841	if (r)
   1842		goto out_with_source_dev;
   1843
   1844	r = parse_core_args(&as, clone);
   1845	if (r)
   1846		goto out_with_source_dev;
   1847
   1848	/* Load metadata */
   1849	clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
   1850					    clone->region_size);
   1851	if (IS_ERR(clone->cmd)) {
   1852		ti->error = "Failed to load metadata";
   1853		r = PTR_ERR(clone->cmd);
   1854		goto out_with_source_dev;
   1855	}
   1856
   1857	__set_clone_mode(clone, CM_WRITE);
   1858
   1859	if (get_clone_mode(clone) != CM_WRITE) {
   1860		ti->error = "Unable to get write access to metadata, please check/repair metadata";
   1861		r = -EPERM;
   1862		goto out_with_metadata;
   1863	}
   1864
   1865	clone->last_commit_jiffies = jiffies;
   1866
   1867	/* Allocate hydration hash table */
   1868	r = hash_table_init(clone);
   1869	if (r) {
   1870		ti->error = "Failed to allocate hydration hash table";
   1871		goto out_with_metadata;
   1872	}
   1873
   1874	atomic_set(&clone->ios_in_flight, 0);
   1875	init_waitqueue_head(&clone->hydration_stopped);
   1876	spin_lock_init(&clone->lock);
   1877	bio_list_init(&clone->deferred_bios);
   1878	bio_list_init(&clone->deferred_discard_bios);
   1879	bio_list_init(&clone->deferred_flush_bios);
   1880	bio_list_init(&clone->deferred_flush_completions);
   1881	clone->hydration_offset = 0;
   1882	atomic_set(&clone->hydrations_in_flight, 0);
   1883
   1884	clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
   1885	if (!clone->wq) {
   1886		ti->error = "Failed to allocate workqueue";
   1887		r = -ENOMEM;
   1888		goto out_with_ht;
   1889	}
   1890
   1891	INIT_WORK(&clone->worker, do_worker);
   1892	INIT_DELAYED_WORK(&clone->waker, do_waker);
   1893
   1894	clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
   1895	if (IS_ERR(clone->kcopyd_client)) {
   1896		r = PTR_ERR(clone->kcopyd_client);
   1897		goto out_with_wq;
   1898	}
   1899
   1900	r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
   1901				   _hydration_cache);
   1902	if (r) {
   1903		ti->error = "Failed to create dm_clone_region_hydration memory pool";
   1904		goto out_with_kcopyd;
   1905	}
   1906
   1907	/* Save a copy of the table line */
   1908	r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
   1909	if (r)
   1910		goto out_with_mempool;
   1911
   1912	mutex_init(&clone->commit_lock);
   1913
   1914	/* Enable flushes */
   1915	ti->num_flush_bios = 1;
   1916	ti->flush_supported = true;
   1917
   1918	/* Enable discards */
   1919	ti->discards_supported = true;
   1920	ti->num_discard_bios = 1;
   1921
   1922	ti->private = clone;
   1923
   1924	return 0;
   1925
   1926out_with_mempool:
   1927	mempool_exit(&clone->hydration_pool);
   1928out_with_kcopyd:
   1929	dm_kcopyd_client_destroy(clone->kcopyd_client);
   1930out_with_wq:
   1931	destroy_workqueue(clone->wq);
   1932out_with_ht:
   1933	hash_table_exit(clone);
   1934out_with_metadata:
   1935	dm_clone_metadata_close(clone->cmd);
   1936out_with_source_dev:
   1937	dm_put_device(ti, clone->source_dev);
   1938out_with_dest_dev:
   1939	dm_put_device(ti, clone->dest_dev);
   1940out_with_meta_dev:
   1941	dm_put_device(ti, clone->metadata_dev);
   1942out_with_clone:
   1943	kfree(clone);
   1944
   1945	return r;
   1946}
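        /*
         * Per the dm-clone documentation
         * (Documentation/admin-guide/device-mapper/dm-clone.rst), a table line
         * matching this constructor looks roughly like the following
         * (illustrative device paths and sizes):
         *
         *   dmsetup create clone --table \
         *     "0 20971520 clone /dev/sdb /dev/sdc /dev/sda 8 1 no_hydration"
         *
         * i.e. <metadata dev> <destination dev> <source dev> <region size>,
         * optionally followed by the feature and core arguments parsed above.
         */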
   1947
   1948static void clone_dtr(struct dm_target *ti)
   1949{
   1950	unsigned int i;
   1951	struct clone *clone = ti->private;
   1952
   1953	mutex_destroy(&clone->commit_lock);
   1954
   1955	for (i = 0; i < clone->nr_ctr_args; i++)
   1956		kfree(clone->ctr_args[i]);
   1957	kfree(clone->ctr_args);
   1958
   1959	mempool_exit(&clone->hydration_pool);
   1960	dm_kcopyd_client_destroy(clone->kcopyd_client);
   1961	destroy_workqueue(clone->wq);
   1962	hash_table_exit(clone);
   1963	dm_clone_metadata_close(clone->cmd);
   1964	dm_put_device(ti, clone->source_dev);
   1965	dm_put_device(ti, clone->dest_dev);
   1966	dm_put_device(ti, clone->metadata_dev);
   1967
   1968	kfree(clone);
   1969}
   1970
   1971/*---------------------------------------------------------------------------*/
   1972
   1973static void clone_postsuspend(struct dm_target *ti)
   1974{
   1975	struct clone *clone = ti->private;
   1976
   1977	/*
   1978	 * To successfully suspend the device:
   1979	 *
   1980	 *	- We cancel the delayed work for periodic commits and wait for
   1981	 *	  it to finish.
   1982	 *
   1983	 *	- We stop the background hydration, i.e. we prevent new region
   1984	 *	  hydrations from starting.
   1985	 *
   1986	 *	- We wait for any in-flight hydrations to finish.
   1987	 *
   1988	 *	- We flush the workqueue.
   1989	 *
   1990	 *	- We commit the metadata.
   1991	 */
   1992	cancel_delayed_work_sync(&clone->waker);
   1993
   1994	set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
   1995
   1996	/*
   1997	 * Make sure set_bit() is ordered before atomic_read(), otherwise we
   1998	 * might race with do_hydration() and miss some started region
   1999	 * hydrations.
   2000	 *
   2001	 * This is paired with smp_mb__after_atomic() in do_hydration().
   2002	 */
   2003	smp_mb__after_atomic();
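        	/*
        	 * Concretely: without the pairing, this CPU could read
        	 * hydrations_in_flight as zero before a CPU in do_hydration() has
        	 * observed DM_CLONE_HYDRATION_SUSPENDED and bumped the counter, so
        	 * the wait below would complete while a new hydration is still
        	 * being started.
        	 */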
   2004
   2005	wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
   2006	flush_workqueue(clone->wq);
   2007
   2008	(void) commit_metadata(clone, NULL);
   2009}
   2010
   2011static void clone_resume(struct dm_target *ti)
   2012{
   2013	struct clone *clone = ti->private;
   2014
   2015	clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
   2016	do_waker(&clone->waker.work);
   2017}
   2018
   2019/*
    2020 * If discard_passdown was enabled, verify that the destination device supports
   2021 * discards. Disable discard_passdown if not.
   2022 */
   2023static void disable_passdown_if_not_supported(struct clone *clone)
   2024{
   2025	struct block_device *dest_dev = clone->dest_dev->bdev;
   2026	struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
   2027	const char *reason = NULL;
   2028
   2029	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
   2030		return;
   2031
   2032	if (!bdev_max_discard_sectors(dest_dev))
   2033		reason = "discard unsupported";
   2034	else if (dest_limits->max_discard_sectors < clone->region_size)
   2035		reason = "max discard sectors smaller than a region";
   2036
   2037	if (reason) {
    2038		DMWARN("Destination device (%pg) %s: Disabling discard passdown.",
   2039		       dest_dev, reason);
   2040		clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
   2041	}
   2042}
   2043
   2044static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
   2045{
   2046	struct block_device *dest_bdev = clone->dest_dev->bdev;
   2047	struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;
   2048
   2049	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
   2050		/* No passdown is done so we set our own virtual limits */
   2051		limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
   2052		limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
   2053		return;
   2054	}
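        	/*
        	 * For example, with an 8 sector region the virtual limits above work
        	 * out to a discard_granularity of 4096 bytes and a
        	 * max_discard_sectors of round_down(UINT_MAX >> SECTOR_SHIFT, 8) ==
        	 * round_down(8388607, 8) == 8388600 sectors.
        	 */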
   2055
   2056	/*
   2057	 * clone_iterate_devices() is stacking both the source and destination
   2058	 * device limits but discards aren't passed to the source device, so
   2059	 * inherit destination's limits.
   2060	 */
   2061	limits->max_discard_sectors = dest_limits->max_discard_sectors;
   2062	limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
   2063	limits->discard_granularity = dest_limits->discard_granularity;
   2064	limits->discard_alignment = dest_limits->discard_alignment;
   2065	limits->discard_misaligned = dest_limits->discard_misaligned;
   2066	limits->max_discard_segments = dest_limits->max_discard_segments;
   2067}
   2068
   2069static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
   2070{
   2071	struct clone *clone = ti->private;
   2072	u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
   2073
   2074	/*
   2075	 * If the system-determined stacked limits are compatible with
   2076	 * dm-clone's region size (io_opt is a factor) do not override them.
   2077	 */
   2078	if (io_opt_sectors < clone->region_size ||
   2079	    do_div(io_opt_sectors, clone->region_size)) {
   2080		blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
   2081		blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
   2082	}
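        	/*
        	 * For example, with an 8 sector region a stacked io_opt of 512
        	 * sectors (256 KiB) is left untouched, whereas an io_opt of zero,
        	 * one smaller than the region size, or one that is not a multiple of
        	 * it is overridden with the region size.
        	 */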
   2083
   2084	disable_passdown_if_not_supported(clone);
   2085	set_discard_limits(clone, limits);
   2086}
   2087
   2088static int clone_iterate_devices(struct dm_target *ti,
   2089				 iterate_devices_callout_fn fn, void *data)
   2090{
   2091	int ret;
   2092	struct clone *clone = ti->private;
   2093	struct dm_dev *dest_dev = clone->dest_dev;
   2094	struct dm_dev *source_dev = clone->source_dev;
   2095
   2096	ret = fn(ti, source_dev, 0, ti->len, data);
   2097	if (!ret)
   2098		ret = fn(ti, dest_dev, 0, ti->len, data);
   2099	return ret;
   2100}
   2101
   2102/*
   2103 * dm-clone message functions.
   2104 */
   2105static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
   2106{
   2107	WRITE_ONCE(clone->hydration_threshold, nr_regions);
   2108
   2109	/*
   2110	 * If user space sets hydration_threshold to zero then the hydration
   2111	 * will stop. If at a later time the hydration_threshold is increased
   2112	 * we must restart the hydration process by waking up the worker.
   2113	 */
   2114	wake_worker(clone);
   2115}
   2116
   2117static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
   2118{
   2119	WRITE_ONCE(clone->hydration_batch_size, nr_regions);
   2120}
   2121
   2122static void enable_hydration(struct clone *clone)
   2123{
   2124	if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
   2125		wake_worker(clone);
   2126}
   2127
   2128static void disable_hydration(struct clone *clone)
   2129{
   2130	clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
   2131}
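        /*
         * enable_hydration() uses test_and_set_bit() so the worker is only woken
         * when hydration was actually disabled beforehand; disable_hydration()
         * needs no wake-up, as the hydration code presumably checks the flag
         * before starting work on new regions.
         */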
   2132
   2133static int clone_message(struct dm_target *ti, unsigned int argc, char **argv,
   2134			 char *result, unsigned int maxlen)
   2135{
   2136	struct clone *clone = ti->private;
   2137	unsigned int value;
   2138
   2139	if (!argc)
   2140		return -EINVAL;
   2141
   2142	if (!strcasecmp(argv[0], "enable_hydration")) {
   2143		enable_hydration(clone);
   2144		return 0;
   2145	}
   2146
   2147	if (!strcasecmp(argv[0], "disable_hydration")) {
   2148		disable_hydration(clone);
   2149		return 0;
   2150	}
   2151
   2152	if (argc != 2)
   2153		return -EINVAL;
   2154
   2155	if (!strcasecmp(argv[0], "hydration_threshold")) {
   2156		if (kstrtouint(argv[1], 10, &value))
   2157			return -EINVAL;
   2158
   2159		set_hydration_threshold(clone, value);
   2160
   2161		return 0;
   2162	}
   2163
   2164	if (!strcasecmp(argv[0], "hydration_batch_size")) {
   2165		if (kstrtouint(argv[1], 10, &value))
   2166			return -EINVAL;
   2167
   2168		set_hydration_batch_size(clone, value);
   2169
   2170		return 0;
   2171	}
   2172
   2173	DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]);
   2174	return -EINVAL;
   2175}
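        /*
         * These messages are intended to be sent with dmsetup, e.g.
         * (illustrative device name):
         *
         *   dmsetup message clone 0 disable_hydration
         *   dmsetup message clone 0 hydration_threshold 128
         *   dmsetup message clone 0 hydration_batch_size 64
         *   dmsetup message clone 0 enable_hydration
         */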
   2176
   2177static struct target_type clone_target = {
   2178	.name = "clone",
   2179	.version = {1, 0, 0},
   2180	.module = THIS_MODULE,
   2181	.ctr = clone_ctr,
   2182	.dtr =  clone_dtr,
   2183	.map = clone_map,
   2184	.end_io = clone_endio,
   2185	.postsuspend = clone_postsuspend,
   2186	.resume = clone_resume,
   2187	.status = clone_status,
   2188	.message = clone_message,
   2189	.io_hints = clone_io_hints,
   2190	.iterate_devices = clone_iterate_devices,
   2191};
   2192
   2193/*---------------------------------------------------------------------------*/
   2194
   2195/* Module functions */
   2196static int __init dm_clone_init(void)
   2197{
   2198	int r;
   2199
   2200	_hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0);
   2201	if (!_hydration_cache)
   2202		return -ENOMEM;
   2203
   2204	r = dm_register_target(&clone_target);
   2205	if (r < 0) {
    2206		DMERR("Failed to register clone target");
        		kmem_cache_destroy(_hydration_cache);
    2207		return r;
   2208	}
   2209
   2210	return 0;
   2211}
   2212
   2213static void __exit dm_clone_exit(void)
   2214{
   2215	dm_unregister_target(&clone_target);
   2216
   2217	kmem_cache_destroy(_hydration_cache);
   2218	_hydration_cache = NULL;
   2219}
   2220
   2221/* Module hooks */
   2222module_init(dm_clone_init);
   2223module_exit(dm_clone_exit);
   2224
   2225MODULE_DESCRIPTION(DM_NAME " clone target");
   2226MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>");
   2227MODULE_LICENSE("GPL");