cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

dm-snap.c (69921B)


      1/*
      2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
      3 *
      4 * This file is released under the GPL.
      5 */
      6
      7#include <linux/blkdev.h>
      8#include <linux/device-mapper.h>
      9#include <linux/delay.h>
     10#include <linux/fs.h>
     11#include <linux/init.h>
     12#include <linux/kdev_t.h>
     13#include <linux/list.h>
     14#include <linux/list_bl.h>
     15#include <linux/mempool.h>
     16#include <linux/module.h>
     17#include <linux/slab.h>
     18#include <linux/vmalloc.h>
     19#include <linux/log2.h>
     20#include <linux/dm-kcopyd.h>
     21
     22#include "dm.h"
     23
     24#include "dm-exception-store.h"
     25
     26#define DM_MSG_PREFIX "snapshots"
     27
     28static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
     29
     30#define dm_target_is_snapshot_merge(ti) \
     31	((ti)->type->name == dm_snapshot_merge_target_name)
     32
     33/*
     34 * The size of the mempool used to track chunks in use.
     35 */
     36#define MIN_IOS 256
     37
     38#define DM_TRACKED_CHUNK_HASH_SIZE	16
     39#define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & \
     40					 (DM_TRACKED_CHUNK_HASH_SIZE - 1))
     41
     42struct dm_exception_table {
     43	uint32_t hash_mask;
     44	unsigned hash_shift;
     45	struct hlist_bl_head *table;
     46};
     47
     48struct dm_snapshot {
     49	struct rw_semaphore lock;
     50
     51	struct dm_dev *origin;
     52	struct dm_dev *cow;
     53
     54	struct dm_target *ti;
     55
     56	/* List of snapshots per Origin */
     57	struct list_head list;
     58
     59	/*
     60	 * You can't use a snapshot if this is 0 (e.g. if full).
     61	 * A snapshot-merge target never clears this.
     62	 */
     63	int valid;
     64
     65	/*
     66	 * The snapshot overflowed because of a write to the snapshot device.
     67	 * We don't have to invalidate the snapshot in this case, but we need
     68	 * to prevent further writes.
     69	 */
     70	int snapshot_overflowed;
     71
     72	/* Origin writes don't trigger exceptions until this is set */
     73	int active;
     74
     75	atomic_t pending_exceptions_count;
     76
     77	spinlock_t pe_allocation_lock;
     78
     79	/* Protected by "pe_allocation_lock" */
     80	sector_t exception_start_sequence;
     81
     82	/* Protected by kcopyd single-threaded callback */
     83	sector_t exception_complete_sequence;
     84
     85	/*
     86	 * A list of pending exceptions that completed out of order.
     87	 * Protected by kcopyd single-threaded callback.
     88	 */
     89	struct rb_root out_of_order_tree;
     90
     91	mempool_t pending_pool;
     92
     93	struct dm_exception_table pending;
     94	struct dm_exception_table complete;
     95
     96	/*
     97	 * pe_lock protects all pending_exception operations and access
     98	 * as well as the snapshot_bios list.
     99	 */
    100	spinlock_t pe_lock;
    101
    102	/* Chunks with outstanding reads */
    103	spinlock_t tracked_chunk_lock;
    104	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
    105
    106	/* The on disk metadata handler */
    107	struct dm_exception_store *store;
    108
    109	unsigned in_progress;
    110	struct wait_queue_head in_progress_wait;
    111
    112	struct dm_kcopyd_client *kcopyd_client;
    113
    114	/* Wait for events based on state_bits */
    115	unsigned long state_bits;
    116
    117	/* Range of chunks currently being merged. */
    118	chunk_t first_merging_chunk;
    119	int num_merging_chunks;
    120
    121	/*
    122	 * The merge operation failed if this flag is set.
    123	 * Failure modes are handled as follows:
    124	 * - I/O error reading the header
    125	 *   	=> don't load the target; abort.
    126	 * - Header does not have "valid" flag set
    127	 *   	=> use the origin; forget about the snapshot.
    128	 * - I/O error when reading exceptions
    129	 *   	=> don't load the target; abort.
    130	 *         (We can't use the intermediate origin state.)
    131	 * - I/O error while merging
    132	 *	=> stop merging; set merge_failed; process I/O normally.
    133	 */
    134	bool merge_failed:1;
    135
    136	bool discard_zeroes_cow:1;
    137	bool discard_passdown_origin:1;
    138
    139	/*
    140	 * Incoming bios that overlap with chunks being merged must wait
    141	 * for them to be committed.
    142	 */
    143	struct bio_list bios_queued_during_merge;
    144};
    145
    146/*
    147 * state_bits:
    148 *   RUNNING_MERGE  - Merge operation is in progress.
    149 *   SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
    150 *                    cleared afterwards.
    151 */
    152#define RUNNING_MERGE          0
    153#define SHUTDOWN_MERGE         1
    154
    155/*
    156 * Maximum number of chunks being copied on write.
    157 *
    158 * The value was decided experimentally as a trade-off between memory
    159 * consumption, stalling the kernel's workqueues and maintaining a high enough
    160 * throughput.
    161 */
    162#define DEFAULT_COW_THRESHOLD 2048
    163
    164static unsigned cow_threshold = DEFAULT_COW_THRESHOLD;
    165module_param_named(snapshot_cow_threshold, cow_threshold, uint, 0644);
    166MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
    167
    168DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
    169		"A percentage of time allocated for copy on write");
    170
    171struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
    172{
    173	return s->origin;
    174}
    175EXPORT_SYMBOL(dm_snap_origin);
    176
    177struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
    178{
    179	return s->cow;
    180}
    181EXPORT_SYMBOL(dm_snap_cow);
    182
    183static sector_t chunk_to_sector(struct dm_exception_store *store,
    184				chunk_t chunk)
    185{
    186	return chunk << store->chunk_shift;
    187}
    188
    189static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
    190{
    191	/*
    192	 * There is only ever one instance of a particular block
    193	 * device so we can compare pointers safely.
    194	 */
    195	return lhs == rhs;
    196}
    197
    198struct dm_snap_pending_exception {
    199	struct dm_exception e;
    200
    201	/*
    202	 * Origin buffers waiting for this to complete are held
    203	 * in a bio list
    204	 */
    205	struct bio_list origin_bios;
    206	struct bio_list snapshot_bios;
    207
    208	/* Pointer back to snapshot context */
    209	struct dm_snapshot *snap;
    210
    211	/*
    212	 * 1 indicates the exception has already been sent to
    213	 * kcopyd.
    214	 */
    215	int started;
    216
    217	/* There was copying error. */
    218	int copy_error;
    219
    220	/* A sequence number, it is used for in-order completion. */
    221	sector_t exception_sequence;
    222
    223	struct rb_node out_of_order_node;
    224
    225	/*
    226	 * For writing a complete chunk, bypassing the copy.
    227	 */
    228	struct bio *full_bio;
    229	bio_end_io_t *full_bio_end_io;
    230};
    231
    232/*
    233 * Hash table mapping origin volumes to lists of snapshots and
    234 * a lock to protect it
    235 */
    236static struct kmem_cache *exception_cache;
    237static struct kmem_cache *pending_cache;
    238
    239struct dm_snap_tracked_chunk {
    240	struct hlist_node node;
    241	chunk_t chunk;
    242};
    243
    244static void init_tracked_chunk(struct bio *bio)
    245{
    246	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
    247	INIT_HLIST_NODE(&c->node);
    248}
    249
    250static bool is_bio_tracked(struct bio *bio)
    251{
    252	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
    253	return !hlist_unhashed(&c->node);
    254}
    255
    256static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk)
    257{
    258	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
    259
    260	c->chunk = chunk;
    261
    262	spin_lock_irq(&s->tracked_chunk_lock);
    263	hlist_add_head(&c->node,
    264		       &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
    265	spin_unlock_irq(&s->tracked_chunk_lock);
    266}
    267
    268static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio)
    269{
    270	struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
    271	unsigned long flags;
    272
    273	spin_lock_irqsave(&s->tracked_chunk_lock, flags);
    274	hlist_del(&c->node);
    275	spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
    276}
    277
    278static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
    279{
    280	struct dm_snap_tracked_chunk *c;
    281	int found = 0;
    282
    283	spin_lock_irq(&s->tracked_chunk_lock);
    284
    285	hlist_for_each_entry(c,
    286	    &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
    287		if (c->chunk == chunk) {
    288			found = 1;
    289			break;
    290		}
    291	}
    292
    293	spin_unlock_irq(&s->tracked_chunk_lock);
    294
    295	return found;
    296}
    297
    298/*
    299 * This conflicting I/O is extremely improbable in the caller,
    300 * so msleep(1) is sufficient and there is no need for a wait queue.
    301 */
    302static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
    303{
    304	while (__chunk_is_tracked(s, chunk))
    305		msleep(1);
    306}
    307
    308/*
    309 * One of these per registered origin, held in the snapshot_origins hash
    310 */
    311struct origin {
    312	/* The origin device */
    313	struct block_device *bdev;
    314
    315	struct list_head hash_list;
    316
    317	/* List of snapshots for this origin */
    318	struct list_head snapshots;
    319};
    320
    321/*
    322 * This structure is allocated for each origin target
    323 */
    324struct dm_origin {
    325	struct dm_dev *dev;
    326	struct dm_target *ti;
    327	unsigned split_boundary;
    328	struct list_head hash_list;
    329};
    330
    331/*
    332 * Size of the hash table for origin volumes. If we make this
    333 * the size of the minors list then it should be nearly perfect
    334 */
    335#define ORIGIN_HASH_SIZE 256
    336#define ORIGIN_MASK      0xFF
    337static struct list_head *_origins;
    338static struct list_head *_dm_origins;
    339static struct rw_semaphore _origins_lock;
    340
    341static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
    342static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
    343static uint64_t _pending_exceptions_done_count;
    344
    345static int init_origin_hash(void)
    346{
    347	int i;
    348
    349	_origins = kmalloc_array(ORIGIN_HASH_SIZE, sizeof(struct list_head),
    350				 GFP_KERNEL);
    351	if (!_origins) {
    352		DMERR("unable to allocate memory for _origins");
    353		return -ENOMEM;
    354	}
    355	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
    356		INIT_LIST_HEAD(_origins + i);
    357
    358	_dm_origins = kmalloc_array(ORIGIN_HASH_SIZE,
    359				    sizeof(struct list_head),
    360				    GFP_KERNEL);
    361	if (!_dm_origins) {
    362		DMERR("unable to allocate memory for _dm_origins");
    363		kfree(_origins);
    364		return -ENOMEM;
    365	}
    366	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
    367		INIT_LIST_HEAD(_dm_origins + i);
    368
    369	init_rwsem(&_origins_lock);
    370
    371	return 0;
    372}
    373
    374static void exit_origin_hash(void)
    375{
    376	kfree(_origins);
    377	kfree(_dm_origins);
    378}
    379
    380static unsigned origin_hash(struct block_device *bdev)
    381{
    382	return bdev->bd_dev & ORIGIN_MASK;
    383}
    384
    385static struct origin *__lookup_origin(struct block_device *origin)
    386{
    387	struct list_head *ol;
    388	struct origin *o;
    389
    390	ol = &_origins[origin_hash(origin)];
    391	list_for_each_entry (o, ol, hash_list)
    392		if (bdev_equal(o->bdev, origin))
    393			return o;
    394
    395	return NULL;
    396}
    397
    398static void __insert_origin(struct origin *o)
    399{
    400	struct list_head *sl = &_origins[origin_hash(o->bdev)];
    401	list_add_tail(&o->hash_list, sl);
    402}
    403
    404static struct dm_origin *__lookup_dm_origin(struct block_device *origin)
    405{
    406	struct list_head *ol;
    407	struct dm_origin *o;
    408
    409	ol = &_dm_origins[origin_hash(origin)];
    410	list_for_each_entry (o, ol, hash_list)
    411		if (bdev_equal(o->dev->bdev, origin))
    412			return o;
    413
    414	return NULL;
    415}
    416
    417static void __insert_dm_origin(struct dm_origin *o)
    418{
    419	struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
    420	list_add_tail(&o->hash_list, sl);
    421}
    422
    423static void __remove_dm_origin(struct dm_origin *o)
    424{
    425	list_del(&o->hash_list);
    426}
    427
    428/*
    429 * _origins_lock must be held when calling this function.
    430 * Returns number of snapshots registered using the supplied cow device, plus:
    431 * snap_src - a snapshot suitable for use as a source of exception handover
    432 * snap_dest - a snapshot capable of receiving exception handover.
    433 * snap_merge - an existing snapshot-merge target linked to the same origin.
    434 *   There can be at most one snapshot-merge target. The parameter is optional.
    435 *
    436 * Possible return values and states of snap_src and snap_dest.
    437 *   0: NULL, NULL  - first new snapshot
    438 *   1: snap_src, NULL - normal snapshot
    439 *   2: snap_src, snap_dest  - waiting for handover
    440 *   2: snap_src, NULL - handed over, waiting for old to be deleted
    441 *   1: NULL, snap_dest - source got destroyed without handover
    442 */
    443static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
    444					struct dm_snapshot **snap_src,
    445					struct dm_snapshot **snap_dest,
    446					struct dm_snapshot **snap_merge)
    447{
    448	struct dm_snapshot *s;
    449	struct origin *o;
    450	int count = 0;
    451	int active;
    452
    453	o = __lookup_origin(snap->origin->bdev);
    454	if (!o)
    455		goto out;
    456
    457	list_for_each_entry(s, &o->snapshots, list) {
    458		if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
    459			*snap_merge = s;
    460		if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
    461			continue;
    462
    463		down_read(&s->lock);
    464		active = s->active;
    465		up_read(&s->lock);
    466
    467		if (active) {
    468			if (snap_src)
    469				*snap_src = s;
    470		} else if (snap_dest)
    471			*snap_dest = s;
    472
    473		count++;
    474	}
    475
    476out:
    477	return count;
    478}
    479
    480/*
    481 * On success, returns 1 if this snapshot is a handover destination,
    482 * otherwise returns 0.
    483 */
    484static int __validate_exception_handover(struct dm_snapshot *snap)
    485{
    486	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
    487	struct dm_snapshot *snap_merge = NULL;
    488
    489	/* Does snapshot need exceptions handed over to it? */
    490	if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
    491					  &snap_merge) == 2) ||
    492	    snap_dest) {
    493		snap->ti->error = "Snapshot cow pairing for exception "
    494				  "table handover failed";
    495		return -EINVAL;
    496	}
    497
    498	/*
    499	 * If no snap_src was found, snap cannot become a handover
    500	 * destination.
    501	 */
    502	if (!snap_src)
    503		return 0;
    504
    505	/*
    506	 * Non-snapshot-merge handover?
    507	 */
    508	if (!dm_target_is_snapshot_merge(snap->ti))
    509		return 1;
    510
    511	/*
    512	 * Do not allow more than one merging snapshot.
    513	 */
    514	if (snap_merge) {
    515		snap->ti->error = "A snapshot is already merging.";
    516		return -EINVAL;
    517	}
    518
    519	if (!snap_src->store->type->prepare_merge ||
    520	    !snap_src->store->type->commit_merge) {
    521		snap->ti->error = "Snapshot exception store does not "
    522				  "support snapshot-merge.";
    523		return -EINVAL;
    524	}
    525
    526	return 1;
    527}
    528
    529static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
    530{
    531	struct dm_snapshot *l;
    532
    533	/* Sort the list according to chunk size, largest-first smallest-last */
    534	list_for_each_entry(l, &o->snapshots, list)
    535		if (l->store->chunk_size < s->store->chunk_size)
    536			break;
    537	list_add_tail(&s->list, &l->list);
    538}
    539
    540/*
    541 * Make a note of the snapshot and its origin so we can look it
    542 * up when the origin has a write on it.
    543 *
    544 * Also validate snapshot exception store handovers.
    545 * On success, returns 1 if this registration is a handover destination,
    546 * otherwise returns 0.
    547 */
    548static int register_snapshot(struct dm_snapshot *snap)
    549{
    550	struct origin *o, *new_o = NULL;
    551	struct block_device *bdev = snap->origin->bdev;
    552	int r = 0;
    553
    554	new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
    555	if (!new_o)
    556		return -ENOMEM;
    557
    558	down_write(&_origins_lock);
    559
    560	r = __validate_exception_handover(snap);
    561	if (r < 0) {
    562		kfree(new_o);
    563		goto out;
    564	}
    565
    566	o = __lookup_origin(bdev);
    567	if (o)
    568		kfree(new_o);
    569	else {
    570		/* New origin */
    571		o = new_o;
    572
    573		/* Initialise the struct */
    574		INIT_LIST_HEAD(&o->snapshots);
    575		o->bdev = bdev;
    576
    577		__insert_origin(o);
    578	}
    579
    580	__insert_snapshot(o, snap);
    581
    582out:
    583	up_write(&_origins_lock);
    584
    585	return r;
    586}
    587
    588/*
    589 * Move snapshot to correct place in list according to chunk size.
    590 */
    591static void reregister_snapshot(struct dm_snapshot *s)
    592{
    593	struct block_device *bdev = s->origin->bdev;
    594
    595	down_write(&_origins_lock);
    596
    597	list_del(&s->list);
    598	__insert_snapshot(__lookup_origin(bdev), s);
    599
    600	up_write(&_origins_lock);
    601}
    602
    603static void unregister_snapshot(struct dm_snapshot *s)
    604{
    605	struct origin *o;
    606
    607	down_write(&_origins_lock);
    608	o = __lookup_origin(s->origin->bdev);
    609
    610	list_del(&s->list);
    611	if (o && list_empty(&o->snapshots)) {
    612		list_del(&o->hash_list);
    613		kfree(o);
    614	}
    615
    616	up_write(&_origins_lock);
    617}
    618
    619/*
    620 * Implementation of the exception hash tables.
    621 * The lowest hash_shift bits of the chunk number are ignored, allowing
    622 * some consecutive chunks to be grouped together.
    623 */
    624static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
    625
    626/* Lock to protect access to the completed and pending exception hash tables. */
    627struct dm_exception_table_lock {
    628	struct hlist_bl_head *complete_slot;
    629	struct hlist_bl_head *pending_slot;
    630};
    631
    632static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
    633					 struct dm_exception_table_lock *lock)
    634{
    635	struct dm_exception_table *complete = &s->complete;
    636	struct dm_exception_table *pending = &s->pending;
    637
    638	lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
    639	lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
    640}
    641
    642static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
    643{
    644	hlist_bl_lock(lock->complete_slot);
    645	hlist_bl_lock(lock->pending_slot);
    646}
    647
    648static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
    649{
    650	hlist_bl_unlock(lock->pending_slot);
    651	hlist_bl_unlock(lock->complete_slot);
    652}
    653
    654static int dm_exception_table_init(struct dm_exception_table *et,
    655				   uint32_t size, unsigned hash_shift)
    656{
    657	unsigned int i;
    658
    659	et->hash_shift = hash_shift;
    660	et->hash_mask = size - 1;
    661	et->table = kvmalloc_array(size, sizeof(struct hlist_bl_head),
    662				   GFP_KERNEL);
    663	if (!et->table)
    664		return -ENOMEM;
    665
    666	for (i = 0; i < size; i++)
    667		INIT_HLIST_BL_HEAD(et->table + i);
    668
    669	return 0;
    670}
    671
    672static void dm_exception_table_exit(struct dm_exception_table *et,
    673				    struct kmem_cache *mem)
    674{
    675	struct hlist_bl_head *slot;
    676	struct dm_exception *ex;
    677	struct hlist_bl_node *pos, *n;
    678	int i, size;
    679
    680	size = et->hash_mask + 1;
    681	for (i = 0; i < size; i++) {
    682		slot = et->table + i;
    683
    684		hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
    685			kmem_cache_free(mem, ex);
    686	}
    687
    688	kvfree(et->table);
    689}
    690
    691static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
    692{
    693	return (chunk >> et->hash_shift) & et->hash_mask;
    694}
    695
    696static void dm_remove_exception(struct dm_exception *e)
    697{
    698	hlist_bl_del(&e->hash_list);
    699}
    700
    701/*
    702 * Return the exception data for a sector, or NULL if not
    703 * remapped.
    704 */
    705static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
    706						chunk_t chunk)
    707{
    708	struct hlist_bl_head *slot;
    709	struct hlist_bl_node *pos;
    710	struct dm_exception *e;
    711
    712	slot = &et->table[exception_hash(et, chunk)];
    713	hlist_bl_for_each_entry(e, pos, slot, hash_list)
    714		if (chunk >= e->old_chunk &&
    715		    chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
    716			return e;
    717
    718	return NULL;
    719}
    720
    721static struct dm_exception *alloc_completed_exception(gfp_t gfp)
    722{
    723	struct dm_exception *e;
    724
    725	e = kmem_cache_alloc(exception_cache, gfp);
    726	if (!e && gfp == GFP_NOIO)
    727		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
    728
    729	return e;
    730}
    731
    732static void free_completed_exception(struct dm_exception *e)
    733{
    734	kmem_cache_free(exception_cache, e);
    735}
    736
    737static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
    738{
    739	struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
    740							     GFP_NOIO);
    741
    742	atomic_inc(&s->pending_exceptions_count);
    743	pe->snap = s;
    744
    745	return pe;
    746}
    747
    748static void free_pending_exception(struct dm_snap_pending_exception *pe)
    749{
    750	struct dm_snapshot *s = pe->snap;
    751
    752	mempool_free(pe, &s->pending_pool);
    753	smp_mb__before_atomic();
    754	atomic_dec(&s->pending_exceptions_count);
    755}
    756
    757static void dm_insert_exception(struct dm_exception_table *eh,
    758				struct dm_exception *new_e)
    759{
    760	struct hlist_bl_head *l;
    761	struct hlist_bl_node *pos;
    762	struct dm_exception *e = NULL;
    763
    764	l = &eh->table[exception_hash(eh, new_e->old_chunk)];
    765
    766	/* Add immediately if this table doesn't support consecutive chunks */
    767	if (!eh->hash_shift)
    768		goto out;
    769
    770	/* List is ordered by old_chunk */
    771	hlist_bl_for_each_entry(e, pos, l, hash_list) {
    772		/* Insert after an existing chunk? */
    773		if (new_e->old_chunk == (e->old_chunk +
    774					 dm_consecutive_chunk_count(e) + 1) &&
    775		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
    776					 dm_consecutive_chunk_count(e) + 1)) {
    777			dm_consecutive_chunk_count_inc(e);
    778			free_completed_exception(new_e);
    779			return;
    780		}
    781
    782		/* Insert before an existing chunk? */
    783		if (new_e->old_chunk == (e->old_chunk - 1) &&
    784		    new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
    785			dm_consecutive_chunk_count_inc(e);
    786			e->old_chunk--;
    787			e->new_chunk--;
    788			free_completed_exception(new_e);
    789			return;
    790		}
    791
    792		if (new_e->old_chunk < e->old_chunk)
    793			break;
    794	}
    795
    796out:
    797	if (!e) {
    798		/*
    799		 * Either the table doesn't support consecutive chunks or slot
    800		 * l is empty.
    801		 */
    802		hlist_bl_add_head(&new_e->hash_list, l);
    803	} else if (new_e->old_chunk < e->old_chunk) {
    804		/* Add before an existing exception */
    805		hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
    806	} else {
    807		/* Add to l's tail: e is the last exception in this slot */
    808		hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
    809	}
    810}
    811
    812/*
    813 * Callback used by the exception stores to load exceptions when
    814 * initialising.
    815 */
    816static int dm_add_exception(void *context, chunk_t old, chunk_t new)
    817{
    818	struct dm_exception_table_lock lock;
    819	struct dm_snapshot *s = context;
    820	struct dm_exception *e;
    821
    822	e = alloc_completed_exception(GFP_KERNEL);
    823	if (!e)
    824		return -ENOMEM;
    825
    826	e->old_chunk = old;
    827
    828	/* Consecutive_count is implicitly initialised to zero */
    829	e->new_chunk = new;
    830
    831	/*
    832	 * Although there is no need to lock access to the exception tables
    833	 * here, if we don't then hlist_bl_add_head(), called by
    834	 * dm_insert_exception(), will complain about accessing the
    835	 * corresponding list without locking it first.
    836	 */
    837	dm_exception_table_lock_init(s, old, &lock);
    838
    839	dm_exception_table_lock(&lock);
    840	dm_insert_exception(&s->complete, e);
    841	dm_exception_table_unlock(&lock);
    842
    843	return 0;
    844}
    845
    846/*
    847 * Return a minimum chunk size of all snapshots that have the specified origin.
    848 * Return zero if the origin has no snapshots.
    849 */
    850static uint32_t __minimum_chunk_size(struct origin *o)
    851{
    852	struct dm_snapshot *snap;
    853	unsigned chunk_size = rounddown_pow_of_two(UINT_MAX);
    854
    855	if (o)
    856		list_for_each_entry(snap, &o->snapshots, list)
    857			chunk_size = min_not_zero(chunk_size,
    858						  snap->store->chunk_size);
    859
    860	return (uint32_t) chunk_size;
    861}
    862
    863/*
    864 * Hard coded magic.
    865 */
    866static int calc_max_buckets(void)
    867{
    868	/* use a fixed size of 2MB */
    869	unsigned long mem = 2 * 1024 * 1024;
    870	mem /= sizeof(struct hlist_bl_head);
    871
    872	return mem;
    873}
    874
    875/*
    876 * Allocate room for a suitable hash table.
    877 */
    878static int init_hash_tables(struct dm_snapshot *s)
    879{
    880	sector_t hash_size, cow_dev_size, max_buckets;
    881
    882	/*
    883	 * Calculate based on the size of the original volume or
    884	 * the COW volume...
    885	 */
    886	cow_dev_size = get_dev_size(s->cow->bdev);
    887	max_buckets = calc_max_buckets();
    888
    889	hash_size = cow_dev_size >> s->store->chunk_shift;
    890	hash_size = min(hash_size, max_buckets);
    891
    892	if (hash_size < 64)
    893		hash_size = 64;
    894	hash_size = rounddown_pow_of_two(hash_size);
    895	if (dm_exception_table_init(&s->complete, hash_size,
    896				    DM_CHUNK_CONSECUTIVE_BITS))
    897		return -ENOMEM;
    898
    899	/*
    900	 * Allocate hash table for in-flight exceptions
    901	 * Make this smaller than the real hash table
    902	 */
    903	hash_size >>= 3;
    904	if (hash_size < 64)
    905		hash_size = 64;
    906
    907	if (dm_exception_table_init(&s->pending, hash_size, 0)) {
    908		dm_exception_table_exit(&s->complete, exception_cache);
    909		return -ENOMEM;
    910	}
    911
    912	return 0;
    913}
    914
    915static void merge_shutdown(struct dm_snapshot *s)
    916{
    917	clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
    918	smp_mb__after_atomic();
    919	wake_up_bit(&s->state_bits, RUNNING_MERGE);
    920}
    921
    922static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
    923{
    924	s->first_merging_chunk = 0;
    925	s->num_merging_chunks = 0;
    926
    927	return bio_list_get(&s->bios_queued_during_merge);
    928}
    929
    930/*
    931 * Remove one chunk from the index of completed exceptions.
    932 */
    933static int __remove_single_exception_chunk(struct dm_snapshot *s,
    934					   chunk_t old_chunk)
    935{
    936	struct dm_exception *e;
    937
    938	e = dm_lookup_exception(&s->complete, old_chunk);
    939	if (!e) {
    940		DMERR("Corruption detected: exception for block %llu is "
    941		      "on disk but not in memory",
    942		      (unsigned long long)old_chunk);
    943		return -EINVAL;
    944	}
    945
    946	/*
    947	 * If this is the only chunk using this exception, remove exception.
    948	 */
    949	if (!dm_consecutive_chunk_count(e)) {
    950		dm_remove_exception(e);
    951		free_completed_exception(e);
    952		return 0;
    953	}
    954
    955	/*
    956	 * The chunk may be either at the beginning or the end of a
    957	 * group of consecutive chunks - never in the middle.  We are
    958	 * removing chunks in the opposite order to that in which they
    959	 * were added, so this should always be true.
    960	 * Decrement the consecutive chunk counter and adjust the
    961	 * starting point if necessary.
    962	 */
    963	if (old_chunk == e->old_chunk) {
    964		e->old_chunk++;
    965		e->new_chunk++;
    966	} else if (old_chunk != e->old_chunk +
    967		   dm_consecutive_chunk_count(e)) {
    968		DMERR("Attempt to merge block %llu from the "
    969		      "middle of a chunk range [%llu - %llu]",
    970		      (unsigned long long)old_chunk,
    971		      (unsigned long long)e->old_chunk,
    972		      (unsigned long long)
    973		      e->old_chunk + dm_consecutive_chunk_count(e));
    974		return -EINVAL;
    975	}
    976
    977	dm_consecutive_chunk_count_dec(e);
    978
    979	return 0;
    980}
    981
    982static void flush_bios(struct bio *bio);
    983
    984static int remove_single_exception_chunk(struct dm_snapshot *s)
    985{
    986	struct bio *b = NULL;
    987	int r;
    988	chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
    989
    990	down_write(&s->lock);
    991
    992	/*
    993	 * Process chunks (and associated exceptions) in reverse order
    994	 * so that dm_consecutive_chunk_count_dec() accounting works.
    995	 */
    996	do {
    997		r = __remove_single_exception_chunk(s, old_chunk);
    998		if (r)
    999			goto out;
   1000	} while (old_chunk-- > s->first_merging_chunk);
   1001
   1002	b = __release_queued_bios_after_merge(s);
   1003
   1004out:
   1005	up_write(&s->lock);
   1006	if (b)
   1007		flush_bios(b);
   1008
   1009	return r;
   1010}
   1011
   1012static int origin_write_extent(struct dm_snapshot *merging_snap,
   1013			       sector_t sector, unsigned chunk_size);
   1014
   1015static void merge_callback(int read_err, unsigned long write_err,
   1016			   void *context);
   1017
   1018static uint64_t read_pending_exceptions_done_count(void)
   1019{
   1020	uint64_t pending_exceptions_done;
   1021
   1022	spin_lock(&_pending_exceptions_done_spinlock);
   1023	pending_exceptions_done = _pending_exceptions_done_count;
   1024	spin_unlock(&_pending_exceptions_done_spinlock);
   1025
   1026	return pending_exceptions_done;
   1027}
   1028
   1029static void increment_pending_exceptions_done_count(void)
   1030{
   1031	spin_lock(&_pending_exceptions_done_spinlock);
   1032	_pending_exceptions_done_count++;
   1033	spin_unlock(&_pending_exceptions_done_spinlock);
   1034
   1035	wake_up_all(&_pending_exceptions_done);
   1036}
   1037
   1038static void snapshot_merge_next_chunks(struct dm_snapshot *s)
   1039{
   1040	int i, linear_chunks;
   1041	chunk_t old_chunk, new_chunk;
   1042	struct dm_io_region src, dest;
   1043	sector_t io_size;
   1044	uint64_t previous_count;
   1045
   1046	BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
   1047	if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
   1048		goto shut;
   1049
   1050	/*
   1051	 * valid flag never changes during merge, so no lock required.
   1052	 */
   1053	if (!s->valid) {
   1054		DMERR("Snapshot is invalid: can't merge");
   1055		goto shut;
   1056	}
   1057
   1058	linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
   1059						      &new_chunk);
   1060	if (linear_chunks <= 0) {
   1061		if (linear_chunks < 0) {
   1062			DMERR("Read error in exception store: "
   1063			      "shutting down merge");
   1064			down_write(&s->lock);
   1065			s->merge_failed = true;
   1066			up_write(&s->lock);
   1067		}
   1068		goto shut;
   1069	}
   1070
   1071	/* Adjust old_chunk and new_chunk to reflect start of linear region */
   1072	old_chunk = old_chunk + 1 - linear_chunks;
   1073	new_chunk = new_chunk + 1 - linear_chunks;
   1074
   1075	/*
   1076	 * Use one (potentially large) I/O to copy all 'linear_chunks'
   1077	 * from the exception store to the origin
   1078	 */
   1079	io_size = linear_chunks * s->store->chunk_size;
   1080
   1081	dest.bdev = s->origin->bdev;
   1082	dest.sector = chunk_to_sector(s->store, old_chunk);
   1083	dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
   1084
   1085	src.bdev = s->cow->bdev;
   1086	src.sector = chunk_to_sector(s->store, new_chunk);
   1087	src.count = dest.count;
   1088
   1089	/*
   1090	 * Reallocate any exceptions needed in other snapshots then
   1091	 * wait for the pending exceptions to complete.
   1092	 * Each time any pending exception (globally on the system)
   1093	 * completes we are woken and repeat the process to find out
   1094	 * if we can proceed.  While this may not seem a particularly
   1095	 * efficient algorithm, it is not expected to have any
   1096	 * significant impact on performance.
   1097	 */
   1098	previous_count = read_pending_exceptions_done_count();
   1099	while (origin_write_extent(s, dest.sector, io_size)) {
   1100		wait_event(_pending_exceptions_done,
   1101			   (read_pending_exceptions_done_count() !=
   1102			    previous_count));
   1103		/* Retry after the wait, until all exceptions are done. */
   1104		previous_count = read_pending_exceptions_done_count();
   1105	}
   1106
   1107	down_write(&s->lock);
   1108	s->first_merging_chunk = old_chunk;
   1109	s->num_merging_chunks = linear_chunks;
   1110	up_write(&s->lock);
   1111
   1112	/* Wait until writes to all 'linear_chunks' drain */
   1113	for (i = 0; i < linear_chunks; i++)
   1114		__check_for_conflicting_io(s, old_chunk + i);
   1115
   1116	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
   1117	return;
   1118
   1119shut:
   1120	merge_shutdown(s);
   1121}
   1122
   1123static void error_bios(struct bio *bio);
   1124
   1125static void merge_callback(int read_err, unsigned long write_err, void *context)
   1126{
   1127	struct dm_snapshot *s = context;
   1128	struct bio *b = NULL;
   1129
   1130	if (read_err || write_err) {
   1131		if (read_err)
   1132			DMERR("Read error: shutting down merge.");
   1133		else
   1134			DMERR("Write error: shutting down merge.");
   1135		goto shut;
   1136	}
   1137
   1138	if (blkdev_issue_flush(s->origin->bdev) < 0) {
   1139		DMERR("Flush after merge failed: shutting down merge");
   1140		goto shut;
   1141	}
   1142
   1143	if (s->store->type->commit_merge(s->store,
   1144					 s->num_merging_chunks) < 0) {
   1145		DMERR("Write error in exception store: shutting down merge");
   1146		goto shut;
   1147	}
   1148
   1149	if (remove_single_exception_chunk(s) < 0)
   1150		goto shut;
   1151
   1152	snapshot_merge_next_chunks(s);
   1153
   1154	return;
   1155
   1156shut:
   1157	down_write(&s->lock);
   1158	s->merge_failed = true;
   1159	b = __release_queued_bios_after_merge(s);
   1160	up_write(&s->lock);
   1161	error_bios(b);
   1162
   1163	merge_shutdown(s);
   1164}
   1165
   1166static void start_merge(struct dm_snapshot *s)
   1167{
   1168	if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
   1169		snapshot_merge_next_chunks(s);
   1170}
   1171
   1172/*
   1173 * Stop the merging process and wait until it finishes.
   1174 */
   1175static void stop_merge(struct dm_snapshot *s)
   1176{
   1177	set_bit(SHUTDOWN_MERGE, &s->state_bits);
   1178	wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE);
   1179	clear_bit(SHUTDOWN_MERGE, &s->state_bits);
   1180}
   1181
   1182static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s,
   1183				   struct dm_target *ti)
   1184{
   1185	int r;
   1186	unsigned argc;
   1187	const char *arg_name;
   1188
   1189	static const struct dm_arg _args[] = {
   1190		{0, 2, "Invalid number of feature arguments"},
   1191	};
   1192
   1193	/*
   1194	 * No feature arguments supplied.
   1195	 */
   1196	if (!as->argc)
   1197		return 0;
   1198
   1199	r = dm_read_arg_group(_args, as, &argc, &ti->error);
   1200	if (r)
   1201		return -EINVAL;
   1202
   1203	while (argc && !r) {
   1204		arg_name = dm_shift_arg(as);
   1205		argc--;
   1206
   1207		if (!strcasecmp(arg_name, "discard_zeroes_cow"))
   1208			s->discard_zeroes_cow = true;
   1209
   1210		else if (!strcasecmp(arg_name, "discard_passdown_origin"))
   1211			s->discard_passdown_origin = true;
   1212
   1213		else {
   1214			ti->error = "Unrecognised feature requested";
   1215			r = -EINVAL;
   1216			break;
   1217		}
   1218	}
   1219
   1220	if (!s->discard_zeroes_cow && s->discard_passdown_origin) {
   1221		/*
   1222		 * TODO: really these are disjoint.. but ti->num_discard_bios
   1223		 * and dm_bio_get_target_bio_nr() require rigid constraints.
   1224		 */
   1225		ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow";
   1226		r = -EINVAL;
   1227	}
   1228
   1229	return r;
   1230}
   1231
   1232/*
   1233 * Construct a snapshot mapping:
   1234 * <origin_dev> <COW-dev> <p|po|n> <chunk-size> [<# feature args> [<arg>]*]
   1235 */
   1236static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
   1237{
   1238	struct dm_snapshot *s;
   1239	struct dm_arg_set as;
   1240	int i;
   1241	int r = -EINVAL;
   1242	char *origin_path, *cow_path;
   1243	dev_t origin_dev, cow_dev;
   1244	unsigned args_used, num_flush_bios = 1;
   1245	fmode_t origin_mode = FMODE_READ;
   1246
   1247	if (argc < 4) {
   1248		ti->error = "requires 4 or more arguments";
   1249		r = -EINVAL;
   1250		goto bad;
   1251	}
   1252
   1253	if (dm_target_is_snapshot_merge(ti)) {
   1254		num_flush_bios = 2;
   1255		origin_mode = FMODE_WRITE;
   1256	}
   1257
   1258	s = kzalloc(sizeof(*s), GFP_KERNEL);
   1259	if (!s) {
   1260		ti->error = "Cannot allocate private snapshot structure";
   1261		r = -ENOMEM;
   1262		goto bad;
   1263	}
   1264
   1265	as.argc = argc;
   1266	as.argv = argv;
   1267	dm_consume_args(&as, 4);
   1268	r = parse_snapshot_features(&as, s, ti);
   1269	if (r)
   1270		goto bad_features;
   1271
   1272	origin_path = argv[0];
   1273	argv++;
   1274	argc--;
   1275
   1276	r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
   1277	if (r) {
   1278		ti->error = "Cannot get origin device";
   1279		goto bad_origin;
   1280	}
   1281	origin_dev = s->origin->bdev->bd_dev;
   1282
   1283	cow_path = argv[0];
   1284	argv++;
   1285	argc--;
   1286
   1287	cow_dev = dm_get_dev_t(cow_path);
   1288	if (cow_dev && cow_dev == origin_dev) {
   1289		ti->error = "COW device cannot be the same as origin device";
   1290		r = -EINVAL;
   1291		goto bad_cow;
   1292	}
   1293
   1294	r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
   1295	if (r) {
   1296		ti->error = "Cannot get COW device";
   1297		goto bad_cow;
   1298	}
   1299
   1300	r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
   1301	if (r) {
   1302		ti->error = "Couldn't create exception store";
   1303		r = -EINVAL;
   1304		goto bad_store;
   1305	}
   1306
   1307	argv += args_used;
   1308	argc -= args_used;
   1309
   1310	s->ti = ti;
   1311	s->valid = 1;
   1312	s->snapshot_overflowed = 0;
   1313	s->active = 0;
   1314	atomic_set(&s->pending_exceptions_count, 0);
   1315	spin_lock_init(&s->pe_allocation_lock);
   1316	s->exception_start_sequence = 0;
   1317	s->exception_complete_sequence = 0;
   1318	s->out_of_order_tree = RB_ROOT;
   1319	init_rwsem(&s->lock);
   1320	INIT_LIST_HEAD(&s->list);
   1321	spin_lock_init(&s->pe_lock);
   1322	s->state_bits = 0;
   1323	s->merge_failed = false;
   1324	s->first_merging_chunk = 0;
   1325	s->num_merging_chunks = 0;
   1326	bio_list_init(&s->bios_queued_during_merge);
   1327
   1328	/* Allocate hash table for COW data */
   1329	if (init_hash_tables(s)) {
   1330		ti->error = "Unable to allocate hash table space";
   1331		r = -ENOMEM;
   1332		goto bad_hash_tables;
   1333	}
   1334
   1335	init_waitqueue_head(&s->in_progress_wait);
   1336
   1337	s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
   1338	if (IS_ERR(s->kcopyd_client)) {
   1339		r = PTR_ERR(s->kcopyd_client);
   1340		ti->error = "Could not create kcopyd client";
   1341		goto bad_kcopyd;
   1342	}
   1343
   1344	r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
   1345	if (r) {
   1346		ti->error = "Could not allocate mempool for pending exceptions";
   1347		goto bad_pending_pool;
   1348	}
   1349
   1350	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
   1351		INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
   1352
   1353	spin_lock_init(&s->tracked_chunk_lock);
   1354
   1355	ti->private = s;
   1356	ti->num_flush_bios = num_flush_bios;
   1357	if (s->discard_zeroes_cow)
   1358		ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1);
   1359	ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
   1360
   1361	/* Add snapshot to the list of snapshots for this origin */
   1362	/* Exceptions aren't triggered till snapshot_resume() is called */
   1363	r = register_snapshot(s);
   1364	if (r == -ENOMEM) {
   1365		ti->error = "Snapshot origin struct allocation failed";
   1366		goto bad_load_and_register;
   1367	} else if (r < 0) {
   1368		/* invalid handover, register_snapshot has set ti->error */
   1369		goto bad_load_and_register;
   1370	}
   1371
   1372	/*
   1373	 * Metadata must only be loaded into one table at once, so skip this
   1374	 * if metadata will be handed over during resume.
   1375	 * Chunk size will be set during the handover - set it to zero to
   1376	 * ensure it's ignored.
   1377	 */
   1378	if (r > 0) {
   1379		s->store->chunk_size = 0;
   1380		return 0;
   1381	}
   1382
   1383	r = s->store->type->read_metadata(s->store, dm_add_exception,
   1384					  (void *)s);
   1385	if (r < 0) {
   1386		ti->error = "Failed to read snapshot metadata";
   1387		goto bad_read_metadata;
   1388	} else if (r > 0) {
   1389		s->valid = 0;
   1390		DMWARN("Snapshot is marked invalid.");
   1391	}
   1392
   1393	if (!s->store->chunk_size) {
   1394		ti->error = "Chunk size not set";
   1395		r = -EINVAL;
   1396		goto bad_read_metadata;
   1397	}
   1398
   1399	r = dm_set_target_max_io_len(ti, s->store->chunk_size);
   1400	if (r)
   1401		goto bad_read_metadata;
   1402
   1403	return 0;
   1404
   1405bad_read_metadata:
   1406	unregister_snapshot(s);
   1407bad_load_and_register:
   1408	mempool_exit(&s->pending_pool);
   1409bad_pending_pool:
   1410	dm_kcopyd_client_destroy(s->kcopyd_client);
   1411bad_kcopyd:
   1412	dm_exception_table_exit(&s->pending, pending_cache);
   1413	dm_exception_table_exit(&s->complete, exception_cache);
   1414bad_hash_tables:
   1415	dm_exception_store_destroy(s->store);
   1416bad_store:
   1417	dm_put_device(ti, s->cow);
   1418bad_cow:
   1419	dm_put_device(ti, s->origin);
   1420bad_origin:
   1421bad_features:
   1422	kfree(s);
   1423bad:
   1424	return r;
   1425}
   1426
   1427static void __free_exceptions(struct dm_snapshot *s)
   1428{
   1429	dm_kcopyd_client_destroy(s->kcopyd_client);
   1430	s->kcopyd_client = NULL;
   1431
   1432	dm_exception_table_exit(&s->pending, pending_cache);
   1433	dm_exception_table_exit(&s->complete, exception_cache);
   1434}
   1435
   1436static void __handover_exceptions(struct dm_snapshot *snap_src,
   1437				  struct dm_snapshot *snap_dest)
   1438{
   1439	union {
   1440		struct dm_exception_table table_swap;
   1441		struct dm_exception_store *store_swap;
   1442	} u;
   1443
   1444	/*
   1445	 * Swap all snapshot context information between the two instances.
   1446	 */
   1447	u.table_swap = snap_dest->complete;
   1448	snap_dest->complete = snap_src->complete;
   1449	snap_src->complete = u.table_swap;
   1450
   1451	u.store_swap = snap_dest->store;
   1452	snap_dest->store = snap_src->store;
   1453	snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow;
   1454	snap_src->store = u.store_swap;
   1455
   1456	snap_dest->store->snap = snap_dest;
   1457	snap_src->store->snap = snap_src;
   1458
   1459	snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
   1460	snap_dest->valid = snap_src->valid;
   1461	snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed;
   1462
   1463	/*
   1464	 * Set source invalid to ensure it receives no further I/O.
   1465	 */
   1466	snap_src->valid = 0;
   1467}
   1468
   1469static void snapshot_dtr(struct dm_target *ti)
   1470{
   1471#ifdef CONFIG_DM_DEBUG
   1472	int i;
   1473#endif
   1474	struct dm_snapshot *s = ti->private;
   1475	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
   1476
   1477	down_read(&_origins_lock);
   1478	/* Check whether exception handover must be cancelled */
   1479	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
   1480	if (snap_src && snap_dest && (s == snap_src)) {
   1481		down_write(&snap_dest->lock);
   1482		snap_dest->valid = 0;
   1483		up_write(&snap_dest->lock);
   1484		DMERR("Cancelling snapshot handover.");
   1485	}
   1486	up_read(&_origins_lock);
   1487
   1488	if (dm_target_is_snapshot_merge(ti))
   1489		stop_merge(s);
   1490
   1491	/* Prevent further origin writes from using this snapshot. */
   1492	/* After this returns there can be no new kcopyd jobs. */
   1493	unregister_snapshot(s);
   1494
   1495	while (atomic_read(&s->pending_exceptions_count))
   1496		msleep(1);
   1497	/*
   1498	 * Ensure instructions in mempool_exit aren't reordered
   1499	 * before atomic_read.
   1500	 */
   1501	smp_mb();
   1502
   1503#ifdef CONFIG_DM_DEBUG
   1504	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
   1505		BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
   1506#endif
   1507
   1508	__free_exceptions(s);
   1509
   1510	mempool_exit(&s->pending_pool);
   1511
   1512	dm_exception_store_destroy(s->store);
   1513
   1514	dm_put_device(ti, s->cow);
   1515
   1516	dm_put_device(ti, s->origin);
   1517
   1518	WARN_ON(s->in_progress);
   1519
   1520	kfree(s);
   1521}
   1522
   1523static void account_start_copy(struct dm_snapshot *s)
   1524{
   1525	spin_lock(&s->in_progress_wait.lock);
   1526	s->in_progress++;
   1527	spin_unlock(&s->in_progress_wait.lock);
   1528}
   1529
   1530static void account_end_copy(struct dm_snapshot *s)
   1531{
   1532	spin_lock(&s->in_progress_wait.lock);
   1533	BUG_ON(!s->in_progress);
   1534	s->in_progress--;
   1535	if (likely(s->in_progress <= cow_threshold) &&
   1536	    unlikely(waitqueue_active(&s->in_progress_wait)))
   1537		wake_up_locked(&s->in_progress_wait);
   1538	spin_unlock(&s->in_progress_wait.lock);
   1539}
   1540
   1541static bool wait_for_in_progress(struct dm_snapshot *s, bool unlock_origins)
   1542{
   1543	if (unlikely(s->in_progress > cow_threshold)) {
   1544		spin_lock(&s->in_progress_wait.lock);
   1545		if (likely(s->in_progress > cow_threshold)) {
   1546			/*
   1547			 * NOTE: this throttle doesn't account for whether
   1548			 * the caller is servicing an IO that will trigger a COW
   1549			 * so excess throttling may result for chunks not required
   1550			 * to be COW'd.  But if cow_threshold was reached, extra
   1551			 * throttling is unlikely to negatively impact performance.
   1552			 */
   1553			DECLARE_WAITQUEUE(wait, current);
   1554			__add_wait_queue(&s->in_progress_wait, &wait);
   1555			__set_current_state(TASK_UNINTERRUPTIBLE);
   1556			spin_unlock(&s->in_progress_wait.lock);
   1557			if (unlock_origins)
   1558				up_read(&_origins_lock);
   1559			io_schedule();
   1560			remove_wait_queue(&s->in_progress_wait, &wait);
   1561			return false;
   1562		}
   1563		spin_unlock(&s->in_progress_wait.lock);
   1564	}
   1565	return true;
   1566}
   1567
   1568/*
   1569 * Flush a list of buffers.
   1570 */
   1571static void flush_bios(struct bio *bio)
   1572{
   1573	struct bio *n;
   1574
   1575	while (bio) {
   1576		n = bio->bi_next;
   1577		bio->bi_next = NULL;
   1578		submit_bio_noacct(bio);
   1579		bio = n;
   1580	}
   1581}
   1582
   1583static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit);
   1584
   1585/*
   1586 * Flush a list of buffers.
   1587 */
   1588static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
   1589{
   1590	struct bio *n;
   1591	int r;
   1592
   1593	while (bio) {
   1594		n = bio->bi_next;
   1595		bio->bi_next = NULL;
   1596		r = do_origin(s->origin, bio, false);
   1597		if (r == DM_MAPIO_REMAPPED)
   1598			submit_bio_noacct(bio);
   1599		bio = n;
   1600	}
   1601}
   1602
   1603/*
   1604 * Error a list of buffers.
   1605 */
   1606static void error_bios(struct bio *bio)
   1607{
   1608	struct bio *n;
   1609
   1610	while (bio) {
   1611		n = bio->bi_next;
   1612		bio->bi_next = NULL;
   1613		bio_io_error(bio);
   1614		bio = n;
   1615	}
   1616}
   1617
   1618static void __invalidate_snapshot(struct dm_snapshot *s, int err)
   1619{
   1620	if (!s->valid)
   1621		return;
   1622
   1623	if (err == -EIO)
   1624		DMERR("Invalidating snapshot: Error reading/writing.");
   1625	else if (err == -ENOMEM)
   1626		DMERR("Invalidating snapshot: Unable to allocate exception.");
   1627
   1628	if (s->store->type->drop_snapshot)
   1629		s->store->type->drop_snapshot(s->store);
   1630
   1631	s->valid = 0;
   1632
   1633	dm_table_event(s->ti->table);
   1634}
   1635
   1636static void invalidate_snapshot(struct dm_snapshot *s, int err)
   1637{
   1638	down_write(&s->lock);
   1639	__invalidate_snapshot(s, err);
   1640	up_write(&s->lock);
   1641}
   1642
   1643static void pending_complete(void *context, int success)
   1644{
   1645	struct dm_snap_pending_exception *pe = context;
   1646	struct dm_exception *e;
   1647	struct dm_snapshot *s = pe->snap;
   1648	struct bio *origin_bios = NULL;
   1649	struct bio *snapshot_bios = NULL;
   1650	struct bio *full_bio = NULL;
   1651	struct dm_exception_table_lock lock;
   1652	int error = 0;
   1653
   1654	dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
   1655
   1656	if (!success) {
   1657		/* Read/write error - snapshot is unusable */
   1658		invalidate_snapshot(s, -EIO);
   1659		error = 1;
   1660
   1661		dm_exception_table_lock(&lock);
   1662		goto out;
   1663	}
   1664
   1665	e = alloc_completed_exception(GFP_NOIO);
   1666	if (!e) {
   1667		invalidate_snapshot(s, -ENOMEM);
   1668		error = 1;
   1669
   1670		dm_exception_table_lock(&lock);
   1671		goto out;
   1672	}
   1673	*e = pe->e;
   1674
   1675	down_read(&s->lock);
   1676	dm_exception_table_lock(&lock);
   1677	if (!s->valid) {
   1678		up_read(&s->lock);
   1679		free_completed_exception(e);
   1680		error = 1;
   1681
   1682		goto out;
   1683	}
   1684
   1685	/*
   1686	 * Add a proper exception. After inserting the completed exception all
   1687	 * subsequent snapshot reads to this chunk will be redirected to the
   1688	 * COW device.  This ensures that we do not starve. Moreover, as long
   1689	 * as the pending exception exists, neither origin writes nor snapshot
   1690	 * merging can overwrite the chunk in origin.
   1691	 */
   1692	dm_insert_exception(&s->complete, e);
   1693	up_read(&s->lock);
   1694
   1695	/* Wait for conflicting reads to drain */
   1696	if (__chunk_is_tracked(s, pe->e.old_chunk)) {
   1697		dm_exception_table_unlock(&lock);
   1698		__check_for_conflicting_io(s, pe->e.old_chunk);
   1699		dm_exception_table_lock(&lock);
   1700	}
   1701
   1702out:
   1703	/* Remove the in-flight exception from the list */
   1704	dm_remove_exception(&pe->e);
   1705
   1706	dm_exception_table_unlock(&lock);
   1707
   1708	snapshot_bios = bio_list_get(&pe->snapshot_bios);
   1709	origin_bios = bio_list_get(&pe->origin_bios);
   1710	full_bio = pe->full_bio;
   1711	if (full_bio)
   1712		full_bio->bi_end_io = pe->full_bio_end_io;
   1713	increment_pending_exceptions_done_count();
   1714
   1715	/* Submit any pending write bios */
   1716	if (error) {
   1717		if (full_bio)
   1718			bio_io_error(full_bio);
   1719		error_bios(snapshot_bios);
   1720	} else {
   1721		if (full_bio)
   1722			bio_endio(full_bio);
   1723		flush_bios(snapshot_bios);
   1724	}
   1725
   1726	retry_origin_bios(s, origin_bios);
   1727
   1728	free_pending_exception(pe);
   1729}
   1730
   1731static void complete_exception(struct dm_snap_pending_exception *pe)
   1732{
   1733	struct dm_snapshot *s = pe->snap;
   1734
   1735	/* Update the metadata if we are persistent */
   1736	s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error,
   1737					 pending_complete, pe);
   1738}
   1739
   1740/*
   1741 * Called when the copy I/O has finished.  kcopyd actually runs
   1742 * this code so don't block.
   1743 */
   1744static void copy_callback(int read_err, unsigned long write_err, void *context)
   1745{
   1746	struct dm_snap_pending_exception *pe = context;
   1747	struct dm_snapshot *s = pe->snap;
   1748
   1749	pe->copy_error = read_err || write_err;
   1750
   1751	if (pe->exception_sequence == s->exception_complete_sequence) {
   1752		struct rb_node *next;
   1753
   1754		s->exception_complete_sequence++;
   1755		complete_exception(pe);
   1756
   1757		next = rb_first(&s->out_of_order_tree);
   1758		while (next) {
   1759			pe = rb_entry(next, struct dm_snap_pending_exception,
   1760					out_of_order_node);
   1761			if (pe->exception_sequence != s->exception_complete_sequence)
   1762				break;
   1763			next = rb_next(next);
   1764			s->exception_complete_sequence++;
   1765			rb_erase(&pe->out_of_order_node, &s->out_of_order_tree);
   1766			complete_exception(pe);
   1767			cond_resched();
   1768		}
   1769	} else {
   1770		struct rb_node *parent = NULL;
   1771		struct rb_node **p = &s->out_of_order_tree.rb_node;
   1772		struct dm_snap_pending_exception *pe2;
   1773
   1774		while (*p) {
   1775			pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node);
   1776			parent = *p;
   1777
   1778			BUG_ON(pe->exception_sequence == pe2->exception_sequence);
   1779			if (pe->exception_sequence < pe2->exception_sequence)
   1780				p = &((*p)->rb_left);
   1781			else
   1782				p = &((*p)->rb_right);
   1783		}
   1784
   1785		rb_link_node(&pe->out_of_order_node, parent, p);
   1786		rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
   1787	}
   1788	account_end_copy(s);
   1789}
   1790
   1791/*
   1792 * Dispatches the copy operation to kcopyd.
   1793 */
   1794static void start_copy(struct dm_snap_pending_exception *pe)
   1795{
   1796	struct dm_snapshot *s = pe->snap;
   1797	struct dm_io_region src, dest;
   1798	struct block_device *bdev = s->origin->bdev;
   1799	sector_t dev_size;
   1800
   1801	dev_size = get_dev_size(bdev);
   1802
   1803	src.bdev = bdev;
   1804	src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
   1805	src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
   1806
   1807	dest.bdev = s->cow->bdev;
   1808	dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
   1809	dest.count = src.count;
   1810
   1811	/* Hand over to kcopyd */
   1812	account_start_copy(s);
   1813	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
   1814}
   1815
   1816static void full_bio_end_io(struct bio *bio)
   1817{
   1818	void *callback_data = bio->bi_private;
   1819
   1820	dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
   1821}
   1822
   1823static void start_full_bio(struct dm_snap_pending_exception *pe,
   1824			   struct bio *bio)
   1825{
   1826	struct dm_snapshot *s = pe->snap;
   1827	void *callback_data;
   1828
   1829	pe->full_bio = bio;
   1830	pe->full_bio_end_io = bio->bi_end_io;
   1831
   1832	account_start_copy(s);
   1833	callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
   1834						   copy_callback, pe);
   1835
   1836	bio->bi_end_io = full_bio_end_io;
   1837	bio->bi_private = callback_data;
   1838
   1839	submit_bio_noacct(bio);
   1840}
   1841
   1842static struct dm_snap_pending_exception *
   1843__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
   1844{
   1845	struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
   1846
   1847	if (!e)
   1848		return NULL;
   1849
   1850	return container_of(e, struct dm_snap_pending_exception, e);
   1851}
   1852
   1853/*
   1854 * Inserts a pending exception into the pending table.
   1855 *
   1856 * NOTE: a write lock must be held on the chunk's pending exception table slot
   1857 * before calling this.
   1858 */
   1859static struct dm_snap_pending_exception *
   1860__insert_pending_exception(struct dm_snapshot *s,
   1861			   struct dm_snap_pending_exception *pe, chunk_t chunk)
   1862{
   1863	pe->e.old_chunk = chunk;
   1864	bio_list_init(&pe->origin_bios);
   1865	bio_list_init(&pe->snapshot_bios);
   1866	pe->started = 0;
   1867	pe->full_bio = NULL;
   1868
   1869	spin_lock(&s->pe_allocation_lock);
   1870	if (s->store->type->prepare_exception(s->store, &pe->e)) {
   1871		spin_unlock(&s->pe_allocation_lock);
   1872		free_pending_exception(pe);
   1873		return NULL;
   1874	}
   1875
   1876	pe->exception_sequence = s->exception_start_sequence++;
   1877	spin_unlock(&s->pe_allocation_lock);
   1878
   1879	dm_insert_exception(&s->pending, &pe->e);
   1880
   1881	return pe;
   1882}
   1883
   1884/*
   1885 * Looks to see if this snapshot already has a pending exception
   1886 * for this chunk, otherwise it allocates a new one and inserts
   1887 * it into the pending table.
   1888 *
   1889 * NOTE: a write lock must be held on the chunk's pending exception table slot
   1890 * before calling this.
   1891 */
   1892static struct dm_snap_pending_exception *
   1893__find_pending_exception(struct dm_snapshot *s,
   1894			 struct dm_snap_pending_exception *pe, chunk_t chunk)
   1895{
   1896	struct dm_snap_pending_exception *pe2;
   1897
   1898	pe2 = __lookup_pending_exception(s, chunk);
   1899	if (pe2) {
   1900		free_pending_exception(pe);
   1901		return pe2;
   1902	}
   1903
   1904	return __insert_pending_exception(s, pe, chunk);
   1905}
   1906
   1907static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
   1908			    struct bio *bio, chunk_t chunk)
   1909{
   1910	bio_set_dev(bio, s->cow->bdev);
   1911	bio->bi_iter.bi_sector =
   1912		chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
   1913				(chunk - e->old_chunk)) +
   1914		(bio->bi_iter.bi_sector & s->store->chunk_mask);
   1915}
   1916
   1917static void zero_callback(int read_err, unsigned long write_err, void *context)
   1918{
   1919	struct bio *bio = context;
   1920	struct dm_snapshot *s = bio->bi_private;
   1921
   1922	account_end_copy(s);
   1923	bio->bi_status = write_err ? BLK_STS_IOERR : 0;
   1924	bio_endio(bio);
   1925}
   1926
   1927static void zero_exception(struct dm_snapshot *s, struct dm_exception *e,
   1928			   struct bio *bio, chunk_t chunk)
   1929{
   1930	struct dm_io_region dest;
   1931
   1932	dest.bdev = s->cow->bdev;
   1933	dest.sector = bio->bi_iter.bi_sector;
   1934	dest.count = s->store->chunk_size;
   1935
   1936	account_start_copy(s);
   1937	WARN_ON_ONCE(bio->bi_private);
   1938	bio->bi_private = s;
   1939	dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio);
   1940}
   1941
   1942static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio)
   1943{
   1944	return bio->bi_iter.bi_size ==
   1945		(s->store->chunk_size << SECTOR_SHIFT);
   1946}
   1947
   1948static int snapshot_map(struct dm_target *ti, struct bio *bio)
   1949{
   1950	struct dm_exception *e;
   1951	struct dm_snapshot *s = ti->private;
   1952	int r = DM_MAPIO_REMAPPED;
   1953	chunk_t chunk;
   1954	struct dm_snap_pending_exception *pe = NULL;
   1955	struct dm_exception_table_lock lock;
   1956
   1957	init_tracked_chunk(bio);
   1958
   1959	if (bio->bi_opf & REQ_PREFLUSH) {
   1960		bio_set_dev(bio, s->cow->bdev);
   1961		return DM_MAPIO_REMAPPED;
   1962	}
   1963
   1964	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
   1965	dm_exception_table_lock_init(s, chunk, &lock);
   1966
   1967	/* Full snapshots are not usable */
   1968	/* To get here the table must be live so s->active is always set. */
   1969	if (!s->valid)
   1970		return DM_MAPIO_KILL;
   1971
   1972	if (bio_data_dir(bio) == WRITE) {
   1973		while (unlikely(!wait_for_in_progress(s, false)))
   1974			; /* wait_for_in_progress() has slept */
   1975	}
   1976
   1977	down_read(&s->lock);
   1978	dm_exception_table_lock(&lock);
   1979
   1980	if (!s->valid || (unlikely(s->snapshot_overflowed) &&
   1981	    bio_data_dir(bio) == WRITE)) {
   1982		r = DM_MAPIO_KILL;
   1983		goto out_unlock;
   1984	}
   1985
   1986	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
   1987		if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) {
   1988			/*
   1989			 * passdown discard to origin (without triggering
   1990			 * snapshot exceptions via do_origin; doing so would
   1991			 * defeat the goal of freeing space in origin that is
   1992			 * implied by the "discard_passdown_origin" feature)
   1993			 */
   1994			bio_set_dev(bio, s->origin->bdev);
   1995			track_chunk(s, bio, chunk);
   1996			goto out_unlock;
   1997		}
   1998		/* discard to snapshot (target_bio_nr == 0) zeroes exceptions */
   1999	}
   2000
   2001	/* If the block is already remapped - use that, else remap it */
   2002	e = dm_lookup_exception(&s->complete, chunk);
   2003	if (e) {
   2004		remap_exception(s, e, bio, chunk);
   2005		if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
   2006		    io_overlaps_chunk(s, bio)) {
   2007			dm_exception_table_unlock(&lock);
   2008			up_read(&s->lock);
   2009			zero_exception(s, e, bio, chunk);
   2010			r = DM_MAPIO_SUBMITTED; /* discard is not issued */
   2011			goto out;
   2012		}
   2013		goto out_unlock;
   2014	}
   2015
   2016	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
   2017		/*
   2018		 * If no exception exists, complete discard immediately
   2019		 * otherwise it'll trigger copy-out.
   2020		 */
   2021		bio_endio(bio);
   2022		r = DM_MAPIO_SUBMITTED;
   2023		goto out_unlock;
   2024	}
   2025
   2026	/*
   2027	 * Write to snapshot - higher level takes care of RW/RO
   2028	 * flags so we should only get this if we are
   2029	 * writeable.
   2030	 */
   2031	if (bio_data_dir(bio) == WRITE) {
   2032		pe = __lookup_pending_exception(s, chunk);
   2033		if (!pe) {
   2034			dm_exception_table_unlock(&lock);
   2035			pe = alloc_pending_exception(s);
   2036			dm_exception_table_lock(&lock);
   2037
   2038			e = dm_lookup_exception(&s->complete, chunk);
   2039			if (e) {
   2040				free_pending_exception(pe);
   2041				remap_exception(s, e, bio, chunk);
   2042				goto out_unlock;
   2043			}
   2044
   2045			pe = __find_pending_exception(s, pe, chunk);
   2046			if (!pe) {
   2047				dm_exception_table_unlock(&lock);
   2048				up_read(&s->lock);
   2049
   2050				down_write(&s->lock);
   2051
   2052				if (s->store->userspace_supports_overflow) {
   2053					if (s->valid && !s->snapshot_overflowed) {
   2054						s->snapshot_overflowed = 1;
   2055						DMERR("Snapshot overflowed: Unable to allocate exception.");
   2056					}
   2057				} else
   2058					__invalidate_snapshot(s, -ENOMEM);
   2059				up_write(&s->lock);
   2060
   2061				r = DM_MAPIO_KILL;
   2062				goto out;
   2063			}
   2064		}
   2065
   2066		remap_exception(s, &pe->e, bio, chunk);
   2067
   2068		r = DM_MAPIO_SUBMITTED;
   2069
   2070		if (!pe->started && io_overlaps_chunk(s, bio)) {
   2071			pe->started = 1;
   2072
   2073			dm_exception_table_unlock(&lock);
   2074			up_read(&s->lock);
   2075
   2076			start_full_bio(pe, bio);
   2077			goto out;
   2078		}
   2079
   2080		bio_list_add(&pe->snapshot_bios, bio);
   2081
   2082		if (!pe->started) {
   2083			/* this is protected by the exception table lock */
   2084			pe->started = 1;
   2085
   2086			dm_exception_table_unlock(&lock);
   2087			up_read(&s->lock);
   2088
   2089			start_copy(pe);
   2090			goto out;
   2091		}
   2092	} else {
   2093		bio_set_dev(bio, s->origin->bdev);
   2094		track_chunk(s, bio, chunk);
   2095	}
   2096
   2097out_unlock:
   2098	dm_exception_table_unlock(&lock);
   2099	up_read(&s->lock);
   2100out:
   2101	return r;
   2102}
   2103
   2104/*
   2105 * A snapshot-merge target behaves like a combination of a snapshot
   2106 * target and a snapshot-origin target.  It only generates new
   2107 * exceptions in other snapshots and not in the one that is being
   2108 * merged.
   2109 *
   2110 * For each chunk, if there is an existing exception, it is used to
   2111 * redirect I/O to the cow device.  Otherwise I/O is sent to the origin,
   2112 * which in turn might generate exceptions in other snapshots.
   2113 * If merging is currently taking place on the chunk in question, the
   2114 * I/O is deferred by adding it to s->bios_queued_during_merge.
   2115 */
   2116static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
   2117{
   2118	struct dm_exception *e;
   2119	struct dm_snapshot *s = ti->private;
   2120	int r = DM_MAPIO_REMAPPED;
   2121	chunk_t chunk;
   2122
   2123	init_tracked_chunk(bio);
   2124
   2125	if (bio->bi_opf & REQ_PREFLUSH) {
   2126		if (!dm_bio_get_target_bio_nr(bio))
   2127			bio_set_dev(bio, s->origin->bdev);
   2128		else
   2129			bio_set_dev(bio, s->cow->bdev);
   2130		return DM_MAPIO_REMAPPED;
   2131	}
   2132
   2133	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
   2134		/* Once merging, discards no longer effect change */
   2135		bio_endio(bio);
   2136		return DM_MAPIO_SUBMITTED;
   2137	}
   2138
   2139	chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
   2140
   2141	down_write(&s->lock);
   2142
   2143	/* Full merging snapshots are redirected to the origin */
   2144	if (!s->valid)
   2145		goto redirect_to_origin;
   2146
   2147	/* If the block is already remapped - use that */
   2148	e = dm_lookup_exception(&s->complete, chunk);
   2149	if (e) {
   2150		/* Queue writes overlapping with chunks being merged */
   2151		if (bio_data_dir(bio) == WRITE &&
   2152		    chunk >= s->first_merging_chunk &&
   2153		    chunk < (s->first_merging_chunk +
   2154			     s->num_merging_chunks)) {
   2155			bio_set_dev(bio, s->origin->bdev);
   2156			bio_list_add(&s->bios_queued_during_merge, bio);
   2157			r = DM_MAPIO_SUBMITTED;
   2158			goto out_unlock;
   2159		}
   2160
   2161		remap_exception(s, e, bio, chunk);
   2162
   2163		if (bio_data_dir(bio) == WRITE)
   2164			track_chunk(s, bio, chunk);
   2165		goto out_unlock;
   2166	}
   2167
   2168redirect_to_origin:
   2169	bio_set_dev(bio, s->origin->bdev);
   2170
   2171	if (bio_data_dir(bio) == WRITE) {
   2172		up_write(&s->lock);
   2173		return do_origin(s->origin, bio, false);
   2174	}
   2175
   2176out_unlock:
   2177	up_write(&s->lock);
   2178
   2179	return r;
   2180}
   2181
   2182static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
   2183		blk_status_t *error)
   2184{
   2185	struct dm_snapshot *s = ti->private;
   2186
   2187	if (is_bio_tracked(bio))
   2188		stop_tracking_chunk(s, bio);
   2189
   2190	return DM_ENDIO_DONE;
   2191}
   2192
   2193static void snapshot_merge_presuspend(struct dm_target *ti)
   2194{
   2195	struct dm_snapshot *s = ti->private;
   2196
   2197	stop_merge(s);
   2198}
   2199
   2200static int snapshot_preresume(struct dm_target *ti)
   2201{
   2202	int r = 0;
   2203	struct dm_snapshot *s = ti->private;
   2204	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
   2205
   2206	down_read(&_origins_lock);
   2207	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
   2208	if (snap_src && snap_dest) {
   2209		down_read(&snap_src->lock);
   2210		if (s == snap_src) {
   2211			DMERR("Unable to resume snapshot source until "
   2212			      "handover completes.");
   2213			r = -EINVAL;
   2214		} else if (!dm_suspended(snap_src->ti)) {
   2215			DMERR("Unable to perform snapshot handover until "
   2216			      "source is suspended.");
   2217			r = -EINVAL;
   2218		}
   2219		up_read(&snap_src->lock);
   2220	}
   2221	up_read(&_origins_lock);
   2222
   2223	return r;
   2224}
   2225
   2226static void snapshot_resume(struct dm_target *ti)
   2227{
   2228	struct dm_snapshot *s = ti->private;
   2229	struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL;
   2230	struct dm_origin *o;
   2231	struct mapped_device *origin_md = NULL;
   2232	bool must_restart_merging = false;
   2233
   2234	down_read(&_origins_lock);
   2235
   2236	o = __lookup_dm_origin(s->origin->bdev);
   2237	if (o)
   2238		origin_md = dm_table_get_md(o->ti->table);
   2239	if (!origin_md) {
   2240		(void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
   2241		if (snap_merging)
   2242			origin_md = dm_table_get_md(snap_merging->ti->table);
   2243	}
   2244	if (origin_md == dm_table_get_md(ti->table))
   2245		origin_md = NULL;
   2246	if (origin_md) {
   2247		if (dm_hold(origin_md))
   2248			origin_md = NULL;
   2249	}
   2250
   2251	up_read(&_origins_lock);
   2252
   2253	if (origin_md) {
   2254		dm_internal_suspend_fast(origin_md);
   2255		if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
   2256			must_restart_merging = true;
   2257			stop_merge(snap_merging);
   2258		}
   2259	}
   2260
   2261	down_read(&_origins_lock);
   2262
   2263	(void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
   2264	if (snap_src && snap_dest) {
   2265		down_write(&snap_src->lock);
   2266		down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
   2267		__handover_exceptions(snap_src, snap_dest);
   2268		up_write(&snap_dest->lock);
   2269		up_write(&snap_src->lock);
   2270	}
   2271
   2272	up_read(&_origins_lock);
   2273
   2274	if (origin_md) {
   2275		if (must_restart_merging)
   2276			start_merge(snap_merging);
   2277		dm_internal_resume_fast(origin_md);
   2278		dm_put(origin_md);
   2279	}
   2280
   2281	/* Now we have correct chunk size, reregister */
   2282	reregister_snapshot(s);
   2283
   2284	down_write(&s->lock);
   2285	s->active = 1;
   2286	up_write(&s->lock);
   2287}
   2288
   2289static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
   2290{
   2291	uint32_t min_chunksize;
   2292
   2293	down_read(&_origins_lock);
   2294	min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
   2295	up_read(&_origins_lock);
   2296
   2297	return min_chunksize;
   2298}
   2299
   2300static void snapshot_merge_resume(struct dm_target *ti)
   2301{
   2302	struct dm_snapshot *s = ti->private;
   2303
   2304	/*
   2305	 * Handover exceptions from existing snapshot.
   2306	 */
   2307	snapshot_resume(ti);
   2308
   2309	/*
   2310	 * snapshot-merge acts as an origin, so set ti->max_io_len
   2311	 */
   2312	ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
   2313
   2314	start_merge(s);
   2315}
   2316
   2317static void snapshot_status(struct dm_target *ti, status_type_t type,
   2318			    unsigned status_flags, char *result, unsigned maxlen)
   2319{
   2320	unsigned sz = 0;
   2321	struct dm_snapshot *snap = ti->private;
   2322	unsigned num_features;
   2323
   2324	switch (type) {
   2325	case STATUSTYPE_INFO:
   2326
   2327		down_write(&snap->lock);
   2328
   2329		if (!snap->valid)
   2330			DMEMIT("Invalid");
   2331		else if (snap->merge_failed)
   2332			DMEMIT("Merge failed");
   2333		else if (snap->snapshot_overflowed)
   2334			DMEMIT("Overflow");
   2335		else {
   2336			if (snap->store->type->usage) {
   2337				sector_t total_sectors, sectors_allocated,
   2338					 metadata_sectors;
   2339				snap->store->type->usage(snap->store,
   2340							 &total_sectors,
   2341							 &sectors_allocated,
   2342							 &metadata_sectors);
   2343				DMEMIT("%llu/%llu %llu",
   2344				       (unsigned long long)sectors_allocated,
   2345				       (unsigned long long)total_sectors,
   2346				       (unsigned long long)metadata_sectors);
   2347			}
   2348			else
   2349				DMEMIT("Unknown");
   2350		}
   2351
   2352		up_write(&snap->lock);
   2353
   2354		break;
   2355
   2356	case STATUSTYPE_TABLE:
   2357		/*
   2358		 * kdevname returns a static pointer so we need
   2359		 * to make private copies if the output is to
   2360		 * make sense.
   2361		 */
   2362		DMEMIT("%s %s", snap->origin->name, snap->cow->name);
   2363		sz += snap->store->type->status(snap->store, type, result + sz,
   2364						maxlen - sz);
   2365		num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin;
   2366		if (num_features) {
   2367			DMEMIT(" %u", num_features);
   2368			if (snap->discard_zeroes_cow)
   2369				DMEMIT(" discard_zeroes_cow");
   2370			if (snap->discard_passdown_origin)
   2371				DMEMIT(" discard_passdown_origin");
   2372		}
   2373		break;
   2374
   2375	case STATUSTYPE_IMA:
   2376		DMEMIT_TARGET_NAME_VERSION(ti->type);
   2377		DMEMIT(",snap_origin_name=%s", snap->origin->name);
   2378		DMEMIT(",snap_cow_name=%s", snap->cow->name);
   2379		DMEMIT(",snap_valid=%c", snap->valid ? 'y' : 'n');
   2380		DMEMIT(",snap_merge_failed=%c", snap->merge_failed ? 'y' : 'n');
   2381		DMEMIT(",snapshot_overflowed=%c", snap->snapshot_overflowed ? 'y' : 'n');
   2382		DMEMIT(";");
   2383		break;
   2384	}
   2385}
   2386
   2387static int snapshot_iterate_devices(struct dm_target *ti,
   2388				    iterate_devices_callout_fn fn, void *data)
   2389{
   2390	struct dm_snapshot *snap = ti->private;
   2391	int r;
   2392
   2393	r = fn(ti, snap->origin, 0, ti->len, data);
   2394
   2395	if (!r)
   2396		r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data);
   2397
   2398	return r;
   2399}
   2400
   2401static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
   2402{
   2403	struct dm_snapshot *snap = ti->private;
   2404
   2405	if (snap->discard_zeroes_cow) {
   2406		struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
   2407
   2408		down_read(&_origins_lock);
   2409
   2410		(void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
   2411		if (snap_src && snap_dest)
   2412			snap = snap_src;
   2413
   2414		/* All discards are split on chunk_size boundary */
   2415		limits->discard_granularity = snap->store->chunk_size;
   2416		limits->max_discard_sectors = snap->store->chunk_size;
   2417
   2418		up_read(&_origins_lock);
   2419	}
   2420}
   2421
   2422/*-----------------------------------------------------------------
   2423 * Origin methods
   2424 *---------------------------------------------------------------*/
   2425
   2426/*
   2427 * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
   2428 * supplied bio was ignored.  The caller may submit it immediately.
   2429 * (No remapping actually occurs as the origin is always a direct linear
   2430 * map.)
   2431 *
   2432 * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
   2433 * and any supplied bio is added to a list to be submitted once all
   2434 * the necessary exceptions exist.
   2435 */
   2436static int __origin_write(struct list_head *snapshots, sector_t sector,
   2437			  struct bio *bio)
   2438{
   2439	int r = DM_MAPIO_REMAPPED;
   2440	struct dm_snapshot *snap;
   2441	struct dm_exception *e;
   2442	struct dm_snap_pending_exception *pe, *pe2;
   2443	struct dm_snap_pending_exception *pe_to_start_now = NULL;
   2444	struct dm_snap_pending_exception *pe_to_start_last = NULL;
   2445	struct dm_exception_table_lock lock;
   2446	chunk_t chunk;
   2447
   2448	/* Do all the snapshots on this origin */
   2449	list_for_each_entry (snap, snapshots, list) {
   2450		/*
   2451		 * Don't make new exceptions in a merging snapshot
   2452		 * because it has effectively been deleted
   2453		 */
   2454		if (dm_target_is_snapshot_merge(snap->ti))
   2455			continue;
   2456
   2457		/* Nothing to do if writing beyond end of snapshot */
   2458		if (sector >= dm_table_get_size(snap->ti->table))
   2459			continue;
   2460
   2461		/*
   2462		 * Remember, different snapshots can have
   2463		 * different chunk sizes.
   2464		 */
   2465		chunk = sector_to_chunk(snap->store, sector);
   2466		dm_exception_table_lock_init(snap, chunk, &lock);
   2467
   2468		down_read(&snap->lock);
   2469		dm_exception_table_lock(&lock);
   2470
   2471		/* Only deal with valid and active snapshots */
   2472		if (!snap->valid || !snap->active)
   2473			goto next_snapshot;
   2474
   2475		pe = __lookup_pending_exception(snap, chunk);
   2476		if (!pe) {
   2477			/*
   2478			 * Check exception table to see if block is already
   2479			 * remapped in this snapshot and trigger an exception
   2480			 * if not.
   2481			 */
   2482			e = dm_lookup_exception(&snap->complete, chunk);
   2483			if (e)
   2484				goto next_snapshot;
   2485
   2486			dm_exception_table_unlock(&lock);
   2487			pe = alloc_pending_exception(snap);
   2488			dm_exception_table_lock(&lock);
   2489
   2490			pe2 = __lookup_pending_exception(snap, chunk);
   2491
   2492			if (!pe2) {
   2493				e = dm_lookup_exception(&snap->complete, chunk);
   2494				if (e) {
   2495					free_pending_exception(pe);
   2496					goto next_snapshot;
   2497				}
   2498
   2499				pe = __insert_pending_exception(snap, pe, chunk);
   2500				if (!pe) {
   2501					dm_exception_table_unlock(&lock);
   2502					up_read(&snap->lock);
   2503
   2504					invalidate_snapshot(snap, -ENOMEM);
   2505					continue;
   2506				}
   2507			} else {
   2508				free_pending_exception(pe);
   2509				pe = pe2;
   2510			}
   2511		}
   2512
   2513		r = DM_MAPIO_SUBMITTED;
   2514
   2515		/*
   2516		 * If an origin bio was supplied, queue it to wait for the
   2517		 * completion of this exception, and start this one last,
   2518		 * at the end of the function.
   2519		 */
   2520		if (bio) {
   2521			bio_list_add(&pe->origin_bios, bio);
   2522			bio = NULL;
   2523
   2524			if (!pe->started) {
   2525				pe->started = 1;
   2526				pe_to_start_last = pe;
   2527			}
   2528		}
   2529
   2530		if (!pe->started) {
   2531			pe->started = 1;
   2532			pe_to_start_now = pe;
   2533		}
   2534
   2535next_snapshot:
   2536		dm_exception_table_unlock(&lock);
   2537		up_read(&snap->lock);
   2538
   2539		if (pe_to_start_now) {
   2540			start_copy(pe_to_start_now);
   2541			pe_to_start_now = NULL;
   2542		}
   2543	}
   2544
   2545	/*
   2546	 * Submit the exception against which the bio is queued last,
   2547	 * to give the other exceptions a head start.
   2548	 */
   2549	if (pe_to_start_last)
   2550		start_copy(pe_to_start_last);
   2551
   2552	return r;
   2553}
   2554
   2555/*
   2556 * Called on a write from the origin driver.
   2557 */
   2558static int do_origin(struct dm_dev *origin, struct bio *bio, bool limit)
   2559{
   2560	struct origin *o;
   2561	int r = DM_MAPIO_REMAPPED;
   2562
   2563again:
   2564	down_read(&_origins_lock);
   2565	o = __lookup_origin(origin->bdev);
   2566	if (o) {
   2567		if (limit) {
   2568			struct dm_snapshot *s;
   2569			list_for_each_entry(s, &o->snapshots, list)
   2570				if (unlikely(!wait_for_in_progress(s, true)))
   2571					goto again;
   2572		}
   2573
   2574		r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
   2575	}
   2576	up_read(&_origins_lock);
   2577
   2578	return r;
   2579}
   2580
   2581/*
   2582 * Trigger exceptions in all non-merging snapshots.
   2583 *
   2584 * The chunk size of the merging snapshot may be larger than the chunk
   2585 * size of some other snapshot so we may need to reallocate multiple
   2586 * chunks in other snapshots.
   2587 *
   2588 * We scan all the overlapping exceptions in the other snapshots.
   2589 * Returns 1 if anything was reallocated and must be waited for,
   2590 * otherwise returns 0.
   2591 *
   2592 * size must be a multiple of merging_snap's chunk_size.
   2593 */
   2594static int origin_write_extent(struct dm_snapshot *merging_snap,
   2595			       sector_t sector, unsigned size)
   2596{
   2597	int must_wait = 0;
   2598	sector_t n;
   2599	struct origin *o;
   2600
   2601	/*
   2602	 * The origin's __minimum_chunk_size() got stored in max_io_len
   2603	 * by snapshot_merge_resume().
   2604	 */
   2605	down_read(&_origins_lock);
   2606	o = __lookup_origin(merging_snap->origin->bdev);
   2607	for (n = 0; n < size; n += merging_snap->ti->max_io_len)
   2608		if (__origin_write(&o->snapshots, sector + n, NULL) ==
   2609		    DM_MAPIO_SUBMITTED)
   2610			must_wait = 1;
   2611	up_read(&_origins_lock);
   2612
   2613	return must_wait;
   2614}
   2615
   2616/*
   2617 * Origin: maps a linear range of a device, with hooks for snapshotting.
   2618 */
   2619
   2620/*
   2621 * Construct an origin mapping: <dev_path>
   2622 * The context for an origin is merely a 'struct dm_dev *'
   2623 * pointing to the real device.
   2624 */
   2625static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
   2626{
   2627	int r;
   2628	struct dm_origin *o;
   2629
   2630	if (argc != 1) {
   2631		ti->error = "origin: incorrect number of arguments";
   2632		return -EINVAL;
   2633	}
   2634
   2635	o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL);
   2636	if (!o) {
   2637		ti->error = "Cannot allocate private origin structure";
   2638		r = -ENOMEM;
   2639		goto bad_alloc;
   2640	}
   2641
   2642	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
   2643	if (r) {
   2644		ti->error = "Cannot get target device";
   2645		goto bad_open;
   2646	}
   2647
   2648	o->ti = ti;
   2649	ti->private = o;
   2650	ti->num_flush_bios = 1;
   2651
   2652	return 0;
   2653
   2654bad_open:
   2655	kfree(o);
   2656bad_alloc:
   2657	return r;
   2658}
   2659
   2660static void origin_dtr(struct dm_target *ti)
   2661{
   2662	struct dm_origin *o = ti->private;
   2663
   2664	dm_put_device(ti, o->dev);
   2665	kfree(o);
   2666}
   2667
   2668static int origin_map(struct dm_target *ti, struct bio *bio)
   2669{
   2670	struct dm_origin *o = ti->private;
   2671	unsigned available_sectors;
   2672
   2673	bio_set_dev(bio, o->dev->bdev);
   2674
   2675	if (unlikely(bio->bi_opf & REQ_PREFLUSH))
   2676		return DM_MAPIO_REMAPPED;
   2677
   2678	if (bio_data_dir(bio) != WRITE)
   2679		return DM_MAPIO_REMAPPED;
   2680
   2681	available_sectors = o->split_boundary -
   2682		((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1));
   2683
   2684	if (bio_sectors(bio) > available_sectors)
   2685		dm_accept_partial_bio(bio, available_sectors);
   2686
   2687	/* Only tell snapshots if this is a write */
   2688	return do_origin(o->dev, bio, true);
   2689}
   2690
   2691/*
   2692 * Set the target "max_io_len" field to the minimum of all the snapshots'
   2693 * chunk sizes.
   2694 */
   2695static void origin_resume(struct dm_target *ti)
   2696{
   2697	struct dm_origin *o = ti->private;
   2698
   2699	o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
   2700
   2701	down_write(&_origins_lock);
   2702	__insert_dm_origin(o);
   2703	up_write(&_origins_lock);
   2704}
   2705
   2706static void origin_postsuspend(struct dm_target *ti)
   2707{
   2708	struct dm_origin *o = ti->private;
   2709
   2710	down_write(&_origins_lock);
   2711	__remove_dm_origin(o);
   2712	up_write(&_origins_lock);
   2713}
   2714
   2715static void origin_status(struct dm_target *ti, status_type_t type,
   2716			  unsigned status_flags, char *result, unsigned maxlen)
   2717{
   2718	struct dm_origin *o = ti->private;
   2719
   2720	switch (type) {
   2721	case STATUSTYPE_INFO:
   2722		result[0] = '\0';
   2723		break;
   2724
   2725	case STATUSTYPE_TABLE:
   2726		snprintf(result, maxlen, "%s", o->dev->name);
   2727		break;
   2728	case STATUSTYPE_IMA:
   2729		result[0] = '\0';
   2730		break;
   2731	}
   2732}
   2733
   2734static int origin_iterate_devices(struct dm_target *ti,
   2735				  iterate_devices_callout_fn fn, void *data)
   2736{
   2737	struct dm_origin *o = ti->private;
   2738
   2739	return fn(ti, o->dev, 0, ti->len, data);
   2740}
   2741
   2742static struct target_type origin_target = {
   2743	.name    = "snapshot-origin",
   2744	.version = {1, 9, 0},
   2745	.module  = THIS_MODULE,
   2746	.ctr     = origin_ctr,
   2747	.dtr     = origin_dtr,
   2748	.map     = origin_map,
   2749	.resume  = origin_resume,
   2750	.postsuspend = origin_postsuspend,
   2751	.status  = origin_status,
   2752	.iterate_devices = origin_iterate_devices,
   2753};
   2754
   2755static struct target_type snapshot_target = {
   2756	.name    = "snapshot",
   2757	.version = {1, 16, 0},
   2758	.module  = THIS_MODULE,
   2759	.ctr     = snapshot_ctr,
   2760	.dtr     = snapshot_dtr,
   2761	.map     = snapshot_map,
   2762	.end_io  = snapshot_end_io,
   2763	.preresume  = snapshot_preresume,
   2764	.resume  = snapshot_resume,
   2765	.status  = snapshot_status,
   2766	.iterate_devices = snapshot_iterate_devices,
   2767	.io_hints = snapshot_io_hints,
   2768};
   2769
   2770static struct target_type merge_target = {
   2771	.name    = dm_snapshot_merge_target_name,
   2772	.version = {1, 5, 0},
   2773	.module  = THIS_MODULE,
   2774	.ctr     = snapshot_ctr,
   2775	.dtr     = snapshot_dtr,
   2776	.map     = snapshot_merge_map,
   2777	.end_io  = snapshot_end_io,
   2778	.presuspend = snapshot_merge_presuspend,
   2779	.preresume  = snapshot_preresume,
   2780	.resume  = snapshot_merge_resume,
   2781	.status  = snapshot_status,
   2782	.iterate_devices = snapshot_iterate_devices,
   2783	.io_hints = snapshot_io_hints,
   2784};
   2785
   2786static int __init dm_snapshot_init(void)
   2787{
   2788	int r;
   2789
   2790	r = dm_exception_store_init();
   2791	if (r) {
   2792		DMERR("Failed to initialize exception stores");
   2793		return r;
   2794	}
   2795
   2796	r = init_origin_hash();
   2797	if (r) {
   2798		DMERR("init_origin_hash failed.");
   2799		goto bad_origin_hash;
   2800	}
   2801
   2802	exception_cache = KMEM_CACHE(dm_exception, 0);
   2803	if (!exception_cache) {
   2804		DMERR("Couldn't create exception cache.");
   2805		r = -ENOMEM;
   2806		goto bad_exception_cache;
   2807	}
   2808
   2809	pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
   2810	if (!pending_cache) {
   2811		DMERR("Couldn't create pending cache.");
   2812		r = -ENOMEM;
   2813		goto bad_pending_cache;
   2814	}
   2815
   2816	r = dm_register_target(&snapshot_target);
   2817	if (r < 0) {
   2818		DMERR("snapshot target register failed %d", r);
   2819		goto bad_register_snapshot_target;
   2820	}
   2821
   2822	r = dm_register_target(&origin_target);
   2823	if (r < 0) {
   2824		DMERR("Origin target register failed %d", r);
   2825		goto bad_register_origin_target;
   2826	}
   2827
   2828	r = dm_register_target(&merge_target);
   2829	if (r < 0) {
   2830		DMERR("Merge target register failed %d", r);
   2831		goto bad_register_merge_target;
   2832	}
   2833
   2834	return 0;
   2835
   2836bad_register_merge_target:
   2837	dm_unregister_target(&origin_target);
   2838bad_register_origin_target:
   2839	dm_unregister_target(&snapshot_target);
   2840bad_register_snapshot_target:
   2841	kmem_cache_destroy(pending_cache);
   2842bad_pending_cache:
   2843	kmem_cache_destroy(exception_cache);
   2844bad_exception_cache:
   2845	exit_origin_hash();
   2846bad_origin_hash:
   2847	dm_exception_store_exit();
   2848
   2849	return r;
   2850}
   2851
   2852static void __exit dm_snapshot_exit(void)
   2853{
   2854	dm_unregister_target(&snapshot_target);
   2855	dm_unregister_target(&origin_target);
   2856	dm_unregister_target(&merge_target);
   2857
   2858	exit_origin_hash();
   2859	kmem_cache_destroy(pending_cache);
   2860	kmem_cache_destroy(exception_cache);
   2861
   2862	dm_exception_store_exit();
   2863}
   2864
   2865/* Module hooks */
   2866module_init(dm_snapshot_init);
   2867module_exit(dm_snapshot_exit);
   2868
   2869MODULE_DESCRIPTION(DM_NAME " snapshot target");
   2870MODULE_AUTHOR("Joe Thornber");
   2871MODULE_LICENSE("GPL");
   2872MODULE_ALIAS("dm-snapshot-origin");
   2873MODULE_ALIAS("dm-snapshot-merge");