cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-thin-metadata.c (48176B)


/*
 * Copyright (C) 2011-2012 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "persistent-data/dm-btree.h"
#include "persistent-data/dm-space-map.h"
#include "persistent-data/dm-space-map-disk.h"
#include "persistent-data/dm-transaction-manager.h"

#include <linux/list.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>

/*--------------------------------------------------------------------------
 * As far as the metadata goes, there is:
 *
 * - A superblock in block zero, taking up fewer than 512 bytes for
 *   atomic writes.
 *
 * - A space map managing the metadata blocks.
 *
 * - A space map managing the data blocks.
 *
 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
 *
 * - A hierarchical btree, with 2 levels, which effectively maps (thin
 *   dev id, virtual block) -> block_time.  Block time is a 64-bit
 *   field holding the time in the low 24 bits, and the block in the top 40
 *   bits.
 *
 * BTrees consist solely of btree_nodes, each of which fills a block.  Some
 * are internal nodes, so their values are __le64s pointing to other
 * nodes.  Leaf nodes can store data of any reasonable size (i.e. much
 * smaller than the block size).  The nodes consist of the header,
 * followed by an array of keys, followed by an array of values.  We have
 * to binary search on the keys, so they're all held together to help the
 * cpu cache.
 *
 * Space maps have 2 btrees:
 *
 * - One maps a uint64_t onto a struct index_entry, which points to a
 *   bitmap block and records details such as how many free entries there
 *   are.
 *
 * - The bitmap blocks have a header (for the checksum); the rest of the
 *   block is pairs of bits, with the following meaning:
 *
 *   0 - ref count is 0
 *   1 - ref count is 1
 *   2 - ref count is 2
 *   3 - ref count is higher than 2
 *
 * - If the count is higher than 2 then the ref count is entered in a
 *   second btree that directly maps the block_address to a uint32_t ref
 *   count.
 *
 * The space map metadata variant doesn't have a bitmaps btree.  Instead
 * it has a single block's worth of index_entries.  This avoids
 * recursive issues with the bitmap btree needing to allocate space in
 * order to insert.  With a small data block size such as 64k the
 * metadata can support data devices that are hundreds of terabytes.
 *
 * The space maps allocate space linearly from front to back.  Space that
 * is freed in a transaction is never recycled within that transaction.
 * To try and avoid fragmenting _free_ space the allocator always goes
 * back and fills in gaps.
 *
 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
 * from the block manager.
 *--------------------------------------------------------------------------*/
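
/*
 * An illustrative round-trip (not part of the driver) of the block_time
 * packing described above, using the pack_block_time()/unpack_block_time()
 * helpers defined later in this file.  With the 40-bit block / 24-bit time
 * split:
 *
 *	uint64_t v = pack_block_time(0x123, 7);
 *	// v == (0x123ULL << 24) | 7 == 0x123000007
 *
 *	dm_block_t b;
 *	uint32_t t;
 *	unpack_block_time(v, &b, &t);	// b == 0x123, t == 7
 */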

#define DM_MSG_PREFIX   "thin metadata"

#define THIN_SUPERBLOCK_MAGIC 27022010
#define THIN_SUPERBLOCK_LOCATION 0
#define THIN_VERSION 2
#define SECTOR_TO_BLOCK_SHIFT 3

/*
 * For btree insert:
 *  3 for btree insert +
 *  2 for btree lookup used within space map
 * For btree remove:
 *  2 for shadow spine +
 *  4 for rebalancing 3 child nodes
 * i.e. max(3 + 2, 2 + 4) = 6 concurrent locks.
 */
#define THIN_MAX_CONCURRENT_LOCKS 6

/* This should be plenty */
#define SPACE_MAP_ROOT_SIZE 128

/*
 * Little endian on-disk superblock and device details.
 */
struct thin_disk_superblock {
	__le32 csum;	/* Checksum of superblock except for this field. */
	__le32 flags;
	__le64 blocknr;	/* This block number, dm_block_t. */

	__u8 uuid[16];
	__le64 magic;
	__le32 version;
	__le32 time;

	__le64 trans_id;

	/*
	 * Root held by userspace transactions.
	 */
	__le64 held_root;

	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

	/*
	 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
	 */
	__le64 data_mapping_root;

	/*
	 * Device detail root mapping dev_id -> device_details
	 */
	__le64 device_details_root;

	__le32 data_block_size;		/* In 512-byte sectors. */

	__le32 metadata_block_size;	/* In 512-byte sectors. */
	__le64 metadata_nr_blocks;

	__le32 compat_flags;
	__le32 compat_ro_flags;
	__le32 incompat_flags;
} __packed;

struct disk_device_details {
	__le64 mapped_blocks;
	__le64 transaction_id;		/* When created. */
	__le32 creation_time;
	__le32 snapshotted_time;
} __packed;

struct dm_pool_metadata {
	struct hlist_node hash;

	struct block_device *bdev;
	struct dm_block_manager *bm;
	struct dm_space_map *metadata_sm;
	struct dm_space_map *data_sm;
	struct dm_transaction_manager *tm;
	struct dm_transaction_manager *nb_tm;

	/*
	 * Two-level btree.
	 * First level holds thin_dev_t.
	 * Second level holds mappings.
	 */
	struct dm_btree_info info;

	/*
	 * Non-blocking version of the above.
	 */
	struct dm_btree_info nb_info;

	/*
	 * Just the top level for deleting whole devices.
	 */
	struct dm_btree_info tl_info;

	/*
	 * Just the bottom level for creating new devices.
	 */
	struct dm_btree_info bl_info;

	/*
	 * Describes the device details btree.
	 */
	struct dm_btree_info details_info;

	struct rw_semaphore root_lock;
	uint32_t time;
	dm_block_t root;
	dm_block_t details_root;
	struct list_head thin_devices;
	uint64_t trans_id;
	unsigned long flags;
	sector_t data_block_size;

	/*
	 * Pre-commit callback.
	 *
	 * This allows the thin provisioning target to run a callback before
	 * the metadata are committed.
	 */
	dm_pool_pre_commit_fn pre_commit_fn;
	void *pre_commit_context;

	/*
	 * We reserve a section of the metadata for commit overhead.
	 * All reported space does *not* include this.
	 */
	dm_block_t metadata_reserve;

	/*
	 * Set if a transaction has to be aborted but the attempt to roll back
	 * to the previous (good) transaction failed.  The only pool metadata
	 * operation possible in this state is the closing of the device.
	 */
	bool fail_io:1;

	/*
	 * Set once a thin-pool has been accessed through one of the interfaces
	 * that imply the pool is in-service (e.g. thin devices created/deleted,
	 * thin-pool message, metadata snapshots, etc).
	 */
	bool in_service:1;

	/*
	 * Reading the space map roots can fail, so we read it into these
	 * buffers before the superblock is locked and updated.
	 */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};

struct dm_thin_device {
	struct list_head list;
	struct dm_pool_metadata *pmd;
	dm_thin_id id;

	int open_count;
	bool changed:1;
	bool aborted_with_changes:1;
	uint64_t mapped_blocks;
	uint64_t transaction_id;
	uint32_t creation_time;
	uint32_t snapshotted_time;
};

/*----------------------------------------------------------------
 * superblock validator
 *--------------------------------------------------------------*/

#define SUPERBLOCK_CSUM_XOR 160774

static void sb_prepare_for_write(struct dm_block_validator *v,
				 struct dm_block *b,
				 size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);

	disk_super->blocknr = cpu_to_le64(dm_block_location(b));
	disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
						      block_size - sizeof(__le32),
						      SUPERBLOCK_CSUM_XOR));
}

static int sb_check(struct dm_block_validator *v,
		    struct dm_block *b,
		    size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);
	__le32 csum_le;

	if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
		DMERR("sb_check failed: blocknr %llu: "
		      "wanted %llu", le64_to_cpu(disk_super->blocknr),
		      (unsigned long long)dm_block_location(b));
		return -ENOTBLK;
	}

	if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
		DMERR("sb_check failed: magic %llu: "
		      "wanted %llu", le64_to_cpu(disk_super->magic),
		      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
		return -EILSEQ;
	}

	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
					     block_size - sizeof(__le32),
					     SUPERBLOCK_CSUM_XOR));
	if (csum_le != disk_super->csum) {
		DMERR("sb_check failed: csum %u: wanted %u",
		      le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
		return -EILSEQ;
	}

	return 0;
}

static struct dm_block_validator sb_validator = {
	.name = "superblock",
	.prepare_for_write = sb_prepare_for_write,
	.check = sb_check
};

/*----------------------------------------------------------------
 * Methods for the btree value types
 *--------------------------------------------------------------*/

static uint64_t pack_block_time(dm_block_t b, uint32_t t)
{
	return (b << 24) | t;
}

static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
{
	*b = v >> 24;
	*t = v & ((1 << 24) - 1);
}

/*
 * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as
 * possible.  'with_runs' reads contiguous runs of blocks, and calls the
 * given sm function.
 */
typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t);

static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned count, run_fn fn)
{
	uint64_t b, begin, end;
	uint32_t t;
	bool in_run = false;
	unsigned i;

	for (i = 0; i < count; i++, value_le++) {
		/* We know value_le is 8 byte aligned */
		unpack_block_time(le64_to_cpu(*value_le), &b, &t);

		if (in_run) {
			if (b == end) {
				end++;
			} else {
				fn(sm, begin, end);
				begin = b;
				end = b + 1;
			}
		} else {
			in_run = true;
			begin = b;
			end = b + 1;
		}
	}

	if (in_run)
		fn(sm, begin, end);
}
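
/*
 * Illustrative trace (not part of the driver): for packed block values
 * 5, 6, 7, 10 the loop above issues fn(sm, 5, 8) and then fn(sm, 10, 11),
 * i.e. two calls over half-open [begin, end) runs instead of four.
 */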

static void data_block_inc(void *context, const void *value_le, unsigned count)
{
	with_runs((struct dm_space_map *) context,
		  (const __le64 *) value_le, count, dm_sm_inc_blocks);
}

static void data_block_dec(void *context, const void *value_le, unsigned count)
{
	with_runs((struct dm_space_map *) context,
		  (const __le64 *) value_le, count, dm_sm_dec_blocks);
}

static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;
	uint64_t b1, b2;
	uint32_t t;

	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));
	unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
	unpack_block_time(le64_to_cpu(v2_le), &b2, &t);

	return b1 == b2;
}

static void subtree_inc(void *context, const void *value, unsigned count)
{
	struct dm_btree_info *info = context;
	const __le64 *root_le = value;
	unsigned i;

	for (i = 0; i < count; i++, root_le++)
		dm_tm_inc(info->tm, le64_to_cpu(*root_le));
}

static void subtree_dec(void *context, const void *value, unsigned count)
{
	struct dm_btree_info *info = context;
	const __le64 *root_le = value;
	unsigned i;

	for (i = 0; i < count; i++, root_le++)
		if (dm_btree_del(info, le64_to_cpu(*root_le)))
			DMERR("btree delete failed");
}

static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;
	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));

	return v1_le == v2_le;
}

/*----------------------------------------------------------------*/

/*
 * Variant that is used for in-core only changes or code that
 * shouldn't put the pool in service on its own (e.g. commit).
 */
static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
	__acquires(pmd->root_lock)
{
	down_write(&pmd->root_lock);
}

static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	if (unlikely(!pmd->in_service))
		pmd->in_service = true;
}

static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
	__releases(pmd->root_lock)
{
	up_write(&pmd->root_lock);
}
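
/*
 * Typical usage (see e.g. dm_pool_create_thin() below): mutating entry
 * points take pmd_write_lock(), which marks the pool in-service, while
 * commit and in-core-only paths take pmd_write_lock_in_core():
 *
 *	pmd_write_lock(pmd);
 *	if (!pmd->fail_io)
 *		r = __create_thin(pmd, dev);
 *	pmd_write_unlock(pmd);
 */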

/*----------------------------------------------------------------*/

static int superblock_lock_zero(struct dm_pool_metadata *pmd,
				struct dm_block **sblock)
{
	return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				     &sb_validator, sblock);
}

static int superblock_lock(struct dm_pool_metadata *pmd,
			   struct dm_block **sblock)
{
	return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				&sb_validator, sblock);
}

static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
{
	int r;
	unsigned i;
	struct dm_block *b;
	__le64 *data_le, zero = cpu_to_le64(0);
	unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);

	/*
	 * We can't use a validator here - it may be all zeroes.
	 */
	r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
	if (r)
		return r;

	data_le = dm_block_data(b);
	*result = 1;
	for (i = 0; i < block_size; i++) {
		if (data_le[i] != zero) {
			*result = 0;
			break;
		}
	}

	dm_bm_unlock(b);

	return 0;
}

static void __setup_btree_details(struct dm_pool_metadata *pmd)
{
	pmd->info.tm = pmd->tm;
	pmd->info.levels = 2;
	pmd->info.value_type.context = pmd->data_sm;
	pmd->info.value_type.size = sizeof(__le64);
	pmd->info.value_type.inc = data_block_inc;
	pmd->info.value_type.dec = data_block_dec;
	pmd->info.value_type.equal = data_block_equal;

	memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
	pmd->nb_info.tm = pmd->nb_tm;

	pmd->tl_info.tm = pmd->tm;
	pmd->tl_info.levels = 1;
	pmd->tl_info.value_type.context = &pmd->bl_info;
	pmd->tl_info.value_type.size = sizeof(__le64);
	pmd->tl_info.value_type.inc = subtree_inc;
	pmd->tl_info.value_type.dec = subtree_dec;
	pmd->tl_info.value_type.equal = subtree_equal;

	pmd->bl_info.tm = pmd->tm;
	pmd->bl_info.levels = 1;
	pmd->bl_info.value_type.context = pmd->data_sm;
	pmd->bl_info.value_type.size = sizeof(__le64);
	pmd->bl_info.value_type.inc = data_block_inc;
	pmd->bl_info.value_type.dec = data_block_dec;
	pmd->bl_info.value_type.equal = data_block_equal;

	pmd->details_info.tm = pmd->tm;
	pmd->details_info.levels = 1;
	pmd->details_info.value_type.context = NULL;
	pmd->details_info.value_type.size = sizeof(struct disk_device_details);
	pmd->details_info.value_type.inc = NULL;
	pmd->details_info.value_type.dec = NULL;
	pmd->details_info.value_type.equal = NULL;
}

static int save_sm_roots(struct dm_pool_metadata *pmd)
{
	int r;
	size_t len;

	r = dm_sm_root_size(pmd->metadata_sm, &len);
	if (r < 0)
		return r;

	r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
	if (r < 0)
		return r;

	r = dm_sm_root_size(pmd->data_sm, &len);
	if (r < 0)
		return r;

	return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
}

static void copy_sm_roots(struct dm_pool_metadata *pmd,
			  struct thin_disk_superblock *disk)
{
	memcpy(&disk->metadata_space_map_root,
	       &pmd->metadata_space_map_root,
	       sizeof(pmd->metadata_space_map_root));

	memcpy(&disk->data_space_map_root,
	       &pmd->data_space_map_root,
	       sizeof(pmd->data_space_map_root));
}

static int __write_initial_superblock(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;
	sector_t bdev_size = bdev_nr_sectors(pmd->bdev);

	if (bdev_size > THIN_METADATA_MAX_SECTORS)
		bdev_size = THIN_METADATA_MAX_SECTORS;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock_zero(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->flags = 0;
	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
	disk_super->version = cpu_to_le32(THIN_VERSION);
	disk_super->time = 0;
	disk_super->trans_id = 0;
	disk_super->held_root = 0;

	copy_sm_roots(pmd, disk_super);

	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);

	return dm_tm_commit(pmd->tm, sblock);
}

static int __format_metadata(struct dm_pool_metadata *pmd)
{
	int r;

	r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				 &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		DMERR("tm_create_with_sm failed");
		return r;
	}

	pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_create failed");
		r = PTR_ERR(pmd->data_sm);
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	__setup_btree_details(pmd);

	r = dm_btree_empty(&pmd->info, &pmd->root);
	if (r < 0)
		goto bad_cleanup_nb_tm;

	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
	if (r < 0) {
		DMERR("couldn't create devices root");
		goto bad_cleanup_nb_tm;
	}

	r = __write_initial_superblock(pmd);
	if (r)
		goto bad_cleanup_nb_tm;

	return 0;

bad_cleanup_nb_tm:
	dm_tm_destroy(pmd->nb_tm);
bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	dm_sm_destroy(pmd->metadata_sm);

	return r;
}

static int __check_incompat_features(struct thin_disk_superblock *disk_super,
				     struct dm_pool_metadata *pmd)
{
	uint32_t features;

	features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
	if (features) {
		DMERR("could not access metadata due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	/*
	 * Check for read-only metadata to skip the following RDWR checks.
	 */
	if (bdev_read_only(pmd->bdev))
		return 0;

	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
	if (features) {
		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	return 0;
}

static int __open_metadata(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r < 0) {
		DMERR("couldn't read superblock");
		return r;
	}

	disk_super = dm_block_data(sblock);

	/* Verify the data block size hasn't changed */
	if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
		DMERR("changing the data block size (from %u to %llu) is not supported",
		      le32_to_cpu(disk_super->data_block_size),
		      (unsigned long long)pmd->data_block_size);
		r = -EINVAL;
		goto bad_unlock_sblock;
	}

	r = __check_incompat_features(disk_super, pmd);
	if (r < 0)
		goto bad_unlock_sblock;

	r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			       disk_super->metadata_space_map_root,
			       sizeof(disk_super->metadata_space_map_root),
			       &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		DMERR("tm_open_with_sm failed");
		goto bad_unlock_sblock;
	}

	pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
				       sizeof(disk_super->data_space_map_root));
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_open failed");
		r = PTR_ERR(pmd->data_sm);
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	__setup_btree_details(pmd);
	dm_bm_unlock(sblock);

	return 0;

bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	dm_sm_destroy(pmd->metadata_sm);
bad_unlock_sblock:
	dm_bm_unlock(sblock);

	return r;
}

static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
{
	int r, unformatted;

	r = __superblock_all_zeroes(pmd->bm, &unformatted);
	if (r)
		return r;

	if (unformatted)
		return format_device ? __format_metadata(pmd) : -EPERM;

	return __open_metadata(pmd);
}

static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
{
	int r;

	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
					  THIN_MAX_CONCURRENT_LOCKS);
	if (IS_ERR(pmd->bm)) {
		DMERR("could not create block manager");
		r = PTR_ERR(pmd->bm);
		pmd->bm = NULL;
		return r;
	}

	r = __open_or_format_metadata(pmd, format_device);
	if (r) {
		dm_block_manager_destroy(pmd->bm);
		pmd->bm = NULL;
	}

	return r;
}

static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
{
	dm_sm_destroy(pmd->data_sm);
	dm_sm_destroy(pmd->metadata_sm);
	dm_tm_destroy(pmd->nb_tm);
	dm_tm_destroy(pmd->tm);
	dm_block_manager_destroy(pmd->bm);
}

static int __begin_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * We re-read the superblock every time; we shouldn't really need to.
	 */
	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	pmd->time = le32_to_cpu(disk_super->time);
	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
	pmd->trans_id = le64_to_cpu(disk_super->trans_id);
	pmd->flags = le32_to_cpu(disk_super->flags);
	pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);

	dm_bm_unlock(sblock);
	return 0;
}

static int __write_changed_details(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_thin_device *td, *tmp;
	struct disk_device_details details;
	uint64_t key;

	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (!td->changed)
			continue;

		key = td->id;

		details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
		details.transaction_id = cpu_to_le64(td->transaction_id);
		details.creation_time = cpu_to_le32(td->creation_time);
		details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
		__dm_bless_for_disk(&details);

		r = dm_btree_insert(&pmd->details_info, pmd->details_root,
				    &key, &details, &pmd->details_root);
		if (r)
			return r;

		if (td->open_count)
			td->changed = false;
		else {
			list_del(&td->list);
			kfree(td);
		}
	}

	return 0;
}

static int __commit_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * The superblock must fit in a 512-byte sector so that it can be
	 * written atomically (see the layout comment at the top of this file).
	 */
	BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
	BUG_ON(!rwsem_is_locked(&pmd->root_lock));

	if (unlikely(!pmd->in_service))
		return 0;

	if (pmd->pre_commit_fn) {
		r = pmd->pre_commit_fn(pmd->pre_commit_context);
		if (r < 0) {
			DMERR("pre-commit callback failed");
			return r;
		}
	}

	r = __write_changed_details(pmd);
	if (r < 0)
		return r;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->time = cpu_to_le32(pmd->time);
	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->trans_id = cpu_to_le64(pmd->trans_id);
	disk_super->flags = cpu_to_le32(pmd->flags);

	copy_sm_roots(pmd, disk_super);

	return dm_tm_commit(pmd->tm, sblock);
}

static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
{
	int r;
	dm_block_t total;
	dm_block_t max_blocks = 4096; /* 16M */

	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
	if (r) {
		DMERR("could not get size of metadata device");
		pmd->metadata_reserve = max_blocks;
	} else
		pmd->metadata_reserve = min(max_blocks, div_u64(total, 10));
}
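
/*
 * Worked example (illustrative): a metadata device of 20480 4k blocks
 * reserves min(4096, 20480 / 10) = 2048 blocks (8M); devices of 40960
 * blocks or more cap out at the 4096-block (16M) reserve.
 */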

struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
					       sector_t data_block_size,
					       bool format_device)
{
	int r;
	struct dm_pool_metadata *pmd;

	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
	if (!pmd) {
		DMERR("could not allocate metadata struct");
		return ERR_PTR(-ENOMEM);
	}

	init_rwsem(&pmd->root_lock);
	pmd->time = 0;
	INIT_LIST_HEAD(&pmd->thin_devices);
	pmd->fail_io = false;
	pmd->in_service = false;
	pmd->bdev = bdev;
	pmd->data_block_size = data_block_size;
	pmd->pre_commit_fn = NULL;
	pmd->pre_commit_context = NULL;

	r = __create_persistent_data_objects(pmd, format_device);
	if (r) {
		kfree(pmd);
		return ERR_PTR(r);
	}

	r = __begin_transaction(pmd);
	if (r < 0) {
		if (dm_pool_metadata_close(pmd) < 0)
			DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
		return ERR_PTR(r);
	}

	__set_metadata_reserve(pmd);

	return pmd;
}

int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
{
	int r;
	unsigned open_devices = 0;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->open_count)
			open_devices++;
		else {
			list_del(&td->list);
			kfree(td);
		}
	}
	up_read(&pmd->root_lock);

	if (open_devices) {
		DMERR("attempt to close pmd when %u device(s) are still open",
		       open_devices);
		return -EBUSY;
	}

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
		r = __commit_transaction(pmd);
		if (r < 0)
			DMWARN("%s: __commit_transaction() failed, error = %d",
			       __func__, r);
	}
	pmd_write_unlock(pmd);
	if (!pmd->fail_io)
		__destroy_persistent_data_objects(pmd);

	kfree(pmd);
	return 0;
}

/*
 * __open_device: Returns @td corresponding to device with id @dev,
 * creating it if @create is set and incrementing @td->open_count.
 * On failure, @td is undefined.
 */
static int __open_device(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, int create,
			 struct dm_thin_device **td)
{
	int r, changed = 0;
	struct dm_thin_device *td2;
	uint64_t key = dev;
	struct disk_device_details details_le;

	/*
	 * If the device is already open, return it.
	 */
	list_for_each_entry(td2, &pmd->thin_devices, list)
		if (td2->id == dev) {
			/*
			 * May not create an already-open device.
			 */
			if (create)
				return -EEXIST;

			td2->open_count++;
			*td = td2;
			return 0;
		}

	/*
	 * Check the device exists.
	 */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, &details_le);
	if (r) {
		if (r != -ENODATA || !create)
			return r;

		/*
		 * Create new device.
		 */
		changed = 1;
		details_le.mapped_blocks = 0;
		details_le.transaction_id = cpu_to_le64(pmd->trans_id);
		details_le.creation_time = cpu_to_le32(pmd->time);
		details_le.snapshotted_time = cpu_to_le32(pmd->time);
	}

	*td = kmalloc(sizeof(**td), GFP_NOIO);
	if (!*td)
		return -ENOMEM;

	(*td)->pmd = pmd;
	(*td)->id = dev;
	(*td)->open_count = 1;
	(*td)->changed = changed;
	(*td)->aborted_with_changes = false;
	(*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
	(*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
	(*td)->creation_time = le32_to_cpu(details_le.creation_time);
	(*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);

	list_add(&(*td)->list, &pmd->thin_devices);

	return 0;
}

static void __close_device(struct dm_thin_device *td)
{
	--td->open_count;
}

static int __create_thin(struct dm_pool_metadata *pmd,
			 dm_thin_id dev)
{
	int r;
	dm_block_t dev_root;
	uint64_t key = dev;
	struct dm_thin_device *td;
	__le64 value;

	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, NULL);
	if (!r)
		return -EEXIST;

	/*
	 * Create an empty btree for the mappings.
	 */
	r = dm_btree_empty(&pmd->bl_info, &dev_root);
	if (r)
		return r;

	/*
	 * Insert it into the main mapping tree.
	 */
	value = cpu_to_le64(dev_root);
	__dm_bless_for_disk(&value);
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}

	r = __open_device(pmd, dev, 1, &td);
	if (r) {
		dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}
	__close_device(td);

	return r;
}

int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __create_thin(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

static int __set_snapshot_details(struct dm_pool_metadata *pmd,
				  struct dm_thin_device *snap,
				  dm_thin_id origin, uint32_t time)
{
	int r;
	struct dm_thin_device *td;

	r = __open_device(pmd, origin, 0, &td);
	if (r)
		return r;

	td->changed = true;
	td->snapshotted_time = time;

	snap->mapped_blocks = td->mapped_blocks;
	snap->snapshotted_time = time;
	__close_device(td);

	return 0;
}

static int __create_snap(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, dm_thin_id origin)
{
	int r;
	dm_block_t origin_root;
	uint64_t key = origin, dev_key = dev;
	struct dm_thin_device *td;
	__le64 value;

	/* check this device is unused */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &dev_key, NULL);
	if (!r)
		return -EEXIST;

	/* find the mapping tree for the origin */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
	if (r)
		return r;
	origin_root = le64_to_cpu(value);

	/* clone the origin, an inc will do */
	dm_tm_inc(pmd->tm, origin_root);

	/* insert into the main mapping tree */
	value = cpu_to_le64(origin_root);
	__dm_bless_for_disk(&value);
	key = dev;
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		dm_tm_dec(pmd->tm, origin_root);
		return r;
	}

	pmd->time++;

	r = __open_device(pmd, dev, 1, &td);
	if (r)
		goto bad;

	r = __set_snapshot_details(pmd, td, origin, pmd->time);
	__close_device(td);

	if (r)
		goto bad;

	return 0;

bad:
	dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	dm_btree_remove(&pmd->details_info, pmd->details_root,
			&key, &pmd->details_root);
	return r;
}

int dm_pool_create_snap(struct dm_pool_metadata *pmd,
				 dm_thin_id dev,
				 dm_thin_id origin)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __create_snap(pmd, dev, origin);
	pmd_write_unlock(pmd);

	return r;
}

static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r;
	uint64_t key = dev;
	struct dm_thin_device *td;

	/* TODO: failure should mark the transaction invalid */
	r = __open_device(pmd, dev, 0, &td);
	if (r)
		return r;

	if (td->open_count > 1) {
		__close_device(td);
		return -EBUSY;
	}

	list_del(&td->list);
	kfree(td);
	r = dm_btree_remove(&pmd->details_info, pmd->details_root,
			    &key, &pmd->details_root);
	if (r)
		return r;

	r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	if (r)
		return r;

	return 0;
}

int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
			       dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __delete_device(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t current_id,
					uint64_t new_id)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);

	if (pmd->fail_io)
		goto out;

	if (pmd->trans_id != current_id) {
		DMERR("mismatched transaction id");
		goto out;
	}

	pmd->trans_id = new_id;
	r = 0;

out:
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = pmd->trans_id;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}

static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r, inc;
	struct thin_disk_superblock *disk_super;
	struct dm_block *copy, *sblock;
	dm_block_t held_root;

	/*
	 * We commit to ensure the btree roots which we increment in a
	 * moment are up to date.
	 */
	r = __commit_transaction(pmd);
	if (r < 0) {
		DMWARN("%s: __commit_transaction() failed, error = %d",
		       __func__, r);
		return r;
	}

	/*
	 * Copy the superblock.
	 */
	dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
	r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
			       &sb_validator, &copy, &inc);
	if (r)
		return r;

	BUG_ON(!inc);

	held_root = dm_block_location(copy);
	disk_super = dm_block_data(copy);

	if (le64_to_cpu(disk_super->held_root)) {
		DMWARN("Pool metadata snapshot already exists: release this before taking another.");

		dm_tm_dec(pmd->tm, held_root);
		dm_tm_unlock(pmd->tm, copy);
		return -EBUSY;
	}

	/*
	 * Wipe the spacemap since we're not publishing this.
	 */
	memset(&disk_super->data_space_map_root, 0,
	       sizeof(disk_super->data_space_map_root));
	memset(&disk_super->metadata_space_map_root, 0,
	       sizeof(disk_super->metadata_space_map_root));

	/*
	 * Increment the data structures that need to be preserved.
	 */
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
	dm_tm_unlock(pmd->tm, copy);

	/*
	 * Write the held root into the superblock.
	 */
	r = superblock_lock(pmd, &sblock);
	if (r) {
		dm_tm_dec(pmd->tm, held_root);
		return r;
	}

	disk_super = dm_block_data(sblock);
	disk_super->held_root = cpu_to_le64(held_root);
	dm_bm_unlock(sblock);
	return 0;
}

int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __reserve_metadata_snap(pmd);
	pmd_write_unlock(pmd);

	return r;
}

static int __release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock, *copy;
	dm_block_t held_root;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	held_root = le64_to_cpu(disk_super->held_root);
	disk_super->held_root = cpu_to_le64(0);

	dm_bm_unlock(sblock);

	if (!held_root) {
		DMWARN("No pool metadata snapshot found: nothing to release.");
		return -EINVAL;
	}

	r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
	if (r)
		return r;

	disk_super = dm_block_data(copy);
	dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
	dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
	dm_sm_dec_block(pmd->metadata_sm, held_root);

	dm_tm_unlock(pmd->tm, copy);

	return 0;
}

int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __release_metadata_snap(pmd);
	pmd_write_unlock(pmd);

	return r;
}

static int __get_metadata_snap(struct dm_pool_metadata *pmd,
			       dm_block_t *result)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	*result = le64_to_cpu(disk_super->held_root);

	dm_bm_unlock(sblock);

	return 0;
}

int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
			      dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __get_metadata_snap(pmd, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
			     struct dm_thin_device **td)
{
	int r = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io)
		r = __open_device(pmd, dev, 0, td);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_close_thin_device(struct dm_thin_device *td)
{
	pmd_write_lock_in_core(td->pmd);
	__close_device(td);
	pmd_write_unlock(td->pmd);

	return 0;
}

dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
{
	return td->id;
}

/*
 * Check whether @time (of block creation) is older than @td's last snapshot.
 * If so then the associated block is shared with the last snapshot device.
 * Any block on a device created *after* the device last got snapshotted is
 * necessarily not shared.
 */
static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
{
	return td->snapshotted_time > time;
}
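
/*
 * Example: if td was last snapshotted at time 8, a block created at time 5
 * is shared with that snapshot (8 > 5), while a block created at time 9 is
 * exclusive to td.
 */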

static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
				 struct dm_thin_lookup_result *result)
{
	uint64_t block_time = 0;
	dm_block_t exception_block;
	uint32_t exception_time;

	block_time = le64_to_cpu(value);
	unpack_block_time(block_time, &exception_block, &exception_time);
	result->block = exception_block;
	result->shared = __snapshotted_since(td, exception_time);
}

static int __find_block(struct dm_thin_device *td, dm_block_t block,
			int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };
	struct dm_btree_info *info;

	if (can_issue_io)
		info = &pmd->info;
	else
		info = &pmd->nb_info;

	r = dm_btree_lookup(info, pmd->root, keys, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}

int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
		       int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (pmd->fail_io) {
		up_read(&pmd->root_lock);
		return -EINVAL;
	}

	r = __find_block(td, block, can_issue_io, result);

	up_read(&pmd->root_lock);
	return r;
}

static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
					  dm_block_t *vblock,
					  struct dm_thin_lookup_result *result)
{
	int r;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}

static int __find_mapped_range(struct dm_thin_device *td,
			       dm_block_t begin, dm_block_t end,
			       dm_block_t *thin_begin, dm_block_t *thin_end,
			       dm_block_t *pool_begin, bool *maybe_shared)
{
	int r;
	dm_block_t pool_end;
	struct dm_thin_lookup_result lookup;

	if (end < begin)
		return -ENODATA;

	r = __find_next_mapped_block(td, begin, &begin, &lookup);
	if (r)
		return r;

	if (begin >= end)
		return -ENODATA;

	*thin_begin = begin;
	*pool_begin = lookup.block;
	*maybe_shared = lookup.shared;

	begin++;
	pool_end = *pool_begin + 1;
	while (begin != end) {
		r = __find_block(td, begin, true, &lookup);
		if (r) {
			if (r == -ENODATA)
				break;
			else
				return r;
		}

		if ((lookup.block != pool_end) ||
		    (lookup.shared != *maybe_shared))
			break;

		pool_end++;
		begin++;
	}

	*thin_end = begin;
	return 0;
}
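
/*
 * Illustrative example: with thin blocks 3 -> pool 10, 4 -> pool 11 and
 * 5 -> pool 20 (all with the same shared status), a call over [0, 8)
 * yields *thin_begin = 3, *thin_end = 5 and *pool_begin = 10; the run
 * stops where the pool blocks cease to be contiguous.
 */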

int dm_thin_find_mapped_range(struct dm_thin_device *td,
			      dm_block_t begin, dm_block_t end,
			      dm_block_t *thin_begin, dm_block_t *thin_end,
			      dm_block_t *pool_begin, bool *maybe_shared)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
					pool_begin, maybe_shared);
	}
	up_read(&pmd->root_lock);

	return r;
}

static int __insert(struct dm_thin_device *td, dm_block_t block,
		    dm_block_t data_block)
{
	int r, inserted;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	value = cpu_to_le64(pack_block_time(data_block, pmd->time));
	__dm_bless_for_disk(&value);

	r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
				   &pmd->root, &inserted);
	if (r)
		return r;

	td->changed = true;
	if (inserted)
		td->mapped_blocks++;

	return 0;
}

int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
			 dm_block_t data_block)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __insert(td, block, data_block);
	pmd_write_unlock(td->pmd);

	return r;
}

static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
{
	int r;
	unsigned count, total_count = 0;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[1] = { td->id };
	__le64 value;
	dm_block_t mapping_root;

	/*
	 * Find the mapping tree
	 */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
	if (r)
		return r;

	/*
	 * Remove from the mapping tree, taking care to inc the
	 * ref count so it doesn't get deleted.
	 */
	mapping_root = le64_to_cpu(value);
	dm_tm_inc(pmd->tm, mapping_root);
	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
	if (r)
		return r;

	/*
	 * dm_btree_remove_leaves() stops at the first unmapped entry, so we
	 * have to loop round finding mapped ranges.
	 */
	while (begin < end) {
		r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
		if (r == -ENODATA)
			break;

		if (r)
			return r;

		if (begin >= end)
			break;

		r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
		if (r)
			return r;

		total_count += count;
	}

	td->mapped_blocks -= total_count;
	td->changed = true;

	/*
	 * Reinsert the mapping tree.
	 */
	value = cpu_to_le64(mapping_root);
	__dm_bless_for_disk(&value);
	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
}

int dm_thin_remove_range(struct dm_thin_device *td,
			 dm_block_t begin, dm_block_t end)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __remove_range(td, begin, end);
	pmd_write_unlock(td->pmd);

	return r;
}

int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
	int r;
	uint32_t ref_count;

	down_read(&pmd->root_lock);
	r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
	if (!r)
		*result = (ref_count > 1);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = 0;

	pmd_write_lock(pmd);
	r = dm_sm_inc_blocks(pmd->data_sm, b, e);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = 0;

	pmd_write_lock(pmd);
	r = dm_sm_dec_blocks(pmd->data_sm, b, e);
	pmd_write_unlock(pmd);

	return r;
}

bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
{
	int r;

	down_read(&td->pmd->root_lock);
	r = td->changed;
	up_read(&td->pmd->root_lock);

	return r;
}

bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
{
	bool r = false;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->changed) {
			r = td->changed;
			break;
		}
	}
	up_read(&pmd->root_lock);

	return r;
}

bool dm_thin_aborted_changes(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->aborted_with_changes;
	up_read(&td->pmd->root_lock);

	return r;
}

int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = dm_sm_new_block(pmd->data_sm, result);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	/*
	 * Care is taken to not have commit be what
	 * triggers putting the thin-pool in-service.
	 */
	pmd_write_lock_in_core(pmd);
	if (pmd->fail_io)
		goto out;

	r = __commit_transaction(pmd);
	if (r < 0)
		goto out;

	/*
	 * Open the next transaction.
	 */
	r = __begin_transaction(pmd);
out:
	pmd_write_unlock(pmd);
	return r;
}

static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
{
	struct dm_thin_device *td;

	list_for_each_entry(td, &pmd->thin_devices, list)
		td->aborted_with_changes = td->changed;
}

int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (pmd->fail_io)
		goto out;

	__set_abort_with_changes_flags(pmd);
	__destroy_persistent_data_objects(pmd);
	r = __create_persistent_data_objects(pmd, false);
	if (r)
		pmd->fail_io = true;

out:
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
					  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->metadata_sm, result);

	if (!r) {
		if (*result < pmd->metadata_reserve)
			*result = 0;
		else
			*result -= pmd->metadata_reserve;
	}
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
				  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = td->mapped_blocks;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}

static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
{
	int r;
	__le64 value_le;
	dm_block_t thin_root;
	struct dm_pool_metadata *pmd = td->pmd;

	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
	if (r)
		return r;

	thin_root = le64_to_cpu(value_le);

	return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
}

int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
				     dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __highest_block(td, result);
	up_read(&pmd->root_lock);

	return r;
}

static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
{
	int r;
	dm_block_t old_count;

	r = dm_sm_get_nr_blocks(sm, &old_count);
	if (r)
		return r;

	if (new_count == old_count)
		return 0;

	if (new_count < old_count) {
		DMERR("cannot reduce size of space map");
		return -EINVAL;
	}

	return dm_sm_extend(sm, new_count - old_count);
}

int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __resize_space_map(pmd->data_sm, new_count);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io) {
		r = __resize_space_map(pmd->metadata_sm, new_count);
		if (!r)
			__set_metadata_reserve(pmd);
	}
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_only(pmd->bm);
	pmd_write_unlock(pmd);
}

void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_write(pmd->bm);
	pmd_write_unlock(pmd);
}

int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
					dm_block_t threshold,
					dm_sm_threshold_fn fn,
					void *context)
{
	int r;

	pmd_write_lock_in_core(pmd);
	r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
					  dm_pool_pre_commit_fn fn,
					  void *context)
{
	pmd_write_lock_in_core(pmd);
	pmd->pre_commit_fn = fn;
	pmd->pre_commit_context = context;
	pmd_write_unlock(pmd);
}

int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	pmd_write_lock(pmd);
	if (pmd->fail_io)
		goto out;

	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;

	r = superblock_lock(pmd, &sblock);
	if (r) {
		DMERR("couldn't lock superblock");
		goto out;
	}

	disk_super = dm_block_data(sblock);
	disk_super->flags = cpu_to_le32(pmd->flags);

	dm_bm_unlock(sblock);
out:
	pmd_write_unlock(pmd);
	return r;
}

bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
{
	bool needs_check;

	down_read(&pmd->root_lock);
	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
	up_read(&pmd->root_lock);

	return needs_check;
}

void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
{
	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		dm_tm_issue_prefetches(pmd->tm);
	up_read(&pmd->root_lock);
}