cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-era-target.c (39544B)


// SPDX-License-Identifier: GPL-2.0-only
#include "dm.h"
#include "persistent-data/dm-transaction-manager.h"
#include "persistent-data/dm-bitset.h"
#include "persistent-data/dm-space-map.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "era"

#define SUPERBLOCK_LOCATION 0
#define SUPERBLOCK_MAGIC 2126579579
#define SUPERBLOCK_CSUM_XOR 146538381
#define MIN_ERA_VERSION 1
#define MAX_ERA_VERSION 1
#define INVALID_WRITESET_ROOT SUPERBLOCK_LOCATION
#define MIN_BLOCK_SIZE 8

/*----------------------------------------------------------------
 * Writeset
 *--------------------------------------------------------------*/
struct writeset_metadata {
	uint32_t nr_bits;
	dm_block_t root;
};

struct writeset {
	struct writeset_metadata md;

	/*
	 * An in-core copy of the bits, to avoid constantly doing lookups
	 * on disk.
	 */
	unsigned long *bits;
};
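
/*
 * A writeset records which origin blocks have been written to during the
 * current era.  On rollover the writeset is archived into the writeset
 * tree (keyed by era) and later digested, block by block, into the era
 * array.
 */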

/*
 * This does not free off the on disk bitset as this will normally be done
 * after digesting into the era array.
 */
static void writeset_free(struct writeset *ws)
{
	vfree(ws->bits);
	ws->bits = NULL;
}

static int setup_on_disk_bitset(struct dm_disk_bitset *info,
				unsigned nr_bits, dm_block_t *root)
{
	int r;

	r = dm_bitset_empty(info, root);
	if (r)
		return r;

	return dm_bitset_resize(info, *root, 0, nr_bits, false, root);
}

static size_t bitset_size(unsigned nr_bits)
{
	return sizeof(unsigned long) * dm_div_up(nr_bits, BITS_PER_LONG);
}
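
/*
 * E.g. with BITS_PER_LONG == 64, nr_bits == 100 rounds up to two
 * unsigned longs, so bitset_size() returns 16 bytes.
 */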

/*
 * Allocates memory for the in core bitset.
 */
static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks)
{
	ws->bits = vzalloc(bitset_size(nr_blocks));
	if (!ws->bits) {
		DMERR("%s: couldn't allocate in memory bitset", __func__);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Wipes the in-core bitset, and creates a new on disk bitset.
 */
static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws,
			 dm_block_t nr_blocks)
{
	int r;

	memset(ws->bits, 0, bitset_size(nr_blocks));

	ws->md.nr_bits = nr_blocks;
	r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root);
	if (r) {
		DMERR("%s: setup_on_disk_bitset failed", __func__);
		return r;
	}

	return 0;
}

static bool writeset_marked(struct writeset *ws, dm_block_t block)
{
	return test_bit(block, ws->bits);
}

static int writeset_marked_on_disk(struct dm_disk_bitset *info,
				   struct writeset_metadata *m, dm_block_t block,
				   bool *result)
{
	dm_block_t old = m->root;

	/*
	 * The bitset was flushed when it was archived, so we know there'll
	 * be no change to the root.
	 */
	int r = dm_bitset_test_bit(info, m->root, block, &m->root, result);
	if (r) {
		DMERR("%s: dm_bitset_test_bit failed", __func__);
		return r;
	}

	BUG_ON(m->root != old);

	return r;
}

/*
 * Returns < 0 on error, 0 if the bit wasn't previously set, 1 if it was.
 */
static int writeset_test_and_set(struct dm_disk_bitset *info,
				 struct writeset *ws, uint32_t block)
{
	int r;

	if (!test_bit(block, ws->bits)) {
		r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root);
		if (r) {
			/* FIXME: fail mode */
			return r;
		}

		return 0;
	}

	return 1;
}
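
/*
 * Note the in-core bit is deliberately not set here; callers set it
 * themselves once the on-disk update has been committed (see
 * process_deferred_bios()).
 */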

/*----------------------------------------------------------------
 * On disk metadata layout
 *--------------------------------------------------------------*/
#define SPACE_MAP_ROOT_SIZE 128
#define UUID_LEN 16

struct writeset_disk {
	__le32 nr_bits;
	__le64 root;
} __packed;

struct superblock_disk {
	__le32 csum;
	__le32 flags;
	__le64 blocknr;

	__u8 uuid[UUID_LEN];
	__le64 magic;
	__le32 version;

	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

	__le32 data_block_size;
	__le32 metadata_block_size;
	__le32 nr_blocks;

	__le32 current_era;
	struct writeset_disk current_writeset;

	/*
	 * Only these two fields are valid within the metadata snapshot.
	 */
	__le64 writeset_tree_root;
	__le64 era_array_root;

	__le64 metadata_snap;
} __packed;
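
/*
 * The csum field is computed over the rest of the superblock, from
 * 'flags' to the end of the metadata block (see sb_prepare_for_write()
 * and sb_check() below).
 */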

/*----------------------------------------------------------------
 * Superblock validation
 *--------------------------------------------------------------*/
static void sb_prepare_for_write(struct dm_block_validator *v,
				 struct dm_block *b,
				 size_t sb_block_size)
{
	struct superblock_disk *disk = dm_block_data(b);

	disk->blocknr = cpu_to_le64(dm_block_location(b));
	disk->csum = cpu_to_le32(dm_bm_checksum(&disk->flags,
						sb_block_size - sizeof(__le32),
						SUPERBLOCK_CSUM_XOR));
}

static int check_metadata_version(struct superblock_disk *disk)
{
	uint32_t metadata_version = le32_to_cpu(disk->version);
	if (metadata_version < MIN_ERA_VERSION || metadata_version > MAX_ERA_VERSION) {
		DMERR("Era metadata version %u found, but only versions between %u and %u supported.",
		      metadata_version, MIN_ERA_VERSION, MAX_ERA_VERSION);
		return -EINVAL;
	}

	return 0;
}

static int sb_check(struct dm_block_validator *v,
		    struct dm_block *b,
		    size_t sb_block_size)
{
	struct superblock_disk *disk = dm_block_data(b);
	__le32 csum_le;

	if (dm_block_location(b) != le64_to_cpu(disk->blocknr)) {
		DMERR("sb_check failed: blocknr %llu: wanted %llu",
		      le64_to_cpu(disk->blocknr),
		      (unsigned long long)dm_block_location(b));
		return -ENOTBLK;
	}

	if (le64_to_cpu(disk->magic) != SUPERBLOCK_MAGIC) {
		DMERR("sb_check failed: magic %llu: wanted %llu",
		      le64_to_cpu(disk->magic),
		      (unsigned long long) SUPERBLOCK_MAGIC);
		return -EILSEQ;
	}

	csum_le = cpu_to_le32(dm_bm_checksum(&disk->flags,
					     sb_block_size - sizeof(__le32),
					     SUPERBLOCK_CSUM_XOR));
	if (csum_le != disk->csum) {
		DMERR("sb_check failed: csum %u: wanted %u",
		      le32_to_cpu(csum_le), le32_to_cpu(disk->csum));
		return -EILSEQ;
	}

	return check_metadata_version(disk);
}

static struct dm_block_validator sb_validator = {
	.name = "superblock",
	.prepare_for_write = sb_prepare_for_write,
	.check = sb_check
};

/*----------------------------------------------------------------
 * Low level metadata handling
 *--------------------------------------------------------------*/
#define DM_ERA_METADATA_BLOCK_SIZE 4096
#define ERA_MAX_CONCURRENT_LOCKS 5

struct era_metadata {
	struct block_device *bdev;
	struct dm_block_manager *bm;
	struct dm_space_map *sm;
	struct dm_transaction_manager *tm;

	dm_block_t block_size;
	uint32_t nr_blocks;

	uint32_t current_era;

	/*
	 * We preallocate 2 writesets.  When an era rolls over we
	 * switch between them. This means the allocation is done at
	 * preresume time, rather than on the io path.
	 */
	struct writeset writesets[2];
	struct writeset *current_writeset;

	dm_block_t writeset_tree_root;
	dm_block_t era_array_root;

	struct dm_disk_bitset bitset_info;
	struct dm_btree_info writeset_tree_info;
	struct dm_array_info era_array_info;

	dm_block_t metadata_snap;

	/*
	 * A flag that is set whenever a writeset has been archived.
	 */
	bool archived_writesets;

	/*
	 * Reading the space map root can fail, so we read it into this
	 * buffer before the superblock is locked and updated.
	 */
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
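
/*
 * Double-buffering the writesets means an era rollover just has to
 * writeset_init() the inactive writeset and swap the pointer over; e.g.
 * if current_writeset == &writesets[0], the next era records into
 * writesets[1].
 */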

static int superblock_read_lock(struct era_metadata *md,
				struct dm_block **sblock)
{
	return dm_bm_read_lock(md->bm, SUPERBLOCK_LOCATION,
			       &sb_validator, sblock);
}

static int superblock_lock_zero(struct era_metadata *md,
				struct dm_block **sblock)
{
	return dm_bm_write_lock_zero(md->bm, SUPERBLOCK_LOCATION,
				     &sb_validator, sblock);
}

static int superblock_lock(struct era_metadata *md,
			   struct dm_block **sblock)
{
	return dm_bm_write_lock(md->bm, SUPERBLOCK_LOCATION,
				&sb_validator, sblock);
}

/* FIXME: duplication with cache and thin */
static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
{
	int r;
	unsigned i;
	struct dm_block *b;
	__le64 *data_le, zero = cpu_to_le64(0);
	unsigned sb_block_size = dm_bm_block_size(bm) / sizeof(__le64);

	/*
	 * We can't use a validator here - it may be all zeroes.
	 */
	r = dm_bm_read_lock(bm, SUPERBLOCK_LOCATION, NULL, &b);
	if (r)
		return r;

	data_le = dm_block_data(b);
	*result = true;
	for (i = 0; i < sb_block_size; i++) {
		if (data_le[i] != zero) {
			*result = false;
			break;
		}
	}

	dm_bm_unlock(b);

	return 0;
}

/*----------------------------------------------------------------*/

static void ws_pack(const struct writeset_metadata *core, struct writeset_disk *disk)
{
	disk->nr_bits = cpu_to_le32(core->nr_bits);
	disk->root = cpu_to_le64(core->root);
}

static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata *core)
{
	core->nr_bits = le32_to_cpu(disk->nr_bits);
	core->root = le64_to_cpu(disk->root);
}

static void ws_inc(void *context, const void *value, unsigned count)
{
	struct era_metadata *md = context;
	struct writeset_disk ws_d;
	dm_block_t b;
	unsigned i;

	for (i = 0; i < count; i++) {
		memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
		b = le64_to_cpu(ws_d.root);
		dm_tm_inc(md->tm, b);
	}
}

static void ws_dec(void *context, const void *value, unsigned count)
{
	struct era_metadata *md = context;
	struct writeset_disk ws_d;
	dm_block_t b;
	unsigned i;

	for (i = 0; i < count; i++) {
		memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
		b = le64_to_cpu(ws_d.root);
		dm_bitset_del(&md->bitset_info, b);
	}
}

static int ws_eq(void *context, const void *value1, const void *value2)
{
	return !memcmp(value1, value2, sizeof(struct writeset_disk));
}

/*----------------------------------------------------------------*/

static void setup_writeset_tree_info(struct era_metadata *md)
{
	struct dm_btree_value_type *vt = &md->writeset_tree_info.value_type;
	md->writeset_tree_info.tm = md->tm;
	md->writeset_tree_info.levels = 1;
	vt->context = md;
	vt->size = sizeof(struct writeset_disk);
	vt->inc = ws_inc;
	vt->dec = ws_dec;
	vt->equal = ws_eq;
}

static void setup_era_array_info(struct era_metadata *md)
{
	struct dm_btree_value_type vt;
	vt.context = NULL;
	vt.size = sizeof(__le32);
	vt.inc = NULL;
	vt.dec = NULL;
	vt.equal = NULL;

	dm_array_info_init(&md->era_array_info, md->tm, &vt);
}

static void setup_infos(struct era_metadata *md)
{
	dm_disk_bitset_init(md->tm, &md->bitset_info);
	setup_writeset_tree_info(md);
	setup_era_array_info(md);
}

/*----------------------------------------------------------------*/

static int create_fresh_metadata(struct era_metadata *md)
{
	int r;

	r = dm_tm_create_with_sm(md->bm, SUPERBLOCK_LOCATION,
				 &md->tm, &md->sm);
	if (r < 0) {
		DMERR("dm_tm_create_with_sm failed");
		return r;
	}

	setup_infos(md);

	r = dm_btree_empty(&md->writeset_tree_info, &md->writeset_tree_root);
	if (r) {
		DMERR("couldn't create new writeset tree");
		goto bad;
	}

	r = dm_array_empty(&md->era_array_info, &md->era_array_root);
	if (r) {
		DMERR("couldn't create era array");
		goto bad;
	}

	return 0;

bad:
	dm_sm_destroy(md->sm);
	dm_tm_destroy(md->tm);

	return r;
}

static int save_sm_root(struct era_metadata *md)
{
	int r;
	size_t metadata_len;

	r = dm_sm_root_size(md->sm, &metadata_len);
	if (r < 0)
		return r;

	return dm_sm_copy_root(md->sm, &md->metadata_space_map_root,
			       metadata_len);
}

static void copy_sm_root(struct era_metadata *md, struct superblock_disk *disk)
{
	memcpy(&disk->metadata_space_map_root,
	       &md->metadata_space_map_root,
	       sizeof(md->metadata_space_map_root));
}

/*
 * Writes a superblock, including the static fields that don't get updated
 * with every commit (possible optimisation here).  'md' should be fully
 * constructed when this is called.
 */
static void prepare_superblock(struct era_metadata *md, struct superblock_disk *disk)
{
	disk->magic = cpu_to_le64(SUPERBLOCK_MAGIC);
	disk->flags = cpu_to_le32(0ul);

	/* FIXME: can't keep blanking the uuid (uuid is currently unused though) */
	memset(disk->uuid, 0, sizeof(disk->uuid));
	disk->version = cpu_to_le32(MAX_ERA_VERSION);

	copy_sm_root(md, disk);

	disk->data_block_size = cpu_to_le32(md->block_size);
	disk->metadata_block_size = cpu_to_le32(DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
	disk->nr_blocks = cpu_to_le32(md->nr_blocks);
	disk->current_era = cpu_to_le32(md->current_era);

	ws_pack(&md->current_writeset->md, &disk->current_writeset);
	disk->writeset_tree_root = cpu_to_le64(md->writeset_tree_root);
	disk->era_array_root = cpu_to_le64(md->era_array_root);
	disk->metadata_snap = cpu_to_le64(md->metadata_snap);
}

static int write_superblock(struct era_metadata *md)
{
	int r;
	struct dm_block *sblock;
	struct superblock_disk *disk;

	r = save_sm_root(md);
	if (r) {
		DMERR("%s: save_sm_root failed", __func__);
		return r;
	}

	r = superblock_lock_zero(md, &sblock);
	if (r)
		return r;

	disk = dm_block_data(sblock);
	prepare_superblock(md, disk);

	return dm_tm_commit(md->tm, sblock);
}

/*
 * Assumes block_size and the infos are set.
 */
static int format_metadata(struct era_metadata *md)
{
	int r;

	r = create_fresh_metadata(md);
	if (r)
		return r;

	r = write_superblock(md);
	if (r) {
		dm_sm_destroy(md->sm);
		dm_tm_destroy(md->tm);
		return r;
	}

	return 0;
}

static int open_metadata(struct era_metadata *md)
{
	int r;
	struct dm_block *sblock;
	struct superblock_disk *disk;

	r = superblock_read_lock(md, &sblock);
	if (r) {
		DMERR("couldn't read_lock superblock");
		return r;
	}

	disk = dm_block_data(sblock);

	/* Verify the data block size hasn't changed */
	if (le32_to_cpu(disk->data_block_size) != md->block_size) {
		DMERR("changing the data block size (from %u to %llu) is not supported",
		      le32_to_cpu(disk->data_block_size), md->block_size);
		r = -EINVAL;
		goto bad;
	}

	r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION,
			       disk->metadata_space_map_root,
			       sizeof(disk->metadata_space_map_root),
			       &md->tm, &md->sm);
	if (r) {
		DMERR("dm_tm_open_with_sm failed");
		goto bad;
	}

	setup_infos(md);

	md->nr_blocks = le32_to_cpu(disk->nr_blocks);
	md->current_era = le32_to_cpu(disk->current_era);

	ws_unpack(&disk->current_writeset, &md->current_writeset->md);
	md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root);
	md->era_array_root = le64_to_cpu(disk->era_array_root);
	md->metadata_snap = le64_to_cpu(disk->metadata_snap);
	md->archived_writesets = true;

	dm_bm_unlock(sblock);

	return 0;

bad:
	dm_bm_unlock(sblock);
	return r;
}

static int open_or_format_metadata(struct era_metadata *md,
				   bool may_format)
{
	int r;
	bool unformatted = false;

	r = superblock_all_zeroes(md->bm, &unformatted);
	if (r)
		return r;

	if (unformatted)
		return may_format ? format_metadata(md) : -EPERM;

	return open_metadata(md);
}

static int create_persistent_data_objects(struct era_metadata *md,
					  bool may_format)
{
	int r;

	md->bm = dm_block_manager_create(md->bdev, DM_ERA_METADATA_BLOCK_SIZE,
					 ERA_MAX_CONCURRENT_LOCKS);
	if (IS_ERR(md->bm)) {
		DMERR("could not create block manager");
		return PTR_ERR(md->bm);
	}

	r = open_or_format_metadata(md, may_format);
	if (r)
		dm_block_manager_destroy(md->bm);

	return r;
}

static void destroy_persistent_data_objects(struct era_metadata *md)
{
	dm_sm_destroy(md->sm);
	dm_tm_destroy(md->tm);
	dm_block_manager_destroy(md->bm);
}

/*
 * This waits until all era_map threads have picked up the new filter.
 */
static void swap_writeset(struct era_metadata *md, struct writeset *new_writeset)
{
	rcu_assign_pointer(md->current_writeset, new_writeset);
	synchronize_rcu();
}
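
/*
 * current_writeset is read under rcu_read_lock() in
 * metadata_current_marked(); the synchronize_rcu() above guarantees no
 * reader is still looking at the old writeset once this returns.
 */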

/*----------------------------------------------------------------
 * Writesets get 'digested' into the main era array.
 *
 * We're using a coroutine here so the worker thread can do the digestion,
 * thus avoiding synchronisation of the metadata.  Digesting a whole
 * writeset in one go would cause too much latency.
 *--------------------------------------------------------------*/
struct digest {
	uint32_t era;
	unsigned nr_bits, current_bit;
	struct writeset_metadata writeset;
	__le32 value;
	struct dm_disk_bitset info;

	int (*step)(struct era_metadata *, struct digest *);
};
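
/*
 * The 'step' pointer implements the coroutine: lookup_writeset picks the
 * oldest archived writeset, transcribe_writeset copies up to
 * INSERTS_PER_STEP marked blocks into the era array per invocation,
 * remove_writeset deletes the digested writeset, and step == NULL means
 * there's nothing left to do.
 */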

static int metadata_digest_lookup_writeset(struct era_metadata *md,
					   struct digest *d);

static int metadata_digest_remove_writeset(struct era_metadata *md,
					   struct digest *d)
{
	int r;
	uint64_t key = d->era;

	r = dm_btree_remove(&md->writeset_tree_info, md->writeset_tree_root,
			    &key, &md->writeset_tree_root);
	if (r) {
		DMERR("%s: dm_btree_remove failed", __func__);
		return r;
	}

	d->step = metadata_digest_lookup_writeset;
	return 0;
}

#define INSERTS_PER_STEP 100

static int metadata_digest_transcribe_writeset(struct era_metadata *md,
					       struct digest *d)
{
	int r;
	bool marked;
	unsigned b, e = min(d->current_bit + INSERTS_PER_STEP, d->nr_bits);

	for (b = d->current_bit; b < e; b++) {
		r = writeset_marked_on_disk(&d->info, &d->writeset, b, &marked);
		if (r) {
			DMERR("%s: writeset_marked_on_disk failed", __func__);
			return r;
		}

		if (!marked)
			continue;

		__dm_bless_for_disk(&d->value);
		r = dm_array_set_value(&md->era_array_info, md->era_array_root,
				       b, &d->value, &md->era_array_root);
		if (r) {
			DMERR("%s: dm_array_set_value failed", __func__);
			return r;
		}
	}

	if (b == d->nr_bits)
		d->step = metadata_digest_remove_writeset;
	else
		d->current_bit = b;

	return 0;
}

static int metadata_digest_lookup_writeset(struct era_metadata *md,
					   struct digest *d)
{
	int r;
	uint64_t key;
	struct writeset_disk disk;

	r = dm_btree_find_lowest_key(&md->writeset_tree_info,
				     md->writeset_tree_root, &key);
	if (r < 0)
		return r;

	d->era = key;

	r = dm_btree_lookup(&md->writeset_tree_info,
			    md->writeset_tree_root, &key, &disk);
	if (r) {
		if (r == -ENODATA) {
			d->step = NULL;
			return 0;
		}

		DMERR("%s: dm_btree_lookup failed", __func__);
		return r;
	}

	ws_unpack(&disk, &d->writeset);
	d->value = cpu_to_le32(key);

	/*
	 * We initialise another bitset info to avoid any caching side effects
	 * with the previous one.
	 */
	dm_disk_bitset_init(md->tm, &d->info);

	d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks);
	d->current_bit = 0;
	d->step = metadata_digest_transcribe_writeset;

	return 0;
}

static int metadata_digest_start(struct era_metadata *md, struct digest *d)
{
	if (d->step)
		return 0;

	memset(d, 0, sizeof(*d));
	d->step = metadata_digest_lookup_writeset;

	return 0;
}

/*----------------------------------------------------------------
 * High level metadata interface.  Target methods should use these, and not
 * the lower level ones.
 *--------------------------------------------------------------*/
static struct era_metadata *metadata_open(struct block_device *bdev,
					  sector_t block_size,
					  bool may_format)
{
	int r;
	struct era_metadata *md = kzalloc(sizeof(*md), GFP_KERNEL);

	if (!md)
		return NULL;

	md->bdev = bdev;
	md->block_size = block_size;

	md->writesets[0].md.root = INVALID_WRITESET_ROOT;
	md->writesets[1].md.root = INVALID_WRITESET_ROOT;
	md->current_writeset = &md->writesets[0];

	r = create_persistent_data_objects(md, may_format);
	if (r) {
		kfree(md);
		return ERR_PTR(r);
	}

	return md;
}

static void metadata_close(struct era_metadata *md)
{
	writeset_free(&md->writesets[0]);
	writeset_free(&md->writesets[1]);
	destroy_persistent_data_objects(md);
	kfree(md);
}

static bool valid_nr_blocks(dm_block_t n)
{
	/*
	 * dm_bitset restricts us to 2^32.  test_bit & co. restrict us
	 * further to 2^31 - 1
	 */
	return n < (1ull << 31);
}

static int metadata_resize(struct era_metadata *md, void *arg)
{
	int r;
	dm_block_t *new_size = arg;
	__le32 value;

	if (!valid_nr_blocks(*new_size)) {
		DMERR("Invalid number of origin blocks %llu",
		      (unsigned long long) *new_size);
		return -EINVAL;
	}

	writeset_free(&md->writesets[0]);
	writeset_free(&md->writesets[1]);

	r = writeset_alloc(&md->writesets[0], *new_size);
	if (r) {
		DMERR("%s: writeset_alloc failed for writeset 0", __func__);
		return r;
	}

	r = writeset_alloc(&md->writesets[1], *new_size);
	if (r) {
		DMERR("%s: writeset_alloc failed for writeset 1", __func__);
		writeset_free(&md->writesets[0]);
		return r;
	}

	value = cpu_to_le32(0u);
	__dm_bless_for_disk(&value);
	r = dm_array_resize(&md->era_array_info, md->era_array_root,
			    md->nr_blocks, *new_size,
			    &value, &md->era_array_root);
	if (r) {
		DMERR("%s: dm_array_resize failed", __func__);
		writeset_free(&md->writesets[0]);
		writeset_free(&md->writesets[1]);
		return r;
	}

	md->nr_blocks = *new_size;
	return 0;
}

static int metadata_era_archive(struct era_metadata *md)
{
	int r;
	uint64_t keys[1];
	struct writeset_disk value;

	r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
			    &md->current_writeset->md.root);
	if (r) {
		DMERR("%s: dm_bitset_flush failed", __func__);
		return r;
	}

	ws_pack(&md->current_writeset->md, &value);

	keys[0] = md->current_era;
	__dm_bless_for_disk(&value);
	r = dm_btree_insert(&md->writeset_tree_info, md->writeset_tree_root,
			    keys, &value, &md->writeset_tree_root);
	if (r) {
		DMERR("%s: couldn't insert writeset into btree", __func__);
		/* FIXME: fail mode */
		return r;
	}

	md->current_writeset->md.root = INVALID_WRITESET_ROOT;
	md->archived_writesets = true;

	return 0;
}

static struct writeset *next_writeset(struct era_metadata *md)
{
	return (md->current_writeset == &md->writesets[0]) ?
		&md->writesets[1] : &md->writesets[0];
}

static int metadata_new_era(struct era_metadata *md)
{
	int r;
	struct writeset *new_writeset = next_writeset(md);

	r = writeset_init(&md->bitset_info, new_writeset, md->nr_blocks);
	if (r) {
		DMERR("%s: writeset_init failed", __func__);
		return r;
	}

	swap_writeset(md, new_writeset);
	md->current_era++;

	return 0;
}

static int metadata_era_rollover(struct era_metadata *md)
{
	int r;

	if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
		r = metadata_era_archive(md);
		if (r) {
			DMERR("%s: metadata_era_archive failed", __func__);
			/* FIXME: fail mode? */
			return r;
		}
	}

	r = metadata_new_era(md);
	if (r) {
		DMERR("%s: new era failed", __func__);
		/* FIXME: fail mode */
		return r;
	}

	return 0;
}

static bool metadata_current_marked(struct era_metadata *md, dm_block_t block)
{
	bool r;
	struct writeset *ws;

	rcu_read_lock();
	ws = rcu_dereference(md->current_writeset);
	r = writeset_marked(ws, block);
	rcu_read_unlock();

	return r;
}

static int metadata_commit(struct era_metadata *md)
{
	int r;
	struct dm_block *sblock;

	if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) {
		r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root,
				    &md->current_writeset->md.root);
		if (r) {
			DMERR("%s: bitset flush failed", __func__);
			return r;
		}
	}

	r = dm_tm_pre_commit(md->tm);
	if (r) {
		DMERR("%s: pre commit failed", __func__);
		return r;
	}

	r = save_sm_root(md);
	if (r) {
		DMERR("%s: save_sm_root failed", __func__);
		return r;
	}

	r = superblock_lock(md, &sblock);
	if (r) {
		DMERR("%s: superblock lock failed", __func__);
		return r;
	}

	prepare_superblock(md, dm_block_data(sblock));

	return dm_tm_commit(md->tm, sblock);
}

static int metadata_checkpoint(struct era_metadata *md)
{
	/*
	 * For now we just rollover, but later I want to put a check in to
	 * avoid this if the filter is still pretty fresh.
	 */
	return metadata_era_rollover(md);
}

/*
 * Metadata snapshots allow userland to access era data.
 */
static int metadata_take_snap(struct era_metadata *md)
{
	int r, inc;
	struct dm_block *clone;

	if (md->metadata_snap != SUPERBLOCK_LOCATION) {
		DMERR("%s: metadata snapshot already exists", __func__);
		return -EINVAL;
	}

	r = metadata_era_rollover(md);
	if (r) {
		DMERR("%s: era rollover failed", __func__);
		return r;
	}

	r = metadata_commit(md);
	if (r) {
		DMERR("%s: commit failed", __func__);
		return r;
	}

	r = dm_sm_inc_block(md->sm, SUPERBLOCK_LOCATION);
	if (r) {
		DMERR("%s: couldn't increment superblock", __func__);
		return r;
	}

	r = dm_tm_shadow_block(md->tm, SUPERBLOCK_LOCATION,
			       &sb_validator, &clone, &inc);
	if (r) {
		DMERR("%s: couldn't shadow superblock", __func__);
		dm_sm_dec_block(md->sm, SUPERBLOCK_LOCATION);
		return r;
	}
	BUG_ON(!inc);

	r = dm_sm_inc_block(md->sm, md->writeset_tree_root);
	if (r) {
		DMERR("%s: couldn't inc writeset tree root", __func__);
		dm_tm_unlock(md->tm, clone);
		return r;
	}

	r = dm_sm_inc_block(md->sm, md->era_array_root);
	if (r) {
		DMERR("%s: couldn't inc era array root", __func__);
		dm_sm_dec_block(md->sm, md->writeset_tree_root);
		dm_tm_unlock(md->tm, clone);
		return r;
	}

	md->metadata_snap = dm_block_location(clone);

	dm_tm_unlock(md->tm, clone);

	return 0;
}

static int metadata_drop_snap(struct era_metadata *md)
{
	int r;
	dm_block_t location;
	struct dm_block *clone;
	struct superblock_disk *disk;

	if (md->metadata_snap == SUPERBLOCK_LOCATION) {
		DMERR("%s: no snap to drop", __func__);
		return -EINVAL;
	}

	r = dm_tm_read_lock(md->tm, md->metadata_snap, &sb_validator, &clone);
	if (r) {
		DMERR("%s: couldn't read lock superblock clone", __func__);
		return r;
	}

	/*
	 * Whatever happens now we'll commit with no record of the metadata
	 * snap.
	 */
	md->metadata_snap = SUPERBLOCK_LOCATION;

	disk = dm_block_data(clone);
	r = dm_btree_del(&md->writeset_tree_info,
			 le64_to_cpu(disk->writeset_tree_root));
	if (r) {
		DMERR("%s: error deleting writeset tree clone", __func__);
		dm_tm_unlock(md->tm, clone);
		return r;
	}

	r = dm_array_del(&md->era_array_info, le64_to_cpu(disk->era_array_root));
	if (r) {
		DMERR("%s: error deleting era array clone", __func__);
		dm_tm_unlock(md->tm, clone);
		return r;
	}

	location = dm_block_location(clone);
	dm_tm_unlock(md->tm, clone);

	return dm_sm_dec_block(md->sm, location);
}

struct metadata_stats {
	dm_block_t used;
	dm_block_t total;
	dm_block_t snap;
	uint32_t era;
};

static int metadata_get_stats(struct era_metadata *md, void *ptr)
{
	int r;
	struct metadata_stats *s = ptr;
	dm_block_t nr_free, nr_total;

	r = dm_sm_get_nr_free(md->sm, &nr_free);
	if (r) {
		DMERR("dm_sm_get_nr_free returned %d", r);
		return r;
	}

	r = dm_sm_get_nr_blocks(md->sm, &nr_total);
	if (r) {
		DMERR("dm_sm_get_nr_blocks returned %d", r);
		return r;
	}

	s->used = nr_total - nr_free;
	s->total = nr_total;
	s->snap = md->metadata_snap;
	s->era = md->current_era;

	return 0;
}

/*----------------------------------------------------------------*/

struct era {
	struct dm_target *ti;

	struct dm_dev *metadata_dev;
	struct dm_dev *origin_dev;

	dm_block_t nr_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;
	struct era_metadata *md;

	struct workqueue_struct *wq;
	struct work_struct worker;

	spinlock_t deferred_lock;
	struct bio_list deferred_bios;

	spinlock_t rpc_lock;
	struct list_head rpc_calls;

	struct digest digest;
	atomic_t suspended;
};

struct rpc {
	struct list_head list;

	int (*fn0)(struct era_metadata *);
	int (*fn1)(struct era_metadata *, void *);
	void *arg;
	int result;

	struct completion complete;
};

/*----------------------------------------------------------------
 * Remapping.
 *---------------------------------------------------------------*/
static bool block_size_is_power_of_two(struct era *era)
{
	return era->sectors_per_block_shift >= 0;
}

static dm_block_t get_block(struct era *era, struct bio *bio)
{
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (!block_size_is_power_of_two(era))
		(void) sector_div(block_nr, era->sectors_per_block);
	else
		block_nr >>= era->sectors_per_block_shift;

	return block_nr;
}
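
/*
 * E.g. with sectors_per_block == 8 (shift 3), a bio starting at sector
 * 1000 maps to block 125; a non-power-of-two block size falls back to
 * the slower sector_div().
 */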

static void remap_to_origin(struct era *era, struct bio *bio)
{
	bio_set_dev(bio, era->origin_dev->bdev);
}

/*----------------------------------------------------------------
 * Worker thread
 *--------------------------------------------------------------*/
static void wake_worker(struct era *era)
{
	if (!atomic_read(&era->suspended))
		queue_work(era->wq, &era->worker);
}

static void process_old_eras(struct era *era)
{
	int r;

	if (!era->digest.step)
		return;

	r = era->digest.step(era->md, &era->digest);
	if (r < 0) {
		DMERR("%s: digest step failed, stopping digestion", __func__);
		era->digest.step = NULL;

	} else if (era->digest.step)
		wake_worker(era);
}

static void process_deferred_bios(struct era *era)
{
	int r;
	struct bio_list deferred_bios, marked_bios;
	struct bio *bio;
	struct blk_plug plug;
	bool commit_needed = false;
	bool failed = false;
	struct writeset *ws = era->md->current_writeset;

	bio_list_init(&deferred_bios);
	bio_list_init(&marked_bios);

	spin_lock(&era->deferred_lock);
	bio_list_merge(&deferred_bios, &era->deferred_bios);
	bio_list_init(&era->deferred_bios);
	spin_unlock(&era->deferred_lock);

	if (bio_list_empty(&deferred_bios))
		return;

	while ((bio = bio_list_pop(&deferred_bios))) {
		r = writeset_test_and_set(&era->md->bitset_info, ws,
					  get_block(era, bio));
		if (r < 0) {
			/*
			 * This is bad news, we need to rollback.
			 * FIXME: finish.
			 */
			failed = true;
		} else if (r == 0)
			commit_needed = true;

		bio_list_add(&marked_bios, bio);
	}

	if (commit_needed) {
		r = metadata_commit(era->md);
		if (r)
			failed = true;
	}

	if (failed)
		while ((bio = bio_list_pop(&marked_bios)))
			bio_io_error(bio);
	else {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&marked_bios))) {
			/*
			 * Only update the in-core writeset if the on-disk one
			 * was updated too.
			 */
			if (commit_needed)
				set_bit(get_block(era, bio), ws->bits);
			submit_bio_noacct(bio);
		}
		blk_finish_plug(&plug);
	}
}

static void process_rpc_calls(struct era *era)
{
	int r;
	bool need_commit = false;
	struct list_head calls;
	struct rpc *rpc, *tmp;

	INIT_LIST_HEAD(&calls);
	spin_lock(&era->rpc_lock);
	list_splice_init(&era->rpc_calls, &calls);
	spin_unlock(&era->rpc_lock);

	list_for_each_entry_safe(rpc, tmp, &calls, list) {
		rpc->result = rpc->fn0 ? rpc->fn0(era->md) : rpc->fn1(era->md, rpc->arg);
		need_commit = true;
	}

	if (need_commit) {
		r = metadata_commit(era->md);
		if (r)
			list_for_each_entry_safe(rpc, tmp, &calls, list)
				rpc->result = r;
	}

	list_for_each_entry_safe(rpc, tmp, &calls, list)
		complete(&rpc->complete);
}

static void kick_off_digest(struct era *era)
{
	if (era->md->archived_writesets) {
		era->md->archived_writesets = false;
		metadata_digest_start(era->md, &era->digest);
	}
}

static void do_work(struct work_struct *ws)
{
	struct era *era = container_of(ws, struct era, worker);

	kick_off_digest(era);
	process_old_eras(era);
	process_deferred_bios(era);
	process_rpc_calls(era);
}

static void defer_bio(struct era *era, struct bio *bio)
{
	spin_lock(&era->deferred_lock);
	bio_list_add(&era->deferred_bios, bio);
	spin_unlock(&era->deferred_lock);

	wake_worker(era);
}

/*
 * Make an rpc call to the worker to change the metadata.
 */
static int perform_rpc(struct era *era, struct rpc *rpc)
{
	rpc->result = 0;
	init_completion(&rpc->complete);

	spin_lock(&era->rpc_lock);
	list_add(&rpc->list, &era->rpc_calls);
	spin_unlock(&era->rpc_lock);

	wake_worker(era);
	wait_for_completion(&rpc->complete);

	return rpc->result;
}
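
/*
 * Funnelling all metadata updates through the single worker thread like
 * this means the era_metadata functions never race with each other, so
 * they need no locking of their own.
 */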

static int in_worker0(struct era *era, int (*fn)(struct era_metadata *))
{
	struct rpc rpc;
	rpc.fn0 = fn;
	rpc.fn1 = NULL;

	return perform_rpc(era, &rpc);
}

static int in_worker1(struct era *era,
		      int (*fn)(struct era_metadata *, void *), void *arg)
{
	struct rpc rpc;
	rpc.fn0 = NULL;
	rpc.fn1 = fn;
	rpc.arg = arg;

	return perform_rpc(era, &rpc);
}

static void start_worker(struct era *era)
{
	atomic_set(&era->suspended, 0);
}

static void stop_worker(struct era *era)
{
	atomic_set(&era->suspended, 1);
	drain_workqueue(era->wq);
}

/*----------------------------------------------------------------
 * Target methods
 *--------------------------------------------------------------*/
static void era_destroy(struct era *era)
{
	if (era->md)
		metadata_close(era->md);

	if (era->wq)
		destroy_workqueue(era->wq);

	if (era->origin_dev)
		dm_put_device(era->ti, era->origin_dev);

	if (era->metadata_dev)
		dm_put_device(era->ti, era->metadata_dev);

	kfree(era);
}

static dm_block_t calc_nr_blocks(struct era *era)
{
	return dm_sector_div_up(era->ti->len, era->sectors_per_block);
}

static bool valid_block_size(dm_block_t block_size)
{
	bool greater_than_zero = block_size > 0;
	bool multiple_of_min_block_size = (block_size & (MIN_BLOCK_SIZE - 1)) == 0;

	return greater_than_zero && multiple_of_min_block_size;
}

/*
 * <metadata dev> <data dev> <data block size (sectors)>
 */
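/*
 * For example (device names here are illustrative, not from this file):
 *
 *   dmsetup create my-era --table \
 *     "0 $(blockdev --getsz /dev/vdb) era /dev/vda /dev/vdb 4096"
 *
 * creates an era device over /dev/vdb with its metadata on /dev/vda and
 * a block size of 4096 sectors (2 MiB).
 */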
static int era_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r;
	char dummy;
	struct era *era;
	struct era_metadata *md;

	if (argc != 3) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	era = kzalloc(sizeof(*era), GFP_KERNEL);
	if (!era) {
		ti->error = "Error allocating era structure";
		return -ENOMEM;
	}

	era->ti = ti;

	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev);
	if (r) {
		ti->error = "Error opening metadata device";
		era_destroy(era);
		return -EINVAL;
	}

	r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev);
	if (r) {
		ti->error = "Error opening data device";
		era_destroy(era);
		return -EINVAL;
	}

	r = sscanf(argv[2], "%u%c", &era->sectors_per_block, &dummy);
	if (r != 1) {
		ti->error = "Error parsing block size";
		era_destroy(era);
		return -EINVAL;
	}

	r = dm_set_target_max_io_len(ti, era->sectors_per_block);
	if (r) {
		ti->error = "could not set max io len";
		era_destroy(era);
		return -EINVAL;
	}

	if (!valid_block_size(era->sectors_per_block)) {
		ti->error = "Invalid block size";
		era_destroy(era);
		return -EINVAL;
	}
	if (era->sectors_per_block & (era->sectors_per_block - 1))
		era->sectors_per_block_shift = -1;
	else
		era->sectors_per_block_shift = __ffs(era->sectors_per_block);

	md = metadata_open(era->metadata_dev->bdev, era->sectors_per_block, true);
	if (IS_ERR(md)) {
		ti->error = "Error reading metadata";
		era_destroy(era);
		return PTR_ERR(md);
	}
	era->md = md;

	era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!era->wq) {
		ti->error = "could not create workqueue for metadata object";
		era_destroy(era);
		return -ENOMEM;
	}
	INIT_WORK(&era->worker, do_work);

	spin_lock_init(&era->deferred_lock);
	bio_list_init(&era->deferred_bios);

	spin_lock_init(&era->rpc_lock);
	INIT_LIST_HEAD(&era->rpc_calls);

	ti->private = era;
	ti->num_flush_bios = 1;
	ti->flush_supported = true;

	ti->num_discard_bios = 1;

	return 0;
}

static void era_dtr(struct dm_target *ti)
{
	era_destroy(ti->private);
}

static int era_map(struct dm_target *ti, struct bio *bio)
{
	struct era *era = ti->private;
	dm_block_t block = get_block(era, bio);

	/*
	 * All bios get remapped to the origin device.  We do this now, but
	 * it may not get issued until later, depending on whether the
	 * block is marked in this era.
	 */
	remap_to_origin(era, bio);

	/*
	 * REQ_PREFLUSH bios carry no data, so we're not interested in them.
	 */
	if (!(bio->bi_opf & REQ_PREFLUSH) &&
	    (bio_data_dir(bio) == WRITE) &&
	    !metadata_current_marked(era->md, block)) {
		defer_bio(era, bio);
		return DM_MAPIO_SUBMITTED;
	}

	return DM_MAPIO_REMAPPED;
}

static void era_postsuspend(struct dm_target *ti)
{
	int r;
	struct era *era = ti->private;

	r = in_worker0(era, metadata_era_archive);
	if (r) {
		DMERR("%s: couldn't archive current era", __func__);
		/* FIXME: fail mode */
	}

	stop_worker(era);

	r = metadata_commit(era->md);
	if (r) {
		DMERR("%s: metadata_commit failed", __func__);
		/* FIXME: fail mode */
	}
}

static int era_preresume(struct dm_target *ti)
{
	int r;
	struct era *era = ti->private;
	dm_block_t new_size = calc_nr_blocks(era);

	if (era->nr_blocks != new_size) {
		r = metadata_resize(era->md, &new_size);
		if (r) {
			DMERR("%s: metadata_resize failed", __func__);
			return r;
		}

		r = metadata_commit(era->md);
		if (r) {
			DMERR("%s: metadata_commit failed", __func__);
			return r;
		}

		era->nr_blocks = new_size;
	}

	start_worker(era);

	r = in_worker0(era, metadata_era_rollover);
	if (r) {
		DMERR("%s: metadata_era_rollover failed", __func__);
		return r;
	}

	return 0;
}

/*
 * Status format:
 *
 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
 * <current era> <held metadata root | '-'>
 */
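/*
 * E.g. "8 24/4096 12 -": 8-sector metadata blocks, 24 of 4096 in use,
 * currently in era 12, with no metadata snapshot held.  (Illustrative
 * values only.)
 */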
static void era_status(struct dm_target *ti, status_type_t type,
		       unsigned status_flags, char *result, unsigned maxlen)
{
	int r;
	struct era *era = ti->private;
	ssize_t sz = 0;
	struct metadata_stats stats;
	char buf[BDEVNAME_SIZE];

	switch (type) {
	case STATUSTYPE_INFO:
		r = in_worker1(era, metadata_get_stats, &stats);
		if (r)
			goto err;

		DMEMIT("%u %llu/%llu %u",
		       (unsigned) (DM_ERA_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
		       (unsigned long long) stats.used,
		       (unsigned long long) stats.total,
		       (unsigned) stats.era);

		if (stats.snap != SUPERBLOCK_LOCATION)
			DMEMIT(" %llu", stats.snap);
		else
			DMEMIT(" -");
		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, era->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, era->origin_dev->bdev->bd_dev);
		DMEMIT("%s %u", buf, era->sectors_per_block);
		break;

	case STATUSTYPE_IMA:
		*result = '\0';
		break;
	}

	return;

err:
	DMEMIT("Error");
}

static int era_message(struct dm_target *ti, unsigned argc, char **argv,
		       char *result, unsigned maxlen)
{
	struct era *era = ti->private;

	if (argc != 1) {
		DMERR("incorrect number of message arguments");
		return -EINVAL;
	}

	if (!strcasecmp(argv[0], "checkpoint"))
		return in_worker0(era, metadata_checkpoint);

	if (!strcasecmp(argv[0], "take_metadata_snap"))
		return in_worker0(era, metadata_take_snap);

	if (!strcasecmp(argv[0], "drop_metadata_snap"))
		return in_worker0(era, metadata_drop_snap);

	DMERR("unsupported message '%s'", argv[0]);
	return -EINVAL;
}
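
/*
 * Messages are sent with dmsetup, e.g. (device name illustrative):
 *
 *   dmsetup message my-era 0 take_metadata_snap
 *
 * after which a userland tool can read the era data via the snapshot,
 * followed by "dmsetup message my-era 0 drop_metadata_snap".
 */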

static sector_t get_dev_size(struct dm_dev *dev)
{
	return bdev_nr_sectors(dev->bdev);
}

static int era_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct era *era = ti->private;
	return fn(ti, era->origin_dev, 0, get_dev_size(era->origin_dev), data);
}

static void era_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct era *era = ti->private;
	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If the system-determined stacked limits are compatible with the
	 * era device's blocksize (io_opt is a factor) do not override them.
	 */
	if (io_opt_sectors < era->sectors_per_block ||
	    do_div(io_opt_sectors, era->sectors_per_block)) {
		blk_limits_io_min(limits, 0);
		blk_limits_io_opt(limits, era->sectors_per_block << SECTOR_SHIFT);
	}
}

/*----------------------------------------------------------------*/

static struct target_type era_target = {
	.name = "era",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = era_ctr,
	.dtr = era_dtr,
	.map = era_map,
	.postsuspend = era_postsuspend,
	.preresume = era_preresume,
	.status = era_status,
	.message = era_message,
	.iterate_devices = era_iterate_devices,
	.io_hints = era_io_hints
};

static int __init dm_era_init(void)
{
	int r;

	r = dm_register_target(&era_target);
	if (r) {
		DMERR("era target registration failed: %d", r);
		return r;
	}

	return 0;
}

static void __exit dm_era_exit(void)
{
	dm_unregister_target(&era_target);
}

module_init(dm_era_init);
module_exit(dm_era_exit);

MODULE_DESCRIPTION(DM_NAME " era target");
MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
MODULE_LICENSE("GPL");