cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

dm-zoned-metadata.c (73635B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
      4 *
      5 * This file is released under the GPL.
      6 */
      7
      8#include "dm-zoned.h"
      9
     10#include <linux/module.h>
     11#include <linux/crc32.h>
     12#include <linux/sched/mm.h>
     13
     14#define	DM_MSG_PREFIX		"zoned metadata"
     15
     16/*
     17 * Metadata version.
     18 */
     19#define DMZ_META_VER	2
     20
     21/*
     22 * On-disk super block magic.
     23 */
     24#define DMZ_MAGIC	((((unsigned int)('D')) << 24) | \
     25			 (((unsigned int)('Z')) << 16) | \
     26			 (((unsigned int)('B')) <<  8) | \
     27			 ((unsigned int)('D')))
     28
     29/*
     30 * On-disk super block.
     31 * The super block uses only 512 B but occupies a full 4KB block on disk.
     32 * It is followed on disk by the chunk-to-zone mapping table and by the
     33 * bitmap blocks indicating zone block validity.
     34 * The overall resulting metadata format is:
     35 *    (1) Super block (1 block)
     36 *    (2) Chunk mapping table (nr_map_blocks)
     37 *    (3) Bitmap blocks (nr_bitmap_blocks)
     38 * All metadata blocks are stored in conventional zones, starting from
     39 * the first conventional zone found on disk.
     40 */
     41struct dmz_super {
     42	/* Magic number */
     43	__le32		magic;			/*   4 */
     44
     45	/* Metadata version number */
     46	__le32		version;		/*   8 */
     47
     48	/* Generation number */
     49	__le64		gen;			/*  16 */
     50
     51	/* This block number */
     52	__le64		sb_block;		/*  24 */
     53
     54	/* The number of metadata blocks, including this super block */
     55	__le32		nr_meta_blocks;		/*  28 */
     56
     57	/* The number of sequential zones reserved for reclaim */
     58	__le32		nr_reserved_seq;	/*  32 */
     59
     60	/* The number of entries in the mapping table */
     61	__le32		nr_chunks;		/*  36 */
     62
     63	/* The number of blocks used for the chunk mapping table */
     64	__le32		nr_map_blocks;		/*  40 */
     65
     66	/* The number of blocks used for the block bitmaps */
     67	__le32		nr_bitmap_blocks;	/*  44 */
     68
     69	/* Checksum */
     70	__le32		crc;			/*  48 */
     71
     72	/* DM-Zoned label */
     73	u8		dmz_label[32];		/*  80 */
     74
     75	/* DM-Zoned UUID */
     76	u8		dmz_uuid[16];		/*  96 */
     77
     78	/* Device UUID */
     79	u8		dev_uuid[16];		/* 112 */
     80
     81	/* Padding to full 512B sector */
     82	u8		reserved[400];		/* 512 */
     83};
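/*
 * Worked example (illustrative, not part of the original driver): with the
 * offsets noted above, the structure packs into exactly one 512 B sector,
 * and DMZ_MAGIC evaluates to 0x445A4244, i.e. the characters 'D' 'Z' 'B' 'D'
 * packed most-significant byte first:
 *
 *	BUILD_BUG_ON(sizeof(struct dmz_super) != 512);
 *	BUILD_BUG_ON(DMZ_MAGIC != 0x445A4244);
 *
 * When written, the remaining 3584 B of the 4KB metadata block are left
 * unused, as described in the layout comment above.
 */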
     84
     85/*
     86 * Chunk mapping entry: entries are indexed by chunk number
     87 * and give the zone ID (dzone_id) mapping the chunk on disk.
     88 * This zone may be sequential or random. If it is a sequential
     89 * zone, a second zone (bzone_id) used as a write buffer may
     90 * also be specified. This second zone will always be a randomly
     91 * writeable zone.
     92 */
     93struct dmz_map {
     94	__le32			dzone_id;
     95	__le32			bzone_id;
     96};
     97
     98/*
     99 * Chunk mapping table metadata: 512 8-byte entries per 4KB block.
    100 */
    101#define DMZ_MAP_ENTRIES		(DMZ_BLOCK_SIZE / sizeof(struct dmz_map))
    102#define DMZ_MAP_ENTRIES_SHIFT	(ilog2(DMZ_MAP_ENTRIES))
    103#define DMZ_MAP_ENTRIES_MASK	(DMZ_MAP_ENTRIES - 1)
    104#define DMZ_MAP_UNMAPPED	UINT_MAX
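/*
 * Illustrative arithmetic (not part of the original driver), assuming the
 * 4KB metadata block size defined in dm-zoned.h: sizeof(struct dmz_map) is
 * 8 B, so DMZ_MAP_ENTRIES = 4096 / 8 = 512, DMZ_MAP_ENTRIES_SHIFT = 9 and
 * DMZ_MAP_ENTRIES_MASK = 0x1ff. The entry for chunk 1234 therefore lives in
 * mapping table block 1234 >> 9 = 2, at index 1234 & 0x1ff = 210, and the
 * mapping table itself starts right after the super block (metadata block 1).
 */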
    105
    106/*
    107 * Meta data block descriptor (for cached metadata blocks).
    108 */
    109struct dmz_mblock {
    110	struct rb_node		node;
    111	struct list_head	link;
    112	sector_t		no;
    113	unsigned int		ref;
    114	unsigned long		state;
    115	struct page		*page;
    116	void			*data;
    117};
    118
    119/*
    120 * Metadata block state flags.
    121 */
    122enum {
    123	DMZ_META_DIRTY,
    124	DMZ_META_READING,
    125	DMZ_META_WRITING,
    126	DMZ_META_ERROR,
    127};
    128
    129/*
    130 * Super block information (one per metadata set).
    131 */
    132struct dmz_sb {
    133	sector_t		block;
    134	struct dmz_dev		*dev;
    135	struct dmz_mblock	*mblk;
    136	struct dmz_super	*sb;
    137	struct dm_zone		*zone;
    138};
    139
    140/*
    141 * In-memory metadata.
    142 */
    143struct dmz_metadata {
    144	struct dmz_dev		*dev;
    145	unsigned int		nr_devs;
    146
    147	char			devname[BDEVNAME_SIZE];
    148	char			label[BDEVNAME_SIZE];
    149	uuid_t			uuid;
    150
    151	sector_t		zone_bitmap_size;
    152	unsigned int		zone_nr_bitmap_blocks;
    153	unsigned int		zone_bits_per_mblk;
    154
    155	sector_t		zone_nr_blocks;
    156	sector_t		zone_nr_blocks_shift;
    157
    158	sector_t		zone_nr_sectors;
    159	sector_t		zone_nr_sectors_shift;
    160
    161	unsigned int		nr_bitmap_blocks;
    162	unsigned int		nr_map_blocks;
    163
    164	unsigned int		nr_zones;
    165	unsigned int		nr_useable_zones;
    166	unsigned int		nr_meta_blocks;
    167	unsigned int		nr_meta_zones;
    168	unsigned int		nr_data_zones;
    169	unsigned int		nr_cache_zones;
    170	unsigned int		nr_rnd_zones;
    171	unsigned int		nr_reserved_seq;
    172	unsigned int		nr_chunks;
    173
    174	/* Zone information array */
    175	struct xarray		zones;
    176
    177	struct dmz_sb		sb[2];
    178	unsigned int		mblk_primary;
    179	unsigned int		sb_version;
    180	u64			sb_gen;
    181	unsigned int		min_nr_mblks;
    182	unsigned int		max_nr_mblks;
    183	atomic_t		nr_mblks;
    184	struct rw_semaphore	mblk_sem;
    185	struct mutex		mblk_flush_lock;
    186	spinlock_t		mblk_lock;
    187	struct rb_root		mblk_rbtree;
    188	struct list_head	mblk_lru_list;
    189	struct list_head	mblk_dirty_list;
    190	struct shrinker		mblk_shrinker;
    191
    192	/* Zone allocation management */
    193	struct mutex		map_lock;
    194	struct dmz_mblock	**map_mblk;
    195
    196	unsigned int		nr_cache;
    197	atomic_t		unmap_nr_cache;
    198	struct list_head	unmap_cache_list;
    199	struct list_head	map_cache_list;
    200
    201	atomic_t		nr_reserved_seq_zones;
    202	struct list_head	reserved_seq_zones_list;
    203
    204	wait_queue_head_t	free_wq;
    205};
    206
    207#define dmz_zmd_info(zmd, format, args...)	\
    208	DMINFO("(%s): " format, (zmd)->label, ## args)
    209
    210#define dmz_zmd_err(zmd, format, args...)	\
    211	DMERR("(%s): " format, (zmd)->label, ## args)
    212
    213#define dmz_zmd_warn(zmd, format, args...)	\
    214	DMWARN("(%s): " format, (zmd)->label, ## args)
    215
    216#define dmz_zmd_debug(zmd, format, args...)	\
    217	DMDEBUG("(%s): " format, (zmd)->label, ## args)
    218/*
    219 * Various accessors
    220 */
    221static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone)
    222{
    223	if (WARN_ON(!zone))
    224		return 0;
    225
    226	return zone->id - zone->dev->zone_offset;
    227}
    228
    229sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
    230{
    231	unsigned int zone_id = dmz_dev_zone_id(zmd, zone);
    232
    233	return (sector_t)zone_id << zmd->zone_nr_sectors_shift;
    234}
    235
    236sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
    237{
    238	unsigned int zone_id = dmz_dev_zone_id(zmd, zone);
    239
    240	return (sector_t)zone_id << zmd->zone_nr_blocks_shift;
    241}
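/*
 * Example (illustrative, not part of the original driver): assuming
 * 256 MiB zones, zone_nr_sectors_shift is 19 and zone_nr_blocks_shift is 16,
 * so a zone with device-relative id 5 starts at sector 5 << 19 = 2621440
 * and at block 5 << 16 = 327680 on its device.
 */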
    242
    243unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd)
    244{
    245	return zmd->zone_nr_blocks;
    246}
    247
    248unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd)
    249{
    250	return zmd->zone_nr_blocks_shift;
    251}
    252
    253unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd)
    254{
    255	return zmd->zone_nr_sectors;
    256}
    257
    258unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd)
    259{
    260	return zmd->zone_nr_sectors_shift;
    261}
    262
    263unsigned int dmz_nr_zones(struct dmz_metadata *zmd)
    264{
    265	return zmd->nr_zones;
    266}
    267
    268unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
    269{
    270	return zmd->nr_chunks;
    271}
    272
    273unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx)
    274{
    275	return zmd->dev[idx].nr_rnd;
    276}
    277
    278unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx)
    279{
    280	return atomic_read(&zmd->dev[idx].unmap_nr_rnd);
    281}
    282
    283unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd)
    284{
    285	return zmd->nr_cache;
    286}
    287
    288unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd)
    289{
    290	return atomic_read(&zmd->unmap_nr_cache);
    291}
    292
    293unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx)
    294{
    295	return zmd->dev[idx].nr_seq;
    296}
    297
    298unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx)
    299{
    300	return atomic_read(&zmd->dev[idx].unmap_nr_seq);
    301}
    302
    303static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
    304{
    305	return xa_load(&zmd->zones, zone_id);
    306}
    307
    308static struct dm_zone *dmz_insert(struct dmz_metadata *zmd,
    309				  unsigned int zone_id, struct dmz_dev *dev)
    310{
    311	struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL);
    312
    313	if (!zone)
    314		return ERR_PTR(-ENOMEM);
    315
    316	if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) {
    317		kfree(zone);
    318		return ERR_PTR(-EBUSY);
    319	}
    320
    321	INIT_LIST_HEAD(&zone->link);
    322	atomic_set(&zone->refcount, 0);
    323	zone->id = zone_id;
    324	zone->chunk = DMZ_MAP_UNMAPPED;
    325	zone->dev = dev;
    326
    327	return zone;
    328}
    329
    330const char *dmz_metadata_label(struct dmz_metadata *zmd)
    331{
    332	return (const char *)zmd->label;
    333}
    334
    335bool dmz_check_dev(struct dmz_metadata *zmd)
    336{
    337	unsigned int i;
    338
    339	for (i = 0; i < zmd->nr_devs; i++) {
    340		if (!dmz_check_bdev(&zmd->dev[i]))
    341			return false;
    342	}
    343	return true;
    344}
    345
    346bool dmz_dev_is_dying(struct dmz_metadata *zmd)
    347{
    348	unsigned int i;
    349
    350	for (i = 0; i < zmd->nr_devs; i++) {
    351		if (dmz_bdev_is_dying(&zmd->dev[i]))
    352			return true;
    353	}
    354	return false;
    355}
    356
    357/*
    358 * Lock/unlock mapping table.
    359 * The map lock also protects all the zone lists.
    360 */
    361void dmz_lock_map(struct dmz_metadata *zmd)
    362{
    363	mutex_lock(&zmd->map_lock);
    364}
    365
    366void dmz_unlock_map(struct dmz_metadata *zmd)
    367{
    368	mutex_unlock(&zmd->map_lock);
    369}
    370
    371/*
    372 * Lock/unlock metadata access. This is a "read" lock on a semaphore
    373 * that prevents metadata flush from running while metadata are being
    374 * modified. The actual metadata write mutual exclusion is achieved with
    375 * the map lock and zone state management (active and reclaim state are
    376 * mutually exclusive).
    377 */
    378void dmz_lock_metadata(struct dmz_metadata *zmd)
    379{
    380	down_read(&zmd->mblk_sem);
    381}
    382
    383void dmz_unlock_metadata(struct dmz_metadata *zmd)
    384{
    385	up_read(&zmd->mblk_sem);
    386}
    387
    388/*
    389 * Lock/unlock flush: prevent concurrent executions
    390 * of dmz_flush_metadata as well as metadata modification in reclaim
    391 * while flush is being executed.
    392 */
    393void dmz_lock_flush(struct dmz_metadata *zmd)
    394{
    395	mutex_lock(&zmd->mblk_flush_lock);
    396}
    397
    398void dmz_unlock_flush(struct dmz_metadata *zmd)
    399{
    400	mutex_unlock(&zmd->mblk_flush_lock);
    401}
    402
    403/*
    404 * Allocate a metadata block.
    405 */
    406static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd,
    407					   sector_t mblk_no)
    408{
    409	struct dmz_mblock *mblk = NULL;
    410
    411	/* See if we can reuse cached blocks */
    412	if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) {
    413		spin_lock(&zmd->mblk_lock);
    414		mblk = list_first_entry_or_null(&zmd->mblk_lru_list,
    415						struct dmz_mblock, link);
    416		if (mblk) {
    417			list_del_init(&mblk->link);
    418			rb_erase(&mblk->node, &zmd->mblk_rbtree);
    419			mblk->no = mblk_no;
    420		}
    421		spin_unlock(&zmd->mblk_lock);
    422		if (mblk)
    423			return mblk;
    424	}
    425
    426	/* Allocate a new block */
    427	mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO);
    428	if (!mblk)
    429		return NULL;
    430
    431	mblk->page = alloc_page(GFP_NOIO);
    432	if (!mblk->page) {
    433		kfree(mblk);
    434		return NULL;
    435	}
    436
    437	RB_CLEAR_NODE(&mblk->node);
    438	INIT_LIST_HEAD(&mblk->link);
    439	mblk->ref = 0;
    440	mblk->state = 0;
    441	mblk->no = mblk_no;
    442	mblk->data = page_address(mblk->page);
    443
    444	atomic_inc(&zmd->nr_mblks);
    445
    446	return mblk;
    447}
    448
    449/*
    450 * Free a metadata block.
    451 */
    452static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
    453{
    454	__free_pages(mblk->page, 0);
    455	kfree(mblk);
    456
    457	atomic_dec(&zmd->nr_mblks);
    458}
    459
    460/*
    461 * Insert a metadata block in the rbtree.
    462 */
    463static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
    464{
    465	struct rb_root *root = &zmd->mblk_rbtree;
    466	struct rb_node **new = &(root->rb_node), *parent = NULL;
    467	struct dmz_mblock *b;
    468
    469	/* Figure out where to put the new node */
    470	while (*new) {
    471		b = container_of(*new, struct dmz_mblock, node);
    472		parent = *new;
    473		new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right);
    474	}
    475
    476	/* Add new node and rebalance tree */
    477	rb_link_node(&mblk->node, parent, new);
    478	rb_insert_color(&mblk->node, root);
    479}
    480
    481/*
    482 * Lookup a metadata block in the rbtree. If the block is found, increment
    483 * its reference count.
    484 */
    485static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd,
    486					      sector_t mblk_no)
    487{
    488	struct rb_root *root = &zmd->mblk_rbtree;
    489	struct rb_node *node = root->rb_node;
    490	struct dmz_mblock *mblk;
    491
    492	while (node) {
    493		mblk = container_of(node, struct dmz_mblock, node);
    494		if (mblk->no == mblk_no) {
    495			/*
    496			 * If this is the first reference to the block,
    497			 * remove it from the LRU list.
    498			 */
    499			mblk->ref++;
    500			if (mblk->ref == 1 &&
    501			    !test_bit(DMZ_META_DIRTY, &mblk->state))
    502				list_del_init(&mblk->link);
    503			return mblk;
    504		}
    505		node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right;
    506	}
    507
    508	return NULL;
    509}
    510
    511/*
    512 * Metadata block BIO end callback.
    513 */
    514static void dmz_mblock_bio_end_io(struct bio *bio)
    515{
    516	struct dmz_mblock *mblk = bio->bi_private;
    517	int flag;
    518
    519	if (bio->bi_status)
    520		set_bit(DMZ_META_ERROR, &mblk->state);
    521
    522	if (bio_op(bio) == REQ_OP_WRITE)
    523		flag = DMZ_META_WRITING;
    524	else
    525		flag = DMZ_META_READING;
    526
    527	clear_bit_unlock(flag, &mblk->state);
    528	smp_mb__after_atomic();
    529	wake_up_bit(&mblk->state, flag);
    530
    531	bio_put(bio);
    532}
    533
    534/*
    535 * Read an uncached metadata block from disk and add it to the cache.
    536 */
    537static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
    538					      sector_t mblk_no)
    539{
    540	struct dmz_mblock *mblk, *m;
    541	sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
    542	struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
    543	struct bio *bio;
    544
    545	if (dmz_bdev_is_dying(dev))
    546		return ERR_PTR(-EIO);
    547
    548	/* Get a new block and a BIO to read it */
    549	mblk = dmz_alloc_mblock(zmd, mblk_no);
    550	if (!mblk)
    551		return ERR_PTR(-ENOMEM);
    552
    553	bio = bio_alloc(dev->bdev, 1, REQ_OP_READ | REQ_META | REQ_PRIO,
    554			GFP_NOIO);
    555
    556	spin_lock(&zmd->mblk_lock);
    557
    558	/*
    559	 * Make sure that another context did not start reading
    560	 * the block already.
    561	 */
    562	m = dmz_get_mblock_fast(zmd, mblk_no);
    563	if (m) {
    564		spin_unlock(&zmd->mblk_lock);
    565		dmz_free_mblock(zmd, mblk);
    566		bio_put(bio);
    567		return m;
    568	}
    569
    570	mblk->ref++;
    571	set_bit(DMZ_META_READING, &mblk->state);
    572	dmz_insert_mblock(zmd, mblk);
    573
    574	spin_unlock(&zmd->mblk_lock);
    575
    576	/* Submit read BIO */
    577	bio->bi_iter.bi_sector = dmz_blk2sect(block);
    578	bio->bi_private = mblk;
    579	bio->bi_end_io = dmz_mblock_bio_end_io;
    580	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
    581	submit_bio(bio);
    582
    583	return mblk;
    584}
    585
    586/*
    587 * Free metadata blocks.
    588 */
    589static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd,
    590					     unsigned long limit)
    591{
    592	struct dmz_mblock *mblk;
    593	unsigned long count = 0;
    594
    595	if (!zmd->max_nr_mblks)
    596		return 0;
    597
    598	while (!list_empty(&zmd->mblk_lru_list) &&
    599	       atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks &&
    600	       count < limit) {
    601		mblk = list_first_entry(&zmd->mblk_lru_list,
    602					struct dmz_mblock, link);
    603		list_del_init(&mblk->link);
    604		rb_erase(&mblk->node, &zmd->mblk_rbtree);
    605		dmz_free_mblock(zmd, mblk);
    606		count++;
    607	}
    608
    609	return count;
    610}
    611
    612/*
    613 * For mblock shrinker: get the number of unused metadata blocks in the cache.
    614 */
    615static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
    616					       struct shrink_control *sc)
    617{
    618	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
    619
    620	return atomic_read(&zmd->nr_mblks);
    621}
    622
    623/*
    624 * For mblock shrinker: scan unused metadata blocks and shrink the cache.
    625 */
    626static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink,
    627					      struct shrink_control *sc)
    628{
    629	struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
    630	unsigned long count;
    631
    632	spin_lock(&zmd->mblk_lock);
    633	count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan);
    634	spin_unlock(&zmd->mblk_lock);
    635
    636	return count ? count : SHRINK_STOP;
    637}
    638
    639/*
    640 * Release a metadata block.
    641 */
    642static void dmz_release_mblock(struct dmz_metadata *zmd,
    643			       struct dmz_mblock *mblk)
    644{
    645
    646	if (!mblk)
    647		return;
    648
    649	spin_lock(&zmd->mblk_lock);
    650
    651	mblk->ref--;
    652	if (mblk->ref == 0) {
    653		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
    654			rb_erase(&mblk->node, &zmd->mblk_rbtree);
    655			dmz_free_mblock(zmd, mblk);
    656		} else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) {
    657			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
    658			dmz_shrink_mblock_cache(zmd, 1);
    659		}
    660	}
    661
    662	spin_unlock(&zmd->mblk_lock);
    663}
    664
    665/*
    666 * Get a metadata block from the rbtree. If the block
    667 * is not present, read it from disk.
    668 */
    669static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
    670					 sector_t mblk_no)
    671{
    672	struct dmz_mblock *mblk;
    673	struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
    674
    675	/* Check rbtree */
    676	spin_lock(&zmd->mblk_lock);
    677	mblk = dmz_get_mblock_fast(zmd, mblk_no);
    678	spin_unlock(&zmd->mblk_lock);
    679
    680	if (!mblk) {
    681		/* Cache miss: read the block from disk */
    682		mblk = dmz_get_mblock_slow(zmd, mblk_no);
    683		if (IS_ERR(mblk))
    684			return mblk;
    685	}
    686
    687	/* Wait for on-going read I/O and check for error */
    688	wait_on_bit_io(&mblk->state, DMZ_META_READING,
    689		       TASK_UNINTERRUPTIBLE);
    690	if (test_bit(DMZ_META_ERROR, &mblk->state)) {
    691		dmz_release_mblock(zmd, mblk);
    692		dmz_check_bdev(dev);
    693		return ERR_PTR(-EIO);
    694	}
    695
    696	return mblk;
    697}
    698
    699/*
    700 * Mark a metadata block dirty.
    701 */
    702static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
    703{
    704	spin_lock(&zmd->mblk_lock);
    705	if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state))
    706		list_add_tail(&mblk->link, &zmd->mblk_dirty_list);
    707	spin_unlock(&zmd->mblk_lock);
    708}
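/*
 * Illustrative usage sketch (not part of the original driver): metadata
 * updates typically follow a get / modify / dirty / release pattern, with
 * the actual disk write deferred to dmz_flush_metadata():
 *
 *	mblk = dmz_get_mblock(zmd, mblk_no);
 *	if (IS_ERR(mblk))
 *		return PTR_ERR(mblk);
 *	// ... modify mblk->data ...
 *	dmz_dirty_mblock(zmd, mblk);	// queue on the dirty list
 *	dmz_release_mblock(zmd, mblk);	// drop the reference
 */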
    709
    710/*
    711 * Issue a metadata block write BIO.
    712 */
    713static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
    714			    unsigned int set)
    715{
    716	struct dmz_dev *dev = zmd->sb[set].dev;
    717	sector_t block = zmd->sb[set].block + mblk->no;
    718	struct bio *bio;
    719
    720	if (dmz_bdev_is_dying(dev))
    721		return -EIO;
    722
    723	bio = bio_alloc(dev->bdev, 1, REQ_OP_WRITE | REQ_META | REQ_PRIO,
    724			GFP_NOIO);
    725
    726	set_bit(DMZ_META_WRITING, &mblk->state);
    727
    728	bio->bi_iter.bi_sector = dmz_blk2sect(block);
    729	bio->bi_private = mblk;
    730	bio->bi_end_io = dmz_mblock_bio_end_io;
    731	bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
    732	submit_bio(bio);
    733
    734	return 0;
    735}
    736
    737/*
    738 * Read/write a metadata block.
    739 */
    740static int dmz_rdwr_block(struct dmz_dev *dev, int op,
    741			  sector_t block, struct page *page)
    742{
    743	struct bio *bio;
    744	int ret;
    745
    746	if (WARN_ON(!dev))
    747		return -EIO;
    748
    749	if (dmz_bdev_is_dying(dev))
    750		return -EIO;
    751
    752	bio = bio_alloc(dev->bdev, 1, op | REQ_SYNC | REQ_META | REQ_PRIO,
    753			GFP_NOIO);
    754	bio->bi_iter.bi_sector = dmz_blk2sect(block);
    755	bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
    756	ret = submit_bio_wait(bio);
    757	bio_put(bio);
    758
    759	if (ret)
    760		dmz_check_bdev(dev);
    761	return ret;
    762}
    763
    764/*
    765 * Write super block of the specified metadata set.
    766 */
    767static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
    768{
    769	struct dmz_mblock *mblk = zmd->sb[set].mblk;
    770	struct dmz_super *sb = zmd->sb[set].sb;
    771	struct dmz_dev *dev = zmd->sb[set].dev;
    772	sector_t sb_block;
    773	u64 sb_gen = zmd->sb_gen + 1;
    774	int ret;
    775
    776	sb->magic = cpu_to_le32(DMZ_MAGIC);
    777
    778	sb->version = cpu_to_le32(zmd->sb_version);
    779	if (zmd->sb_version > 1) {
    780		BUILD_BUG_ON(UUID_SIZE != 16);
    781		export_uuid(sb->dmz_uuid, &zmd->uuid);
    782		memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE);
    783		export_uuid(sb->dev_uuid, &dev->uuid);
    784	}
    785
    786	sb->gen = cpu_to_le64(sb_gen);
    787
    788	/*
    789	 * The metadata always references the absolute block address,
     790	 * i.e. relative to the entire block range, not the per-device
    791	 * block address.
    792	 */
    793	sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift;
    794	sb->sb_block = cpu_to_le64(sb_block);
    795	sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
    796	sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
    797	sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);
    798
    799	sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks);
    800	sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks);
    801
    802	sb->crc = 0;
    803	sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));
    804
    805	ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
    806			     mblk->page);
    807	if (ret == 0)
    808		ret = blkdev_issue_flush(dev->bdev);
    809
    810	return ret;
    811}
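/*
 * Checksum note (illustrative, not part of the original driver): the CRC is
 * seeded with the new generation number and covers the whole 4KB block with
 * the crc field zeroed, so a reader can verify it the same way dmz_check_sb()
 * does:
 *
 *	stored_crc = le32_to_cpu(sb->crc);
 *	sb->crc = 0;
 *	if (crc32_le(le64_to_cpu(sb->gen), (unsigned char *)sb,
 *		     DMZ_BLOCK_SIZE) != stored_crc)
 *		// reject the super block
 */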
    812
    813/*
    814 * Write dirty metadata blocks to the specified set.
    815 */
    816static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
    817				   struct list_head *write_list,
    818				   unsigned int set)
    819{
    820	struct dmz_mblock *mblk;
    821	struct dmz_dev *dev = zmd->sb[set].dev;
    822	struct blk_plug plug;
    823	int ret = 0, nr_mblks_submitted = 0;
    824
    825	/* Issue writes */
    826	blk_start_plug(&plug);
    827	list_for_each_entry(mblk, write_list, link) {
    828		ret = dmz_write_mblock(zmd, mblk, set);
    829		if (ret)
    830			break;
    831		nr_mblks_submitted++;
    832	}
    833	blk_finish_plug(&plug);
    834
    835	/* Wait for completion */
    836	list_for_each_entry(mblk, write_list, link) {
    837		if (!nr_mblks_submitted)
    838			break;
    839		wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
    840			       TASK_UNINTERRUPTIBLE);
    841		if (test_bit(DMZ_META_ERROR, &mblk->state)) {
    842			clear_bit(DMZ_META_ERROR, &mblk->state);
    843			dmz_check_bdev(dev);
    844			ret = -EIO;
    845		}
    846		nr_mblks_submitted--;
    847	}
    848
    849	/* Flush drive cache (this will also sync data) */
    850	if (ret == 0)
    851		ret = blkdev_issue_flush(dev->bdev);
    852
    853	return ret;
    854}
    855
    856/*
    857 * Log dirty metadata blocks.
    858 */
    859static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd,
    860				 struct list_head *write_list)
    861{
    862	unsigned int log_set = zmd->mblk_primary ^ 0x1;
    863	int ret;
    864
    865	/* Write dirty blocks to the log */
    866	ret = dmz_write_dirty_mblocks(zmd, write_list, log_set);
    867	if (ret)
    868		return ret;
    869
    870	/*
    871	 * No error so far: now validate the log by updating the
    872	 * log index super block generation.
    873	 */
    874	ret = dmz_write_sb(zmd, log_set);
    875	if (ret)
    876		return ret;
    877
    878	return 0;
    879}
    880
    881/*
    882 * Flush dirty metadata blocks.
    883 */
    884int dmz_flush_metadata(struct dmz_metadata *zmd)
    885{
    886	struct dmz_mblock *mblk;
    887	struct list_head write_list;
    888	struct dmz_dev *dev;
    889	int ret;
    890
    891	if (WARN_ON(!zmd))
    892		return 0;
    893
    894	INIT_LIST_HEAD(&write_list);
    895
    896	/*
    897	 * Make sure that metadata blocks are stable before logging: take
    898	 * the write lock on the metadata semaphore to prevent target BIOs
    899	 * from modifying metadata.
    900	 */
    901	down_write(&zmd->mblk_sem);
    902	dev = zmd->sb[zmd->mblk_primary].dev;
    903
    904	/*
    905	 * This is called from the target flush work and reclaim work.
    906	 * Concurrent execution is not allowed.
    907	 */
    908	dmz_lock_flush(zmd);
    909
    910	if (dmz_bdev_is_dying(dev)) {
    911		ret = -EIO;
    912		goto out;
    913	}
    914
    915	/* Get dirty blocks */
    916	spin_lock(&zmd->mblk_lock);
    917	list_splice_init(&zmd->mblk_dirty_list, &write_list);
    918	spin_unlock(&zmd->mblk_lock);
    919
    920	/* If there are no dirty metadata blocks, just flush the device cache */
    921	if (list_empty(&write_list)) {
    922		ret = blkdev_issue_flush(dev->bdev);
    923		goto err;
    924	}
    925
    926	/*
    927	 * The primary metadata set is still clean. Keep it this way until
    928	 * all updates are successful in the secondary set. That is, use
    929	 * the secondary set as a log.
    930	 */
    931	ret = dmz_log_dirty_mblocks(zmd, &write_list);
    932	if (ret)
    933		goto err;
    934
    935	/*
    936	 * The log is on disk. It is now safe to update in place
    937	 * in the primary metadata set.
    938	 */
    939	ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
    940	if (ret)
    941		goto err;
    942
    943	ret = dmz_write_sb(zmd, zmd->mblk_primary);
    944	if (ret)
    945		goto err;
    946
    947	while (!list_empty(&write_list)) {
    948		mblk = list_first_entry(&write_list, struct dmz_mblock, link);
    949		list_del_init(&mblk->link);
    950
    951		spin_lock(&zmd->mblk_lock);
    952		clear_bit(DMZ_META_DIRTY, &mblk->state);
    953		if (mblk->ref == 0)
    954			list_add_tail(&mblk->link, &zmd->mblk_lru_list);
    955		spin_unlock(&zmd->mblk_lock);
    956	}
    957
    958	zmd->sb_gen++;
    959out:
    960	dmz_unlock_flush(zmd);
    961	up_write(&zmd->mblk_sem);
    962
    963	return ret;
    964
    965err:
    966	if (!list_empty(&write_list)) {
    967		spin_lock(&zmd->mblk_lock);
    968		list_splice(&write_list, &zmd->mblk_dirty_list);
    969		spin_unlock(&zmd->mblk_lock);
    970	}
    971	if (!dmz_check_bdev(dev))
    972		ret = -EIO;
    973	goto out;
    974}
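/*
 * Ordering sketch (illustrative, not part of the original driver): a flush
 * with generation N in both sets proceeds as
 *
 *	1) write the dirty blocks to the secondary set (the log),
 *	2) write the secondary super block with generation N + 1,
 *	3) write the same dirty blocks to the primary set in place,
 *	4) write the primary super block with generation N + 1.
 *
 * If the target crashes before step 2 completes, the primary set is still
 * consistent at generation N; after step 2, dmz_load_sb() picks the set with
 * the highest generation, so either set gives a consistent view.
 */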
    975
    976/*
    977 * Check super block.
    978 */
    979static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb,
    980			bool tertiary)
    981{
    982	struct dmz_super *sb = dsb->sb;
    983	struct dmz_dev *dev = dsb->dev;
    984	unsigned int nr_meta_zones, nr_data_zones;
    985	u32 crc, stored_crc;
    986	u64 gen, sb_block;
    987
    988	if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
    989		dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
    990			    DMZ_MAGIC, le32_to_cpu(sb->magic));
    991		return -ENXIO;
    992	}
    993
    994	zmd->sb_version = le32_to_cpu(sb->version);
    995	if (zmd->sb_version > DMZ_META_VER) {
    996		dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
    997			    DMZ_META_VER, zmd->sb_version);
    998		return -EINVAL;
    999	}
   1000	if (zmd->sb_version < 2 && tertiary) {
   1001		dmz_dev_err(dev, "Tertiary superblocks are not supported");
   1002		return -EINVAL;
   1003	}
   1004
   1005	gen = le64_to_cpu(sb->gen);
   1006	stored_crc = le32_to_cpu(sb->crc);
   1007	sb->crc = 0;
   1008	crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE);
   1009	if (crc != stored_crc) {
   1010		dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)",
   1011			    crc, stored_crc);
   1012		return -ENXIO;
   1013	}
   1014
   1015	sb_block = le64_to_cpu(sb->sb_block);
    1016	if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift) {
   1017		dmz_dev_err(dev, "Invalid superblock position "
   1018			    "(is %llu expected %llu)",
   1019			    sb_block,
   1020			    (u64)dsb->zone->id << zmd->zone_nr_blocks_shift);
   1021		return -EINVAL;
   1022	}
   1023	if (zmd->sb_version > 1) {
   1024		uuid_t sb_uuid;
   1025
   1026		import_uuid(&sb_uuid, sb->dmz_uuid);
   1027		if (uuid_is_null(&sb_uuid)) {
   1028			dmz_dev_err(dev, "NULL DM-Zoned uuid");
   1029			return -ENXIO;
   1030		} else if (uuid_is_null(&zmd->uuid)) {
   1031			uuid_copy(&zmd->uuid, &sb_uuid);
   1032		} else if (!uuid_equal(&zmd->uuid, &sb_uuid)) {
   1033			dmz_dev_err(dev, "mismatching DM-Zoned uuid, "
   1034				    "is %pUl expected %pUl",
   1035				    &sb_uuid, &zmd->uuid);
   1036			return -ENXIO;
   1037		}
   1038		if (!strlen(zmd->label))
   1039			memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE);
   1040		else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) {
   1041			dmz_dev_err(dev, "mismatching DM-Zoned label, "
   1042				    "is %s expected %s",
   1043				    sb->dmz_label, zmd->label);
   1044			return -ENXIO;
   1045		}
   1046		import_uuid(&dev->uuid, sb->dev_uuid);
   1047		if (uuid_is_null(&dev->uuid)) {
   1048			dmz_dev_err(dev, "NULL device uuid");
   1049			return -ENXIO;
   1050		}
   1051
   1052		if (tertiary) {
   1053			/*
   1054			 * Generation number should be 0, but it doesn't
   1055			 * really matter if it isn't.
   1056			 */
   1057			if (gen != 0)
   1058				dmz_dev_warn(dev, "Invalid generation %llu",
   1059					    gen);
   1060			return 0;
   1061		}
   1062	}
   1063
   1064	nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1)
   1065		>> zmd->zone_nr_blocks_shift;
   1066	if (!nr_meta_zones ||
   1067	    (zmd->nr_devs <= 1 && nr_meta_zones >= zmd->nr_rnd_zones) ||
   1068	    (zmd->nr_devs > 1 && nr_meta_zones >= zmd->nr_cache_zones)) {
   1069		dmz_dev_err(dev, "Invalid number of metadata blocks");
   1070		return -ENXIO;
   1071	}
   1072
   1073	if (!le32_to_cpu(sb->nr_reserved_seq) ||
   1074	    le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) {
   1075		dmz_dev_err(dev, "Invalid number of reserved sequential zones");
   1076		return -ENXIO;
   1077	}
   1078
   1079	nr_data_zones = zmd->nr_useable_zones -
   1080		(nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq));
   1081	if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) {
   1082		dmz_dev_err(dev, "Invalid number of chunks %u / %u",
   1083			    le32_to_cpu(sb->nr_chunks), nr_data_zones);
   1084		return -ENXIO;
   1085	}
   1086
   1087	/* OK */
   1088	zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks);
   1089	zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq);
   1090	zmd->nr_chunks = le32_to_cpu(sb->nr_chunks);
   1091	zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks);
   1092	zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks);
   1093	zmd->nr_meta_zones = nr_meta_zones;
   1094	zmd->nr_data_zones = nr_data_zones;
   1095
   1096	return 0;
   1097}
   1098
   1099/*
   1100 * Read the first or second super block from disk.
   1101 */
   1102static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
   1103{
   1104	dmz_zmd_debug(zmd, "read superblock set %d dev %pg block %llu",
   1105		      set, sb->dev->bdev, sb->block);
   1106
   1107	return dmz_rdwr_block(sb->dev, REQ_OP_READ,
   1108			      sb->block, sb->mblk->page);
   1109}
   1110
   1111/*
    1112 * Determine the position of the secondary super block on disk.
   1113 * This is used only if a corruption of the primary super block
   1114 * is detected.
   1115 */
   1116static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
   1117{
   1118	unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
   1119	struct dmz_mblock *mblk;
   1120	unsigned int zone_id = zmd->sb[0].zone->id;
   1121	int i;
   1122
   1123	/* Allocate a block */
   1124	mblk = dmz_alloc_mblock(zmd, 0);
   1125	if (!mblk)
   1126		return -ENOMEM;
   1127
   1128	zmd->sb[1].mblk = mblk;
   1129	zmd->sb[1].sb = mblk->data;
   1130
   1131	/* Bad first super block: search for the second one */
   1132	zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
   1133	zmd->sb[1].zone = dmz_get(zmd, zone_id + 1);
   1134	zmd->sb[1].dev = zmd->sb[0].dev;
   1135	for (i = 1; i < zmd->nr_rnd_zones; i++) {
   1136		if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0)
   1137			break;
   1138		if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
   1139			return 0;
   1140		zmd->sb[1].block += zone_nr_blocks;
   1141		zmd->sb[1].zone = dmz_get(zmd, zone_id + i);
   1142	}
   1143
   1144	dmz_free_mblock(zmd, mblk);
   1145	zmd->sb[1].mblk = NULL;
   1146	zmd->sb[1].zone = NULL;
   1147	zmd->sb[1].dev = NULL;
   1148
   1149	return -EIO;
   1150}
   1151
   1152/*
   1153 * Read a super block from disk.
   1154 */
   1155static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
   1156{
   1157	struct dmz_mblock *mblk;
   1158	int ret;
   1159
   1160	/* Allocate a block */
   1161	mblk = dmz_alloc_mblock(zmd, 0);
   1162	if (!mblk)
   1163		return -ENOMEM;
   1164
   1165	sb->mblk = mblk;
   1166	sb->sb = mblk->data;
   1167
   1168	/* Read super block */
   1169	ret = dmz_read_sb(zmd, sb, set);
   1170	if (ret) {
   1171		dmz_free_mblock(zmd, mblk);
   1172		sb->mblk = NULL;
   1173		return ret;
   1174	}
   1175
   1176	return 0;
   1177}
   1178
   1179/*
   1180 * Recover a metadata set.
   1181 */
   1182static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
   1183{
   1184	unsigned int src_set = dst_set ^ 0x1;
   1185	struct page *page;
   1186	int i, ret;
   1187
   1188	dmz_dev_warn(zmd->sb[dst_set].dev,
   1189		     "Metadata set %u invalid: recovering", dst_set);
   1190
   1191	if (dst_set == 0)
   1192		zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
   1193	else
   1194		zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);
   1195
   1196	page = alloc_page(GFP_NOIO);
   1197	if (!page)
   1198		return -ENOMEM;
   1199
   1200	/* Copy metadata blocks */
   1201	for (i = 1; i < zmd->nr_meta_blocks; i++) {
   1202		ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ,
   1203				     zmd->sb[src_set].block + i, page);
   1204		if (ret)
   1205			goto out;
   1206		ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE,
   1207				     zmd->sb[dst_set].block + i, page);
   1208		if (ret)
   1209			goto out;
   1210	}
   1211
   1212	/* Finalize with the super block */
   1213	if (!zmd->sb[dst_set].mblk) {
   1214		zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0);
   1215		if (!zmd->sb[dst_set].mblk) {
   1216			ret = -ENOMEM;
   1217			goto out;
   1218		}
   1219		zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data;
   1220	}
   1221
   1222	ret = dmz_write_sb(zmd, dst_set);
   1223out:
   1224	__free_pages(page, 0);
   1225
   1226	return ret;
   1227}
   1228
   1229/*
   1230 * Get super block from disk.
   1231 */
   1232static int dmz_load_sb(struct dmz_metadata *zmd)
   1233{
   1234	bool sb_good[2] = {false, false};
   1235	u64 sb_gen[2] = {0, 0};
   1236	int ret;
   1237
   1238	if (!zmd->sb[0].zone) {
   1239		dmz_zmd_err(zmd, "Primary super block zone not set");
   1240		return -ENXIO;
   1241	}
   1242
   1243	/* Read and check the primary super block */
   1244	zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
   1245	zmd->sb[0].dev = zmd->sb[0].zone->dev;
   1246	ret = dmz_get_sb(zmd, &zmd->sb[0], 0);
   1247	if (ret) {
   1248		dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed");
   1249		return ret;
   1250	}
   1251
   1252	ret = dmz_check_sb(zmd, &zmd->sb[0], false);
   1253
   1254	/* Read and check secondary super block */
   1255	if (ret == 0) {
   1256		sb_good[0] = true;
   1257		if (!zmd->sb[1].zone) {
   1258			unsigned int zone_id =
   1259				zmd->sb[0].zone->id + zmd->nr_meta_zones;
   1260
   1261			zmd->sb[1].zone = dmz_get(zmd, zone_id);
   1262		}
   1263		zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);
   1264		zmd->sb[1].dev = zmd->sb[0].dev;
   1265		ret = dmz_get_sb(zmd, &zmd->sb[1], 1);
   1266	} else
   1267		ret = dmz_lookup_secondary_sb(zmd);
   1268
   1269	if (ret) {
   1270		dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed");
   1271		return ret;
   1272	}
   1273
   1274	ret = dmz_check_sb(zmd, &zmd->sb[1], false);
   1275	if (ret == 0)
   1276		sb_good[1] = true;
   1277
   1278	/* Use highest generation sb first */
   1279	if (!sb_good[0] && !sb_good[1]) {
   1280		dmz_zmd_err(zmd, "No valid super block found");
   1281		return -EIO;
   1282	}
   1283
   1284	if (sb_good[0])
   1285		sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
   1286	else {
   1287		ret = dmz_recover_mblocks(zmd, 0);
   1288		if (ret) {
   1289			dmz_dev_err(zmd->sb[0].dev,
   1290				    "Recovery of superblock 0 failed");
   1291			return -EIO;
   1292		}
   1293	}
   1294
   1295	if (sb_good[1])
   1296		sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
   1297	else {
   1298		ret = dmz_recover_mblocks(zmd, 1);
   1299
   1300		if (ret) {
   1301			dmz_dev_err(zmd->sb[1].dev,
   1302				    "Recovery of superblock 1 failed");
   1303			return -EIO;
   1304		}
   1305	}
   1306
   1307	if (sb_gen[0] >= sb_gen[1]) {
   1308		zmd->sb_gen = sb_gen[0];
   1309		zmd->mblk_primary = 0;
   1310	} else {
   1311		zmd->sb_gen = sb_gen[1];
   1312		zmd->mblk_primary = 1;
   1313	}
   1314
   1315	dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev,
   1316		      "Using super block %u (gen %llu)",
   1317		      zmd->mblk_primary, zmd->sb_gen);
   1318
   1319	if (zmd->sb_version > 1) {
   1320		int i;
   1321		struct dmz_sb *sb;
   1322
   1323		sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL);
   1324		if (!sb)
   1325			return -ENOMEM;
   1326		for (i = 1; i < zmd->nr_devs; i++) {
   1327			sb->block = 0;
   1328			sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset);
   1329			sb->dev = &zmd->dev[i];
   1330			if (!dmz_is_meta(sb->zone)) {
   1331				dmz_dev_err(sb->dev,
   1332					    "Tertiary super block zone %u not marked as metadata zone",
   1333					    sb->zone->id);
   1334				ret = -EINVAL;
   1335				goto out_kfree;
   1336			}
   1337			ret = dmz_get_sb(zmd, sb, i + 1);
   1338			if (ret) {
   1339				dmz_dev_err(sb->dev,
   1340					    "Read tertiary super block failed");
   1341				dmz_free_mblock(zmd, sb->mblk);
   1342				goto out_kfree;
   1343			}
   1344			ret = dmz_check_sb(zmd, sb, true);
   1345			dmz_free_mblock(zmd, sb->mblk);
   1346			if (ret == -EINVAL)
   1347				goto out_kfree;
   1348		}
   1349	out_kfree:
   1350		kfree(sb);
   1351	}
   1352	return ret;
   1353}
   1354
   1355/*
   1356 * Initialize a zone descriptor.
   1357 */
   1358static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
   1359{
   1360	struct dmz_dev *dev = data;
   1361	struct dmz_metadata *zmd = dev->metadata;
   1362	int idx = num + dev->zone_offset;
   1363	struct dm_zone *zone;
   1364
   1365	zone = dmz_insert(zmd, idx, dev);
   1366	if (IS_ERR(zone))
   1367		return PTR_ERR(zone);
   1368
   1369	if (blkz->len != zmd->zone_nr_sectors) {
   1370		if (zmd->sb_version > 1) {
    1371			/* Ignore a possible runt (smaller) zone */
   1372			set_bit(DMZ_OFFLINE, &zone->flags);
   1373			return 0;
   1374		} else if (blkz->start + blkz->len == dev->capacity)
   1375			return 0;
   1376		return -ENXIO;
   1377	}
   1378
   1379	/*
   1380	 * Devices that have zones with a capacity smaller than the zone size
   1381	 * (e.g. NVMe zoned namespaces) are not supported.
   1382	 */
   1383	if (blkz->capacity != blkz->len)
   1384		return -ENXIO;
   1385
   1386	switch (blkz->type) {
   1387	case BLK_ZONE_TYPE_CONVENTIONAL:
   1388		set_bit(DMZ_RND, &zone->flags);
   1389		break;
   1390	case BLK_ZONE_TYPE_SEQWRITE_REQ:
   1391	case BLK_ZONE_TYPE_SEQWRITE_PREF:
   1392		set_bit(DMZ_SEQ, &zone->flags);
   1393		break;
   1394	default:
   1395		return -ENXIO;
   1396	}
   1397
   1398	if (dmz_is_rnd(zone))
   1399		zone->wp_block = 0;
   1400	else
   1401		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
   1402
   1403	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
   1404		set_bit(DMZ_OFFLINE, &zone->flags);
   1405	else if (blkz->cond == BLK_ZONE_COND_READONLY)
   1406		set_bit(DMZ_READ_ONLY, &zone->flags);
   1407	else {
   1408		zmd->nr_useable_zones++;
   1409		if (dmz_is_rnd(zone)) {
   1410			zmd->nr_rnd_zones++;
   1411			if (zmd->nr_devs == 1 && !zmd->sb[0].zone) {
   1412				/* Primary super block zone */
   1413				zmd->sb[0].zone = zone;
   1414			}
   1415		}
   1416		if (zmd->nr_devs > 1 && num == 0) {
   1417			/*
   1418			 * Tertiary superblock zones are always at the
   1419			 * start of the zoned devices, so mark them
   1420			 * as metadata zone.
   1421			 */
   1422			set_bit(DMZ_META, &zone->flags);
   1423		}
   1424	}
   1425	return 0;
   1426}
   1427
   1428static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev)
   1429{
   1430	int idx;
   1431	sector_t zone_offset = 0;
   1432
    1433	for (idx = 0; idx < dev->nr_zones; idx++) {
   1434		struct dm_zone *zone;
   1435
   1436		zone = dmz_insert(zmd, idx, dev);
   1437		if (IS_ERR(zone))
   1438			return PTR_ERR(zone);
   1439		set_bit(DMZ_CACHE, &zone->flags);
   1440		zone->wp_block = 0;
   1441		zmd->nr_cache_zones++;
   1442		zmd->nr_useable_zones++;
   1443		if (dev->capacity - zone_offset < zmd->zone_nr_sectors) {
   1444			/* Disable runt zone */
   1445			set_bit(DMZ_OFFLINE, &zone->flags);
   1446			break;
   1447		}
   1448		zone_offset += zmd->zone_nr_sectors;
   1449	}
   1450	return 0;
   1451}
   1452
   1453/*
    1454 * Free zone descriptors.
   1455 */
   1456static void dmz_drop_zones(struct dmz_metadata *zmd)
   1457{
   1458	int idx;
   1459
    1460	for (idx = 0; idx < zmd->nr_zones; idx++) {
   1461		struct dm_zone *zone = xa_load(&zmd->zones, idx);
   1462
   1463		kfree(zone);
   1464		xa_erase(&zmd->zones, idx);
   1465	}
   1466	xa_destroy(&zmd->zones);
   1467}
   1468
   1469/*
   1470 * Allocate and initialize zone descriptors using the zone
   1471 * information from disk.
   1472 */
   1473static int dmz_init_zones(struct dmz_metadata *zmd)
   1474{
   1475	int i, ret;
   1476	struct dmz_dev *zoned_dev = &zmd->dev[0];
   1477
   1478	/* Init */
   1479	zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors;
   1480	zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors);
   1481	zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors);
   1482	zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks);
   1483	zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3;
   1484	zmd->zone_nr_bitmap_blocks =
   1485		max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
   1486	zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks,
   1487					DMZ_BLOCK_SIZE_BITS);
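	/*
	 * Worked example (illustrative, not part of the original driver),
	 * assuming 256 MiB zones: zone_nr_sectors = 524288 (shift 19),
	 * zone_nr_blocks = 65536 (shift 16), zone_bitmap_size = 65536 / 8
	 * = 8192 B, hence 2 bitmap blocks per zone, each covering
	 * DMZ_BLOCK_SIZE_BITS (4096 * 8 = 32768) zone blocks.
	 */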
   1488
   1489	/* Allocate zone array */
   1490	zmd->nr_zones = 0;
   1491	for (i = 0; i < zmd->nr_devs; i++) {
   1492		struct dmz_dev *dev = &zmd->dev[i];
   1493
   1494		dev->metadata = zmd;
   1495		zmd->nr_zones += dev->nr_zones;
   1496
   1497		atomic_set(&dev->unmap_nr_rnd, 0);
   1498		INIT_LIST_HEAD(&dev->unmap_rnd_list);
   1499		INIT_LIST_HEAD(&dev->map_rnd_list);
   1500
   1501		atomic_set(&dev->unmap_nr_seq, 0);
   1502		INIT_LIST_HEAD(&dev->unmap_seq_list);
   1503		INIT_LIST_HEAD(&dev->map_seq_list);
   1504	}
   1505
   1506	if (!zmd->nr_zones) {
   1507		DMERR("(%s): No zones found", zmd->devname);
   1508		return -ENXIO;
   1509	}
   1510	xa_init(&zmd->zones);
   1511
   1512	DMDEBUG("(%s): Using %zu B for zone information",
   1513		zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones);
   1514
   1515	if (zmd->nr_devs > 1) {
   1516		ret = dmz_emulate_zones(zmd, &zmd->dev[0]);
   1517		if (ret < 0) {
   1518			DMDEBUG("(%s): Failed to emulate zones, error %d",
   1519				zmd->devname, ret);
   1520			dmz_drop_zones(zmd);
   1521			return ret;
   1522		}
   1523
   1524		/*
   1525		 * Primary superblock zone is always at zone 0 when multiple
   1526		 * drives are present.
   1527		 */
   1528		zmd->sb[0].zone = dmz_get(zmd, 0);
   1529
   1530		for (i = 1; i < zmd->nr_devs; i++) {
   1531			zoned_dev = &zmd->dev[i];
   1532
   1533			ret = blkdev_report_zones(zoned_dev->bdev, 0,
   1534						  BLK_ALL_ZONES,
   1535						  dmz_init_zone, zoned_dev);
   1536			if (ret < 0) {
   1537				DMDEBUG("(%s): Failed to report zones, error %d",
   1538					zmd->devname, ret);
   1539				dmz_drop_zones(zmd);
   1540				return ret;
   1541			}
   1542		}
   1543		return 0;
   1544	}
   1545
   1546	/*
   1547	 * Get zone information and initialize zone descriptors.  At the same
   1548	 * time, determine where the super block should be: first block of the
   1549	 * first randomly writable zone.
   1550	 */
   1551	ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES,
   1552				  dmz_init_zone, zoned_dev);
   1553	if (ret < 0) {
   1554		DMDEBUG("(%s): Failed to report zones, error %d",
   1555			zmd->devname, ret);
   1556		dmz_drop_zones(zmd);
   1557		return ret;
   1558	}
   1559
   1560	return 0;
   1561}
   1562
   1563static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx,
   1564			      void *data)
   1565{
   1566	struct dm_zone *zone = data;
   1567
   1568	clear_bit(DMZ_OFFLINE, &zone->flags);
   1569	clear_bit(DMZ_READ_ONLY, &zone->flags);
   1570	if (blkz->cond == BLK_ZONE_COND_OFFLINE)
   1571		set_bit(DMZ_OFFLINE, &zone->flags);
   1572	else if (blkz->cond == BLK_ZONE_COND_READONLY)
   1573		set_bit(DMZ_READ_ONLY, &zone->flags);
   1574
   1575	if (dmz_is_seq(zone))
   1576		zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
   1577	else
   1578		zone->wp_block = 0;
   1579	return 0;
   1580}
   1581
   1582/*
    1583 * Update a zone's information.
   1584 */
   1585static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
   1586{
   1587	struct dmz_dev *dev = zone->dev;
   1588	unsigned int noio_flag;
   1589	int ret;
   1590
   1591	if (dev->flags & DMZ_BDEV_REGULAR)
   1592		return 0;
   1593
   1594	/*
   1595	 * Get zone information from disk. Since blkdev_report_zones() uses
   1596	 * GFP_KERNEL by default for memory allocations, set the per-task
   1597	 * PF_MEMALLOC_NOIO flag so that all allocations are done as if
   1598	 * GFP_NOIO was specified.
   1599	 */
   1600	noio_flag = memalloc_noio_save();
   1601	ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1,
   1602				  dmz_update_zone_cb, zone);
   1603	memalloc_noio_restore(noio_flag);
   1604
   1605	if (ret == 0)
   1606		ret = -EIO;
   1607	if (ret < 0) {
   1608		dmz_dev_err(dev, "Get zone %u report failed",
   1609			    zone->id);
   1610		dmz_check_bdev(dev);
   1611		return ret;
   1612	}
   1613
   1614	return 0;
   1615}
   1616
   1617/*
   1618 * Check a zone write pointer position when the zone is marked
   1619 * with the sequential write error flag.
   1620 */
   1621static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
   1622				    struct dm_zone *zone)
   1623{
   1624	struct dmz_dev *dev = zone->dev;
   1625	unsigned int wp = 0;
   1626	int ret;
   1627
   1628	wp = zone->wp_block;
   1629	ret = dmz_update_zone(zmd, zone);
   1630	if (ret)
   1631		return ret;
   1632
   1633	dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)",
   1634		     zone->id, zone->wp_block, wp);
   1635
   1636	if (zone->wp_block < wp) {
   1637		dmz_invalidate_blocks(zmd, zone, zone->wp_block,
   1638				      wp - zone->wp_block);
   1639	}
   1640
   1641	return 0;
   1642}
   1643
   1644/*
   1645 * Reset a zone write pointer.
   1646 */
   1647static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
   1648{
   1649	int ret;
   1650
   1651	/*
   1652	 * Ignore offline zones, read only zones,
   1653	 * and conventional zones.
   1654	 */
   1655	if (dmz_is_offline(zone) ||
   1656	    dmz_is_readonly(zone) ||
   1657	    dmz_is_rnd(zone))
   1658		return 0;
   1659
   1660	if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
   1661		struct dmz_dev *dev = zone->dev;
   1662
   1663		ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
   1664				       dmz_start_sect(zmd, zone),
   1665				       zmd->zone_nr_sectors, GFP_NOIO);
   1666		if (ret) {
   1667			dmz_dev_err(dev, "Reset zone %u failed %d",
   1668				    zone->id, ret);
   1669			return ret;
   1670		}
   1671	}
   1672
   1673	/* Clear write error bit and rewind write pointer position */
   1674	clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
   1675	zone->wp_block = 0;
   1676
   1677	return 0;
   1678}
   1679
   1680static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
   1681
   1682/*
   1683 * Initialize chunk mapping.
   1684 */
   1685static int dmz_load_mapping(struct dmz_metadata *zmd)
   1686{
   1687	struct dm_zone *dzone, *bzone;
   1688	struct dmz_mblock *dmap_mblk = NULL;
   1689	struct dmz_map *dmap;
   1690	unsigned int i = 0, e = 0, chunk = 0;
   1691	unsigned int dzone_id;
   1692	unsigned int bzone_id;
   1693
   1694	/* Metadata block array for the chunk mapping table */
   1695	zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
    1696				sizeof(struct dmz_mblock *), GFP_KERNEL);
   1697	if (!zmd->map_mblk)
   1698		return -ENOMEM;
   1699
   1700	/* Get chunk mapping table blocks and initialize zone mapping */
   1701	while (chunk < zmd->nr_chunks) {
   1702		if (!dmap_mblk) {
   1703			/* Get mapping block */
   1704			dmap_mblk = dmz_get_mblock(zmd, i + 1);
   1705			if (IS_ERR(dmap_mblk))
   1706				return PTR_ERR(dmap_mblk);
   1707			zmd->map_mblk[i] = dmap_mblk;
   1708			dmap = (struct dmz_map *) dmap_mblk->data;
   1709			i++;
   1710			e = 0;
   1711		}
   1712
   1713		/* Check data zone */
   1714		dzone_id = le32_to_cpu(dmap[e].dzone_id);
   1715		if (dzone_id == DMZ_MAP_UNMAPPED)
   1716			goto next;
   1717
   1718		if (dzone_id >= zmd->nr_zones) {
   1719			dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u",
   1720				    chunk, dzone_id);
   1721			return -EIO;
   1722		}
   1723
   1724		dzone = dmz_get(zmd, dzone_id);
   1725		if (!dzone) {
   1726			dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present",
   1727				    chunk, dzone_id);
   1728			return -EIO;
   1729		}
   1730		set_bit(DMZ_DATA, &dzone->flags);
   1731		dzone->chunk = chunk;
   1732		dmz_get_zone_weight(zmd, dzone);
   1733
   1734		if (dmz_is_cache(dzone))
   1735			list_add_tail(&dzone->link, &zmd->map_cache_list);
   1736		else if (dmz_is_rnd(dzone))
   1737			list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
   1738		else
   1739			list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
   1740
   1741		/* Check buffer zone */
   1742		bzone_id = le32_to_cpu(dmap[e].bzone_id);
   1743		if (bzone_id == DMZ_MAP_UNMAPPED)
   1744			goto next;
   1745
   1746		if (bzone_id >= zmd->nr_zones) {
   1747			dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u",
   1748				    chunk, bzone_id);
   1749			return -EIO;
   1750		}
   1751
   1752		bzone = dmz_get(zmd, bzone_id);
   1753		if (!bzone) {
   1754			dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present",
   1755				    chunk, bzone_id);
   1756			return -EIO;
   1757		}
   1758		if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) {
   1759			dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u",
   1760				    chunk, bzone_id);
   1761			return -EIO;
   1762		}
   1763
   1764		set_bit(DMZ_DATA, &bzone->flags);
   1765		set_bit(DMZ_BUF, &bzone->flags);
   1766		bzone->chunk = chunk;
   1767		bzone->bzone = dzone;
   1768		dzone->bzone = bzone;
   1769		dmz_get_zone_weight(zmd, bzone);
   1770		if (dmz_is_cache(bzone))
   1771			list_add_tail(&bzone->link, &zmd->map_cache_list);
   1772		else
   1773			list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
   1774next:
   1775		chunk++;
   1776		e++;
   1777		if (e >= DMZ_MAP_ENTRIES)
   1778			dmap_mblk = NULL;
   1779	}
   1780
   1781	/*
   1782	 * At this point, only meta zones and mapped data zones were
   1783	 * fully initialized. All remaining zones are unmapped data
   1784	 * zones. Finish initializing those here.
   1785	 */
   1786	for (i = 0; i < zmd->nr_zones; i++) {
   1787		dzone = dmz_get(zmd, i);
   1788		if (!dzone)
   1789			continue;
   1790		if (dmz_is_meta(dzone))
   1791			continue;
   1792		if (dmz_is_offline(dzone))
   1793			continue;
   1794
   1795		if (dmz_is_cache(dzone))
   1796			zmd->nr_cache++;
   1797		else if (dmz_is_rnd(dzone))
   1798			dzone->dev->nr_rnd++;
   1799		else
   1800			dzone->dev->nr_seq++;
   1801
   1802		if (dmz_is_data(dzone)) {
   1803			/* Already initialized */
   1804			continue;
   1805		}
   1806
   1807		/* Unmapped data zone */
   1808		set_bit(DMZ_DATA, &dzone->flags);
   1809		dzone->chunk = DMZ_MAP_UNMAPPED;
   1810		if (dmz_is_cache(dzone)) {
   1811			list_add_tail(&dzone->link, &zmd->unmap_cache_list);
   1812			atomic_inc(&zmd->unmap_nr_cache);
   1813		} else if (dmz_is_rnd(dzone)) {
   1814			list_add_tail(&dzone->link,
   1815				      &dzone->dev->unmap_rnd_list);
   1816			atomic_inc(&dzone->dev->unmap_nr_rnd);
   1817		} else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
   1818			list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
   1819			set_bit(DMZ_RESERVED, &dzone->flags);
   1820			atomic_inc(&zmd->nr_reserved_seq_zones);
   1821			dzone->dev->nr_seq--;
   1822		} else {
   1823			list_add_tail(&dzone->link,
   1824				      &dzone->dev->unmap_seq_list);
   1825			atomic_inc(&dzone->dev->unmap_nr_seq);
   1826		}
   1827	}
   1828
   1829	return 0;
   1830}
   1831
   1832/*
   1833 * Set a data chunk mapping.
   1834 */
   1835static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk,
   1836				  unsigned int dzone_id, unsigned int bzone_id)
   1837{
   1838	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
   1839	struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
   1840	int map_idx = chunk & DMZ_MAP_ENTRIES_MASK;
   1841
   1842	dmap[map_idx].dzone_id = cpu_to_le32(dzone_id);
   1843	dmap[map_idx].bzone_id = cpu_to_le32(bzone_id);
   1844	dmz_dirty_mblock(zmd, dmap_mblk);
   1845}
   1846
   1847/*
   1848 * The list of mapped zones is maintained in LRU order.
    1849 * This rotates a zone to the end of its map list.
   1850 */
   1851static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
   1852{
   1853	if (list_empty(&zone->link))
   1854		return;
   1855
   1856	list_del_init(&zone->link);
   1857	if (dmz_is_seq(zone)) {
   1858		/* LRU rotate sequential zone */
   1859		list_add_tail(&zone->link, &zone->dev->map_seq_list);
   1860	} else if (dmz_is_cache(zone)) {
   1861		/* LRU rotate cache zone */
   1862		list_add_tail(&zone->link, &zmd->map_cache_list);
   1863	} else {
   1864		/* LRU rotate random zone */
   1865		list_add_tail(&zone->link, &zone->dev->map_rnd_list);
   1866	}
   1867}
   1868
   1869/*
    1870 * The lists of mapped zones are maintained in LRU order. This rotates
    1871 * a zone and its buffer zone (if any) to the end of their lists.
   1872 */
   1873static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
   1874{
   1875	__dmz_lru_zone(zmd, zone);
   1876	if (zone->bzone)
   1877		__dmz_lru_zone(zmd, zone->bzone);
   1878}
   1879
   1880/*
   1881 * Wait for any zone to be freed.
   1882 */
   1883static void dmz_wait_for_free_zones(struct dmz_metadata *zmd)
   1884{
   1885	DEFINE_WAIT(wait);
   1886
   1887	prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE);
   1888	dmz_unlock_map(zmd);
   1889	dmz_unlock_metadata(zmd);
   1890
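        	/* Sleep until a zone is freed (dmz_free_zone()) or for at most one second */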
   1891	io_schedule_timeout(HZ);
   1892
   1893	dmz_lock_metadata(zmd);
   1894	dmz_lock_map(zmd);
   1895	finish_wait(&zmd->free_wq, &wait);
   1896}
   1897
   1898/*
   1899 * Lock a zone for reclaim (set the zone RECLAIM bit).
    1900 * Returns 1 if the zone was locked for reclaim, and 0 if it cannot be
    1901 * locked or is already locked.
   1902 */
   1903int dmz_lock_zone_reclaim(struct dm_zone *zone)
   1904{
   1905	/* Active zones cannot be reclaimed */
   1906	if (dmz_is_active(zone))
   1907		return 0;
   1908
   1909	return !test_and_set_bit(DMZ_RECLAIM, &zone->flags);
   1910}
   1911
   1912/*
   1913 * Clear a zone reclaim flag.
   1914 */
   1915void dmz_unlock_zone_reclaim(struct dm_zone *zone)
   1916{
   1917	WARN_ON(dmz_is_active(zone));
   1918	WARN_ON(!dmz_in_reclaim(zone));
   1919
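        	/* Release the RECLAIM bit and wake up any task waiting on it */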
   1920	clear_bit_unlock(DMZ_RECLAIM, &zone->flags);
   1921	smp_mb__after_atomic();
   1922	wake_up_bit(&zone->flags, DMZ_RECLAIM);
   1923}
   1924
   1925/*
   1926 * Wait for a zone reclaim to complete.
   1927 */
   1928static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
   1929{
   1930	dmz_unlock_map(zmd);
   1931	dmz_unlock_metadata(zmd);
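        	/* Ask reclaim to terminate work on this zone and wait for its RECLAIM bit to clear (one second at most) */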
   1932	set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
   1933	wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
   1934	clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
   1935	dmz_lock_metadata(zmd);
   1936	dmz_lock_map(zmd);
   1937}
   1938
   1939/*
   1940 * Select a cache or random write zone for reclaim.
   1941 */
   1942static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd,
   1943						    unsigned int idx, bool idle)
   1944{
   1945	struct dm_zone *dzone = NULL;
   1946	struct dm_zone *zone, *maxw_z = NULL;
   1947	struct list_head *zone_list;
   1948
   1949	/* If we have cache zones select from the cache zone list */
   1950	if (zmd->nr_cache) {
   1951		zone_list = &zmd->map_cache_list;
    1952		/* Try to reclaim random zones, too, when idle */
   1953		if (idle && list_empty(zone_list))
   1954			zone_list = &zmd->dev[idx].map_rnd_list;
   1955	} else
   1956		zone_list = &zmd->dev[idx].map_rnd_list;
   1957
   1958	/*
    1959	 * Find the buffered data zone with the heaviest weight or the first (oldest)
   1960	 * data zone that can be reclaimed.
   1961	 */
   1962	list_for_each_entry(zone, zone_list, link) {
   1963		if (dmz_is_buf(zone)) {
   1964			dzone = zone->bzone;
   1965			if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
   1966				continue;
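        			/* Remember the buffered data zone with the heaviest weight */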
   1967			if (!maxw_z || maxw_z->weight < dzone->weight)
   1968				maxw_z = dzone;
   1969		} else {
   1970			dzone = zone;
   1971			if (dmz_lock_zone_reclaim(dzone))
   1972				return dzone;
   1973		}
   1974	}
   1975
   1976	if (maxw_z && dmz_lock_zone_reclaim(maxw_z))
   1977		return maxw_z;
   1978
   1979	/*
   1980	 * If we come here, none of the zones inspected could be locked for
    1981	 * reclaim. Try again, being more aggressive: take the first zone
    1982	 * that can be reclaimed regardless of its weight.
   1983	 */
   1984	list_for_each_entry(zone, zone_list, link) {
   1985		if (dmz_is_buf(zone)) {
   1986			dzone = zone->bzone;
   1987			if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
   1988				continue;
   1989		} else
   1990			dzone = zone;
   1991		if (dmz_lock_zone_reclaim(dzone))
   1992			return dzone;
   1993	}
   1994
   1995	return NULL;
   1996}
   1997
   1998/*
   1999 * Select a buffered sequential zone for reclaim.
   2000 */
   2001static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd,
   2002						    unsigned int idx)
   2003{
   2004	struct dm_zone *zone;
   2005
   2006	list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) {
   2007		if (!zone->bzone)
   2008			continue;
   2009		if (dmz_lock_zone_reclaim(zone))
   2010			return zone;
   2011	}
   2012
   2013	return NULL;
   2014}
   2015
   2016/*
   2017 * Select a zone for reclaim.
   2018 */
   2019struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd,
   2020					 unsigned int dev_idx, bool idle)
   2021{
   2022	struct dm_zone *zone = NULL;
   2023
   2024	/*
   2025	 * Search for a zone candidate to reclaim: 2 cases are possible.
    2026	 * (1) There are no free sequential zones. Then a random data zone
    2027	 *     cannot be reclaimed, so choose a sequential zone to reclaim so
    2028	 *     that a random zone can be reclaimed afterward.
    2029	 * (2) At least one free sequential zone is available: choose the
    2030	 *     oldest random zone (data or buffer) that can be locked.
   2031	 */
   2032	dmz_lock_map(zmd);
   2033	if (list_empty(&zmd->reserved_seq_zones_list))
   2034		zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx);
   2035	if (!zone)
   2036		zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle);
   2037	dmz_unlock_map(zmd);
   2038
   2039	return zone;
   2040}
   2041
   2042/*
   2043 * Get the zone mapping a chunk, if the chunk is mapped already.
    2044 * If no mapping exists and the operation is WRITE, a zone is
   2045 * allocated and used to map the chunk.
   2046 * The zone returned will be set to the active state.
   2047 */
   2048struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op)
   2049{
   2050	struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
   2051	struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
   2052	int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK;
   2053	unsigned int dzone_id;
   2054	struct dm_zone *dzone = NULL;
   2055	int ret = 0;
   2056	int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
   2057
   2058	dmz_lock_map(zmd);
   2059again:
   2060	/* Get the chunk mapping */
   2061	dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id);
   2062	if (dzone_id == DMZ_MAP_UNMAPPED) {
   2063		/*
    2064		 * Reads and discards of unmapped chunks are fine. But for
   2065		 * writes, we need a mapping, so get one.
   2066		 */
   2067		if (op != REQ_OP_WRITE)
   2068			goto out;
   2069
    2070		/* Allocate a cache or random zone */
   2071		dzone = dmz_alloc_zone(zmd, 0, alloc_flags);
   2072		if (!dzone) {
   2073			if (dmz_dev_is_dying(zmd)) {
   2074				dzone = ERR_PTR(-EIO);
   2075				goto out;
   2076			}
   2077			dmz_wait_for_free_zones(zmd);
   2078			goto again;
   2079		}
   2080
   2081		dmz_map_zone(zmd, dzone, chunk);
   2082
   2083	} else {
   2084		/* The chunk is already mapped: get the mapping zone */
   2085		dzone = dmz_get(zmd, dzone_id);
   2086		if (!dzone) {
   2087			dzone = ERR_PTR(-EIO);
   2088			goto out;
   2089		}
   2090		if (dzone->chunk != chunk) {
   2091			dzone = ERR_PTR(-EIO);
   2092			goto out;
   2093		}
   2094
    2095		/* Repair the write pointer if the sequential dzone has an error */
   2096		if (dmz_seq_write_err(dzone)) {
   2097			ret = dmz_handle_seq_write_err(zmd, dzone);
   2098			if (ret) {
   2099				dzone = ERR_PTR(-EIO);
   2100				goto out;
   2101			}
   2102			clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags);
   2103		}
   2104	}
   2105
   2106	/*
   2107	 * If the zone is being reclaimed, the chunk mapping may change
   2108	 * to a different zone. So wait for reclaim and retry. Otherwise,
   2109	 * activate the zone (this will prevent reclaim from touching it).
   2110	 */
   2111	if (dmz_in_reclaim(dzone)) {
   2112		dmz_wait_for_reclaim(zmd, dzone);
   2113		goto again;
   2114	}
   2115	dmz_activate_zone(dzone);
   2116	dmz_lru_zone(zmd, dzone);
   2117out:
   2118	dmz_unlock_map(zmd);
   2119
   2120	return dzone;
   2121}
   2122
   2123/*
   2124 * Write and discard change the block validity of data zones and their buffer
   2125 * zones. Check here that valid blocks are still present. If all blocks are
   2126 * invalid, the zones can be unmapped on the fly without waiting for reclaim
   2127 * to do it.
   2128 */
   2129void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone)
   2130{
   2131	struct dm_zone *bzone;
   2132
   2133	dmz_lock_map(zmd);
   2134
   2135	bzone = dzone->bzone;
   2136	if (bzone) {
   2137		if (dmz_weight(bzone))
   2138			dmz_lru_zone(zmd, bzone);
   2139		else {
   2140			/* Empty buffer zone: reclaim it */
   2141			dmz_unmap_zone(zmd, bzone);
   2142			dmz_free_zone(zmd, bzone);
   2143			bzone = NULL;
   2144		}
   2145	}
   2146
   2147	/* Deactivate the data zone */
   2148	dmz_deactivate_zone(dzone);
   2149	if (dmz_is_active(dzone) || bzone || dmz_weight(dzone))
   2150		dmz_lru_zone(zmd, dzone);
   2151	else {
   2152		/* Unbuffered inactive empty data zone: reclaim it */
   2153		dmz_unmap_zone(zmd, dzone);
   2154		dmz_free_zone(zmd, dzone);
   2155	}
   2156
   2157	dmz_unlock_map(zmd);
   2158}
   2159
   2160/*
   2161 * Allocate and map a random zone to buffer a chunk
   2162 * already mapped to a sequential zone.
   2163 */
   2164struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
   2165				     struct dm_zone *dzone)
   2166{
   2167	struct dm_zone *bzone;
   2168	int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
   2169
   2170	dmz_lock_map(zmd);
   2171again:
   2172	bzone = dzone->bzone;
   2173	if (bzone)
   2174		goto out;
   2175
    2176	/* Allocate a cache or random zone */
   2177	bzone = dmz_alloc_zone(zmd, 0, alloc_flags);
   2178	if (!bzone) {
   2179		if (dmz_dev_is_dying(zmd)) {
   2180			bzone = ERR_PTR(-EIO);
   2181			goto out;
   2182		}
   2183		dmz_wait_for_free_zones(zmd);
   2184		goto again;
   2185	}
   2186
   2187	/* Update the chunk mapping */
   2188	dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id);
   2189
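        	/* Mark the new zone as the chunk buffer zone and link it with the data zone */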
   2190	set_bit(DMZ_BUF, &bzone->flags);
   2191	bzone->chunk = dzone->chunk;
   2192	bzone->bzone = dzone;
   2193	dzone->bzone = bzone;
   2194	if (dmz_is_cache(bzone))
   2195		list_add_tail(&bzone->link, &zmd->map_cache_list);
   2196	else
   2197		list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
   2198out:
   2199	dmz_unlock_map(zmd);
   2200
   2201	return bzone;
   2202}
   2203
   2204/*
   2205 * Get an unmapped (free) zone.
   2206 * This must be called with the mapping lock held.
   2207 */
   2208struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx,
   2209			       unsigned long flags)
   2210{
   2211	struct list_head *list;
   2212	struct dm_zone *zone;
   2213	int i;
   2214
   2215	/* Schedule reclaim to ensure free zones are available */
   2216	if (!(flags & DMZ_ALLOC_RECLAIM)) {
   2217		for (i = 0; i < zmd->nr_devs; i++)
   2218			dmz_schedule_reclaim(zmd->dev[i].reclaim);
   2219	}
   2220
   2221	i = 0;
   2222again:
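        	/* Pick the unmapped zone list matching the allocation flags */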
   2223	if (flags & DMZ_ALLOC_CACHE)
   2224		list = &zmd->unmap_cache_list;
   2225	else if (flags & DMZ_ALLOC_RND)
   2226		list = &zmd->dev[dev_idx].unmap_rnd_list;
   2227	else
   2228		list = &zmd->dev[dev_idx].unmap_seq_list;
   2229
   2230	if (list_empty(list)) {
   2231		/*
    2232		 * No free zone: return NULL if this is not for reclaim.
   2233		 */
   2234		if (!(flags & DMZ_ALLOC_RECLAIM))
   2235			return NULL;
   2236		/*
   2237		 * Try to allocate from other devices
   2238		 */
   2239		if (i < zmd->nr_devs) {
   2240			dev_idx = (dev_idx + 1) % zmd->nr_devs;
   2241			i++;
   2242			goto again;
   2243		}
   2244
   2245		/*
    2246		 * Fall back to the reserved sequential zones
   2247		 */
   2248		zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list,
   2249						struct dm_zone, link);
   2250		if (zone) {
   2251			list_del_init(&zone->link);
   2252			atomic_dec(&zmd->nr_reserved_seq_zones);
   2253		}
   2254		return zone;
   2255	}
   2256
   2257	zone = list_first_entry(list, struct dm_zone, link);
   2258	list_del_init(&zone->link);
   2259
   2260	if (dmz_is_cache(zone))
   2261		atomic_dec(&zmd->unmap_nr_cache);
   2262	else if (dmz_is_rnd(zone))
   2263		atomic_dec(&zone->dev->unmap_nr_rnd);
   2264	else
   2265		atomic_dec(&zone->dev->unmap_nr_seq);
   2266
   2267	if (dmz_is_offline(zone)) {
   2268		dmz_zmd_warn(zmd, "Zone %u is offline", zone->id);
   2269		zone = NULL;
   2270		goto again;
   2271	}
   2272	if (dmz_is_meta(zone)) {
   2273		dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id);
   2274		zone = NULL;
   2275		goto again;
   2276	}
   2277	return zone;
   2278}
   2279
   2280/*
   2281 * Free a zone.
   2282 * This must be called with the mapping lock held.
   2283 */
   2284void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
   2285{
   2286	/* If this is a sequential zone, reset it */
   2287	if (dmz_is_seq(zone))
   2288		dmz_reset_zone(zmd, zone);
   2289
   2290	/* Return the zone to its type unmap list */
   2291	if (dmz_is_cache(zone)) {
   2292		list_add_tail(&zone->link, &zmd->unmap_cache_list);
   2293		atomic_inc(&zmd->unmap_nr_cache);
   2294	} else if (dmz_is_rnd(zone)) {
   2295		list_add_tail(&zone->link, &zone->dev->unmap_rnd_list);
   2296		atomic_inc(&zone->dev->unmap_nr_rnd);
   2297	} else if (dmz_is_reserved(zone)) {
   2298		list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
   2299		atomic_inc(&zmd->nr_reserved_seq_zones);
   2300	} else {
   2301		list_add_tail(&zone->link, &zone->dev->unmap_seq_list);
   2302		atomic_inc(&zone->dev->unmap_nr_seq);
   2303	}
   2304
   2305	wake_up_all(&zmd->free_wq);
   2306}
   2307
   2308/*
   2309 * Map a chunk to a zone.
   2310 * This must be called with the mapping lock held.
   2311 */
   2312void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
   2313		  unsigned int chunk)
   2314{
   2315	/* Set the chunk mapping */
   2316	dmz_set_chunk_mapping(zmd, chunk, dzone->id,
   2317			      DMZ_MAP_UNMAPPED);
   2318	dzone->chunk = chunk;
   2319	if (dmz_is_cache(dzone))
   2320		list_add_tail(&dzone->link, &zmd->map_cache_list);
   2321	else if (dmz_is_rnd(dzone))
   2322		list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
   2323	else
   2324		list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
   2325}
   2326
   2327/*
   2328 * Unmap a zone.
   2329 * This must be called with the mapping lock held.
   2330 */
   2331void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
   2332{
   2333	unsigned int chunk = zone->chunk;
   2334	unsigned int dzone_id;
   2335
   2336	if (chunk == DMZ_MAP_UNMAPPED) {
   2337		/* Already unmapped */
   2338		return;
   2339	}
   2340
   2341	if (test_and_clear_bit(DMZ_BUF, &zone->flags)) {
   2342		/*
   2343		 * Unmapping the chunk buffer zone: clear only
   2344		 * the chunk buffer mapping
   2345		 */
   2346		dzone_id = zone->bzone->id;
   2347		zone->bzone->bzone = NULL;
   2348		zone->bzone = NULL;
   2349
   2350	} else {
   2351		/*
   2352		 * Unmapping the chunk data zone: the zone must
   2353		 * not be buffered.
   2354		 */
   2355		if (WARN_ON(zone->bzone)) {
   2356			zone->bzone->bzone = NULL;
   2357			zone->bzone = NULL;
   2358		}
   2359		dzone_id = DMZ_MAP_UNMAPPED;
   2360	}
   2361
   2362	dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED);
   2363
   2364	zone->chunk = DMZ_MAP_UNMAPPED;
   2365	list_del_init(&zone->link);
   2366}
   2367
   2368/*
   2369 * Set @nr_bits bits in @bitmap starting from @bit.
   2370 * Return the number of bits changed from 0 to 1.
   2371 */
   2372static unsigned int dmz_set_bits(unsigned long *bitmap,
   2373				 unsigned int bit, unsigned int nr_bits)
   2374{
   2375	unsigned long *addr;
   2376	unsigned int end = bit + nr_bits;
   2377	unsigned int n = 0;
   2378
   2379	while (bit < end) {
   2380		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
   2381		    ((end - bit) >= BITS_PER_LONG)) {
   2382			/* Try to set the whole word at once */
   2383			addr = bitmap + BIT_WORD(bit);
   2384			if (*addr == 0) {
   2385				*addr = ULONG_MAX;
   2386				n += BITS_PER_LONG;
   2387				bit += BITS_PER_LONG;
   2388				continue;
   2389			}
   2390		}
   2391
   2392		if (!test_and_set_bit(bit, bitmap))
   2393			n++;
   2394		bit++;
   2395	}
   2396
   2397	return n;
   2398}
   2399
   2400/*
   2401 * Get the bitmap block storing the bit for chunk_block in zone.
   2402 */
   2403static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd,
   2404					 struct dm_zone *zone,
   2405					 sector_t chunk_block)
   2406{
   2407	sector_t bitmap_block = 1 + zmd->nr_map_blocks +
   2408		(sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) +
   2409		(chunk_block >> DMZ_BLOCK_SHIFT_BITS);
   2410
   2411	return dmz_get_mblock(zmd, bitmap_block);
   2412}
   2413
   2414/*
   2415 * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone.
   2416 */
   2417int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
   2418			  struct dm_zone *to_zone)
   2419{
   2420	struct dmz_mblock *from_mblk, *to_mblk;
   2421	sector_t chunk_block = 0;
   2422
   2423	/* Get the zones bitmap blocks */
   2424	while (chunk_block < zmd->zone_nr_blocks) {
   2425		from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
   2426		if (IS_ERR(from_mblk))
   2427			return PTR_ERR(from_mblk);
   2428		to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block);
   2429		if (IS_ERR(to_mblk)) {
   2430			dmz_release_mblock(zmd, from_mblk);
   2431			return PTR_ERR(to_mblk);
   2432		}
   2433
   2434		memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE);
   2435		dmz_dirty_mblock(zmd, to_mblk);
   2436
   2437		dmz_release_mblock(zmd, to_mblk);
   2438		dmz_release_mblock(zmd, from_mblk);
   2439
   2440		chunk_block += zmd->zone_bits_per_mblk;
   2441	}
   2442
   2443	to_zone->weight = from_zone->weight;
   2444
   2445	return 0;
   2446}
   2447
   2448/*
   2449 * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone,
   2450 * starting from chunk_block.
   2451 */
   2452int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
   2453			   struct dm_zone *to_zone, sector_t chunk_block)
   2454{
   2455	unsigned int nr_blocks;
   2456	int ret;
   2457
   2458	/* Get the zones bitmap blocks */
   2459	while (chunk_block < zmd->zone_nr_blocks) {
   2460		/* Get a valid region from the source zone */
   2461		ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
   2462		if (ret <= 0)
   2463			return ret;
   2464
   2465		nr_blocks = ret;
   2466		ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks);
   2467		if (ret)
   2468			return ret;
   2469
   2470		chunk_block += nr_blocks;
   2471	}
   2472
   2473	return 0;
   2474}
   2475
   2476/*
    2477 * Validate all the blocks in the range [chunk_block..chunk_block+nr_blocks-1].
   2478 */
   2479int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
   2480			sector_t chunk_block, unsigned int nr_blocks)
   2481{
   2482	unsigned int count, bit, nr_bits;
   2483	unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
   2484	struct dmz_mblock *mblk;
   2485	unsigned int n = 0;
   2486
   2487	dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks",
   2488		      zone->id, (unsigned long long)chunk_block,
   2489		      nr_blocks);
   2490
   2491	WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);
   2492
   2493	while (nr_blocks) {
   2494		/* Get bitmap block */
   2495		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
   2496		if (IS_ERR(mblk))
   2497			return PTR_ERR(mblk);
   2498
   2499		/* Set bits */
   2500		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
   2501		nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
   2502
   2503		count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
   2504		if (count) {
   2505			dmz_dirty_mblock(zmd, mblk);
   2506			n += count;
   2507		}
   2508		dmz_release_mblock(zmd, mblk);
   2509
   2510		nr_blocks -= nr_bits;
   2511		chunk_block += nr_bits;
   2512	}
   2513
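        	/* Update the zone weight (its number of valid blocks) */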
   2514	if (likely(zone->weight + n <= zone_nr_blocks))
   2515		zone->weight += n;
   2516	else {
   2517		dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u",
   2518			     zone->id, zone->weight,
   2519			     zone_nr_blocks - n);
   2520		zone->weight = zone_nr_blocks;
   2521	}
   2522
   2523	return 0;
   2524}
   2525
   2526/*
   2527 * Clear nr_bits bits in bitmap starting from bit.
   2528 * Return the number of bits cleared.
   2529 */
   2530static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits)
   2531{
   2532	unsigned long *addr;
   2533	int end = bit + nr_bits;
   2534	int n = 0;
   2535
   2536	while (bit < end) {
   2537		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
   2538		    ((end - bit) >= BITS_PER_LONG)) {
   2539			/* Try to clear whole word at once */
   2540			addr = bitmap + BIT_WORD(bit);
   2541			if (*addr == ULONG_MAX) {
   2542				*addr = 0;
   2543				n += BITS_PER_LONG;
   2544				bit += BITS_PER_LONG;
   2545				continue;
   2546			}
   2547		}
   2548
   2549		if (test_and_clear_bit(bit, bitmap))
   2550			n++;
   2551		bit++;
   2552	}
   2553
   2554	return n;
   2555}
   2556
   2557/*
    2558 * Invalidate all the blocks in the range [chunk_block..chunk_block+nr_blocks-1].
   2559 */
   2560int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
   2561			  sector_t chunk_block, unsigned int nr_blocks)
   2562{
   2563	unsigned int count, bit, nr_bits;
   2564	struct dmz_mblock *mblk;
   2565	unsigned int n = 0;
   2566
   2567	dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks",
   2568		      zone->id, (u64)chunk_block, nr_blocks);
   2569
   2570	WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);
   2571
   2572	while (nr_blocks) {
   2573		/* Get bitmap block */
   2574		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
   2575		if (IS_ERR(mblk))
   2576			return PTR_ERR(mblk);
   2577
   2578		/* Clear bits */
   2579		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
   2580		nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
   2581
   2582		count = dmz_clear_bits((unsigned long *)mblk->data,
   2583				       bit, nr_bits);
   2584		if (count) {
   2585			dmz_dirty_mblock(zmd, mblk);
   2586			n += count;
   2587		}
   2588		dmz_release_mblock(zmd, mblk);
   2589
   2590		nr_blocks -= nr_bits;
   2591		chunk_block += nr_bits;
   2592	}
   2593
   2594	if (zone->weight >= n)
   2595		zone->weight -= n;
   2596	else {
   2597		dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u",
   2598			     zone->id, zone->weight, n);
   2599		zone->weight = 0;
   2600	}
   2601
   2602	return 0;
   2603}
   2604
   2605/*
   2606 * Get a block bit value.
   2607 */
   2608static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
   2609			  sector_t chunk_block)
   2610{
   2611	struct dmz_mblock *mblk;
   2612	int ret;
   2613
   2614	WARN_ON(chunk_block >= zmd->zone_nr_blocks);
   2615
   2616	/* Get bitmap block */
   2617	mblk = dmz_get_bitmap(zmd, zone, chunk_block);
   2618	if (IS_ERR(mblk))
   2619		return PTR_ERR(mblk);
   2620
   2621	/* Get offset */
   2622	ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS,
   2623		       (unsigned long *) mblk->data) != 0;
   2624
   2625	dmz_release_mblock(zmd, mblk);
   2626
   2627	return ret;
   2628}
   2629
   2630/*
   2631 * Return the number of blocks from chunk_block to the first block with a bit
   2632 * value specified by set. Search at most nr_blocks blocks from chunk_block.
   2633 */
   2634static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
   2635				 sector_t chunk_block, unsigned int nr_blocks,
   2636				 int set)
   2637{
   2638	struct dmz_mblock *mblk;
   2639	unsigned int bit, set_bit, nr_bits;
   2640	unsigned int zone_bits = zmd->zone_bits_per_mblk;
   2641	unsigned long *bitmap;
   2642	int n = 0;
   2643
   2644	WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);
   2645
   2646	while (nr_blocks) {
   2647		/* Get bitmap block */
   2648		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
   2649		if (IS_ERR(mblk))
   2650			return PTR_ERR(mblk);
   2651
   2652		/* Get offset */
   2653		bitmap = (unsigned long *) mblk->data;
   2654		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
   2655		nr_bits = min(nr_blocks, zone_bits - bit);
   2656		if (set)
   2657			set_bit = find_next_bit(bitmap, zone_bits, bit);
   2658		else
   2659			set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
   2660		dmz_release_mblock(zmd, mblk);
   2661
   2662		n += set_bit - bit;
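        		/* Stop if a bit with the searched value was found in this bitmap block */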
   2663		if (set_bit < zone_bits)
   2664			break;
   2665
   2666		nr_blocks -= nr_bits;
   2667		chunk_block += nr_bits;
   2668	}
   2669
   2670	return n;
   2671}
   2672
   2673/*
   2674 * Test if chunk_block is valid. If it is, the number of consecutive
   2675 * valid blocks from chunk_block will be returned.
   2676 */
   2677int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
   2678		    sector_t chunk_block)
   2679{
   2680	int valid;
   2681
   2682	valid = dmz_test_block(zmd, zone, chunk_block);
   2683	if (valid <= 0)
   2684		return valid;
   2685
   2686	/* The block is valid: get the number of valid blocks from block */
   2687	return dmz_to_next_set_block(zmd, zone, chunk_block,
   2688				     zmd->zone_nr_blocks - chunk_block, 0);
   2689}
   2690
   2691/*
   2692 * Find the first valid block from @chunk_block in @zone.
   2693 * If such a block is found, its number is returned using
   2694 * @chunk_block and the total number of valid blocks from @chunk_block
   2695 * is returned.
   2696 */
   2697int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
   2698			  sector_t *chunk_block)
   2699{
   2700	sector_t start_block = *chunk_block;
   2701	int ret;
   2702
   2703	ret = dmz_to_next_set_block(zmd, zone, start_block,
   2704				    zmd->zone_nr_blocks - start_block, 1);
   2705	if (ret < 0)
   2706		return ret;
   2707
   2708	start_block += ret;
   2709	*chunk_block = start_block;
   2710
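        	/* Count the consecutive valid blocks starting at the first valid block found */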
   2711	return dmz_to_next_set_block(zmd, zone, start_block,
   2712				     zmd->zone_nr_blocks - start_block, 0);
   2713}
   2714
   2715/*
   2716 * Count the number of bits set starting from bit up to bit + nr_bits - 1.
   2717 */
   2718static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
   2719{
   2720	unsigned long *addr;
   2721	int end = bit + nr_bits;
   2722	int n = 0;
   2723
   2724	while (bit < end) {
   2725		if (((bit & (BITS_PER_LONG - 1)) == 0) &&
   2726		    ((end - bit) >= BITS_PER_LONG)) {
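        			/* Count a whole aligned word at once if all its bits are set */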
   2727			addr = (unsigned long *)bitmap + BIT_WORD(bit);
   2728			if (*addr == ULONG_MAX) {
   2729				n += BITS_PER_LONG;
   2730				bit += BITS_PER_LONG;
   2731				continue;
   2732			}
   2733		}
   2734
   2735		if (test_bit(bit, bitmap))
   2736			n++;
   2737		bit++;
   2738	}
   2739
   2740	return n;
   2741}
   2742
   2743/*
   2744 * Get a zone weight.
   2745 */
   2746static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
   2747{
   2748	struct dmz_mblock *mblk;
   2749	sector_t chunk_block = 0;
   2750	unsigned int bit, nr_bits;
   2751	unsigned int nr_blocks = zmd->zone_nr_blocks;
   2752	void *bitmap;
   2753	int n = 0;
   2754
   2755	while (nr_blocks) {
   2756		/* Get bitmap block */
   2757		mblk = dmz_get_bitmap(zmd, zone, chunk_block);
   2758		if (IS_ERR(mblk)) {
   2759			n = 0;
   2760			break;
   2761		}
   2762
   2763		/* Count bits in this block */
   2764		bitmap = mblk->data;
   2765		bit = chunk_block & DMZ_BLOCK_MASK_BITS;
   2766		nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
   2767		n += dmz_count_bits(bitmap, bit, nr_bits);
   2768
   2769		dmz_release_mblock(zmd, mblk);
   2770
   2771		nr_blocks -= nr_bits;
   2772		chunk_block += nr_bits;
   2773	}
   2774
   2775	zone->weight = n;
   2776}
   2777
   2778/*
   2779 * Cleanup the zoned metadata resources.
   2780 */
   2781static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
   2782{
   2783	struct rb_root *root;
   2784	struct dmz_mblock *mblk, *next;
   2785	int i;
   2786
   2787	/* Release zone mapping resources */
   2788	if (zmd->map_mblk) {
   2789		for (i = 0; i < zmd->nr_map_blocks; i++)
   2790			dmz_release_mblock(zmd, zmd->map_mblk[i]);
   2791		kfree(zmd->map_mblk);
   2792		zmd->map_mblk = NULL;
   2793	}
   2794
   2795	/* Release super blocks */
   2796	for (i = 0; i < 2; i++) {
   2797		if (zmd->sb[i].mblk) {
   2798			dmz_free_mblock(zmd, zmd->sb[i].mblk);
   2799			zmd->sb[i].mblk = NULL;
   2800		}
   2801	}
   2802
   2803	/* Free cached blocks */
   2804	while (!list_empty(&zmd->mblk_dirty_list)) {
   2805		mblk = list_first_entry(&zmd->mblk_dirty_list,
   2806					struct dmz_mblock, link);
   2807		dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)",
   2808			     (u64)mblk->no, mblk->ref);
   2809		list_del_init(&mblk->link);
   2810		rb_erase(&mblk->node, &zmd->mblk_rbtree);
   2811		dmz_free_mblock(zmd, mblk);
   2812	}
   2813
   2814	while (!list_empty(&zmd->mblk_lru_list)) {
   2815		mblk = list_first_entry(&zmd->mblk_lru_list,
   2816					struct dmz_mblock, link);
   2817		list_del_init(&mblk->link);
   2818		rb_erase(&mblk->node, &zmd->mblk_rbtree);
   2819		dmz_free_mblock(zmd, mblk);
   2820	}
   2821
   2822	/* Sanity checks: the mblock rbtree should now be empty */
   2823	root = &zmd->mblk_rbtree;
   2824	rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
   2825		dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree",
   2826			     (u64)mblk->no, mblk->ref);
   2827		mblk->ref = 0;
   2828		dmz_free_mblock(zmd, mblk);
   2829	}
   2830
   2831	/* Free the zone descriptors */
   2832	dmz_drop_zones(zmd);
   2833
   2834	mutex_destroy(&zmd->mblk_flush_lock);
   2835	mutex_destroy(&zmd->map_lock);
   2836}
   2837
   2838static void dmz_print_dev(struct dmz_metadata *zmd, int num)
   2839{
   2840	struct dmz_dev *dev = &zmd->dev[num];
   2841
   2842	if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
   2843		dmz_dev_info(dev, "Regular block device");
   2844	else
   2845		dmz_dev_info(dev, "Host-%s zoned block device",
   2846			     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
   2847			     "aware" : "managed");
   2848	if (zmd->sb_version > 1) {
   2849		sector_t sector_offset =
   2850			dev->zone_offset << zmd->zone_nr_sectors_shift;
   2851
   2852		dmz_dev_info(dev, "  %llu 512-byte logical sectors (offset %llu)",
   2853			     (u64)dev->capacity, (u64)sector_offset);
   2854		dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors (offset %llu)",
   2855			     dev->nr_zones, (u64)zmd->zone_nr_sectors,
   2856			     (u64)dev->zone_offset);
   2857	} else {
   2858		dmz_dev_info(dev, "  %llu 512-byte logical sectors",
   2859			     (u64)dev->capacity);
   2860		dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
   2861			     dev->nr_zones, (u64)zmd->zone_nr_sectors);
   2862	}
   2863}
   2864
   2865/*
   2866 * Initialize the zoned metadata.
   2867 */
   2868int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
   2869		     struct dmz_metadata **metadata,
   2870		     const char *devname)
   2871{
   2872	struct dmz_metadata *zmd;
   2873	unsigned int i;
   2874	struct dm_zone *zone;
   2875	int ret;
   2876
   2877	zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL);
   2878	if (!zmd)
   2879		return -ENOMEM;
   2880
   2881	strcpy(zmd->devname, devname);
   2882	zmd->dev = dev;
   2883	zmd->nr_devs = num_dev;
   2884	zmd->mblk_rbtree = RB_ROOT;
   2885	init_rwsem(&zmd->mblk_sem);
   2886	mutex_init(&zmd->mblk_flush_lock);
   2887	spin_lock_init(&zmd->mblk_lock);
   2888	INIT_LIST_HEAD(&zmd->mblk_lru_list);
   2889	INIT_LIST_HEAD(&zmd->mblk_dirty_list);
   2890
   2891	mutex_init(&zmd->map_lock);
   2892
   2893	atomic_set(&zmd->unmap_nr_cache, 0);
   2894	INIT_LIST_HEAD(&zmd->unmap_cache_list);
   2895	INIT_LIST_HEAD(&zmd->map_cache_list);
   2896
   2897	atomic_set(&zmd->nr_reserved_seq_zones, 0);
   2898	INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);
   2899
   2900	init_waitqueue_head(&zmd->free_wq);
   2901
   2902	/* Initialize zone descriptors */
   2903	ret = dmz_init_zones(zmd);
   2904	if (ret)
   2905		goto err;
   2906
   2907	/* Get super block */
   2908	ret = dmz_load_sb(zmd);
   2909	if (ret)
   2910		goto err;
   2911
   2912	/* Set metadata zones starting from sb_zone */
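        	/* Two metadata sets are maintained, hence the nr_meta_zones * 2 zones to mark */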
   2913	for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
   2914		zone = dmz_get(zmd, zmd->sb[0].zone->id + i);
   2915		if (!zone) {
   2916			dmz_zmd_err(zmd,
   2917				    "metadata zone %u not present", i);
   2918			ret = -ENXIO;
   2919			goto err;
   2920		}
   2921		if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) {
   2922			dmz_zmd_err(zmd,
    2923				    "metadata zone %u is neither random nor cache", i);
   2924			ret = -ENXIO;
   2925			goto err;
   2926		}
   2927		set_bit(DMZ_META, &zone->flags);
   2928	}
   2929	/* Load mapping table */
   2930	ret = dmz_load_mapping(zmd);
   2931	if (ret)
   2932		goto err;
   2933
   2934	/*
    2935	 * Cache size boundaries: when idle, keep at least the 2 super blocks,
    2936	 * the chunk mapping blocks and enough blocks to cache the bitmap blocks
    2937	 * of up to 16 zones (min_nr_mblks). When busy, allow the cache to grow
    2938	 * by up to 512 additional metadata blocks (max_nr_mblks).
   2939	 */
   2940	zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
   2941	zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
   2942	zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
   2943	zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
   2944	zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;
   2945
   2946	/* Metadata cache shrinker */
   2947	ret = register_shrinker(&zmd->mblk_shrinker);
   2948	if (ret) {
   2949		dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
   2950		goto err;
   2951	}
   2952
   2953	dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version);
   2954	for (i = 0; i < zmd->nr_devs; i++)
   2955		dmz_print_dev(zmd, i);
   2956
   2957	dmz_zmd_info(zmd, "  %u zones of %llu 512-byte logical sectors",
   2958		     zmd->nr_zones, (u64)zmd->zone_nr_sectors);
   2959	dmz_zmd_debug(zmd, "  %u metadata zones",
   2960		      zmd->nr_meta_zones * 2);
   2961	dmz_zmd_debug(zmd, "  %u data zones for %u chunks",
   2962		      zmd->nr_data_zones, zmd->nr_chunks);
   2963	dmz_zmd_debug(zmd, "    %u cache zones (%u unmapped)",
   2964		      zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache));
   2965	for (i = 0; i < zmd->nr_devs; i++) {
   2966		dmz_zmd_debug(zmd, "    %u random zones (%u unmapped)",
   2967			      dmz_nr_rnd_zones(zmd, i),
   2968			      dmz_nr_unmap_rnd_zones(zmd, i));
   2969		dmz_zmd_debug(zmd, "    %u sequential zones (%u unmapped)",
   2970			      dmz_nr_seq_zones(zmd, i),
   2971			      dmz_nr_unmap_seq_zones(zmd, i));
   2972	}
   2973	dmz_zmd_debug(zmd, "  %u reserved sequential data zones",
   2974		      zmd->nr_reserved_seq);
   2975	dmz_zmd_debug(zmd, "Format:");
   2976	dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)",
   2977		      zmd->nr_meta_blocks, zmd->max_nr_mblks);
   2978	dmz_zmd_debug(zmd, "  %u data zone mapping blocks",
   2979		      zmd->nr_map_blocks);
   2980	dmz_zmd_debug(zmd, "  %u bitmap blocks",
   2981		      zmd->nr_bitmap_blocks);
   2982
   2983	*metadata = zmd;
   2984
   2985	return 0;
   2986err:
   2987	dmz_cleanup_metadata(zmd);
   2988	kfree(zmd);
   2989	*metadata = NULL;
   2990
   2991	return ret;
   2992}
   2993
   2994/*
   2995 * Cleanup the zoned metadata resources.
   2996 */
   2997void dmz_dtr_metadata(struct dmz_metadata *zmd)
   2998{
   2999	unregister_shrinker(&zmd->mblk_shrinker);
   3000	dmz_cleanup_metadata(zmd);
   3001	kfree(zmd);
   3002}
   3003
   3004/*
   3005 * Check zone information on resume.
   3006 */
   3007int dmz_resume_metadata(struct dmz_metadata *zmd)
   3008{
   3009	struct dm_zone *zone;
   3010	sector_t wp_block;
   3011	unsigned int i;
   3012	int ret;
   3013
   3014	/* Check zones */
   3015	for (i = 0; i < zmd->nr_zones; i++) {
   3016		zone = dmz_get(zmd, i);
   3017		if (!zone) {
   3018			dmz_zmd_err(zmd, "Unable to get zone %u", i);
   3019			return -EIO;
   3020		}
   3021		wp_block = zone->wp_block;
   3022
   3023		ret = dmz_update_zone(zmd, zone);
   3024		if (ret) {
   3025			dmz_zmd_err(zmd, "Broken zone %u", i);
   3026			return ret;
   3027		}
   3028
   3029		if (dmz_is_offline(zone)) {
   3030			dmz_zmd_warn(zmd, "Zone %u is offline", i);
   3031			continue;
   3032		}
   3033
   3034		/* Check write pointer */
   3035		if (!dmz_is_seq(zone))
   3036			zone->wp_block = 0;
   3037		else if (zone->wp_block != wp_block) {
   3038			dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)",
   3039				    i, (u64)zone->wp_block, (u64)wp_block);
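        			/* Restore the recorded write pointer and invalidate all blocks from there to the end of the zone */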
   3040			zone->wp_block = wp_block;
   3041			dmz_invalidate_blocks(zmd, zone, zone->wp_block,
   3042					      zmd->zone_nr_blocks - zone->wp_block);
   3043		}
   3044	}
   3045
   3046	return 0;
   3047}