zoned.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
zoned.c (55604B)
      1// SPDX-License-Identifier: GPL-2.0
      2
      3#include <linux/bitops.h>
      4#include <linux/slab.h>
      5#include <linux/blkdev.h>
      6#include <linux/sched/mm.h>
      7#include <linux/atomic.h>
      8#include <linux/vmalloc.h>
      9#include "ctree.h"
     10#include "volumes.h"
     11#include "zoned.h"
     12#include "rcu-string.h"
     13#include "disk-io.h"
     14#include "block-group.h"
     15#include "transaction.h"
     16#include "dev-replace.h"
     17#include "space-info.h"
     18
     19/* Maximum number of zones to report per blkdev_report_zones() call */
     20#define BTRFS_REPORT_NR_ZONES   4096
     21/* Invalid allocation pointer value for missing devices */
     22#define WP_MISSING_DEV ((u64)-1)
     23/* Pseudo write pointer value for conventional zone */
     24#define WP_CONVENTIONAL ((u64)-2)
     25
     26/*
     27 * Location of the first zone of superblock logging zone pairs.
     28 *
     29 * - primary superblock:    0B (zone 0)
     30 * - first copy:          512G (zone starting at that offset)
     31 * - second copy:           4T (zone starting at that offset)
     32 */
     33#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
     34#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
     35#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)
     36
     37#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
     38#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
     39
     40/* Number of superblock log zones */
     41#define BTRFS_NR_SB_LOG_ZONES 2
     42
     43/*
     44 * Minimum of active zones we need:
     45 *
     46 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
     47 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
     48 * - 1 zone for tree-log dedicated block group
     49 * - 1 zone for relocation
     50 */
     51#define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)
     52
     53/*
     54 * Minimum / maximum supported zone size. Currently, SMR disks have a zone
     55 * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
     56 * We do not expect the zone size to become larger than 8GiB or smaller than
     57 * 4MiB in the near future.
     58 */
     59#define BTRFS_MAX_ZONE_SIZE		SZ_8G
     60#define BTRFS_MIN_ZONE_SIZE		SZ_4M
     61
     62#define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
     63
     64static inline bool sb_zone_is_full(const struct blk_zone *zone)
     65{
     66	return (zone->cond == BLK_ZONE_COND_FULL) ||
     67		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
     68}
     69
     70static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
     71{
     72	struct blk_zone *zones = data;
     73
     74	memcpy(&zones[idx], zone, sizeof(*zone));
     75
     76	return 0;
     77}
     78
     79static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
     80			    u64 *wp_ret)
     81{
     82	bool empty[BTRFS_NR_SB_LOG_ZONES];
     83	bool full[BTRFS_NR_SB_LOG_ZONES];
     84	sector_t sector;
     85	int i;
     86
     87	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
     88		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
     89		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
     90		full[i] = sb_zone_is_full(&zones[i]);
     91	}
     92
     93	/*
     94	 * Possible states of log buffer zones
     95	 *
     96	 *           Empty[0]  In use[0]  Full[0]
     97	 * Empty[1]         *          x        0
     98	 * In use[1]        0          x        0
     99	 * Full[1]          1          1        C
    100	 *
    101	 * Log position:
    102	 *   *: Special case, no superblock is written
    103	 *   0: Use write pointer of zones[0]
    104	 *   1: Use write pointer of zones[1]
    105	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
    106	 *      one determined by generation
    107	 *   x: Invalid state
    108	 */
    109
    110	if (empty[0] && empty[1]) {
    111		/* Special case to distinguish no superblock to read */
    112		*wp_ret = zones[0].start << SECTOR_SHIFT;
    113		return -ENOENT;
    114	} else if (full[0] && full[1]) {
    115		/* Compare two super blocks */
    116		struct address_space *mapping = bdev->bd_inode->i_mapping;
    117		struct page *page[BTRFS_NR_SB_LOG_ZONES];
    118		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
    119		int i;
    120
    121		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
    122			u64 bytenr;
    123
    124			bytenr = ((zones[i].start + zones[i].len)
    125				   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
    126
    127			page[i] = read_cache_page_gfp(mapping,
    128					bytenr >> PAGE_SHIFT, GFP_NOFS);
    129			if (IS_ERR(page[i])) {
    130				if (i == 1)
    131					btrfs_release_disk_super(super[0]);
    132				return PTR_ERR(page[i]);
    133			}
    134			super[i] = page_address(page[i]);
    135		}
    136
    137		if (super[0]->generation > super[1]->generation)
    138			sector = zones[1].start;
    139		else
    140			sector = zones[0].start;
    141
    142		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
    143			btrfs_release_disk_super(super[i]);
    144	} else if (!full[0] && (empty[1] || full[1])) {
    145		sector = zones[0].wp;
    146	} else if (full[0]) {
    147		sector = zones[1].wp;
    148	} else {
    149		return -EUCLEAN;
    150	}
    151	*wp_ret = sector << SECTOR_SHIFT;
    152	return 0;
    153}
    154
    155/*
    156 * Get the first zone number of the superblock mirror
    157 */
    158static inline u32 sb_zone_number(int shift, int mirror)
    159{
    160	u64 zone;
    161
    162	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
    163	switch (mirror) {
    164	case 0: zone = 0; break;
    165	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
    166	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
    167	}
    168
    169	ASSERT(zone <= U32_MAX);
    170
    171	return (u32)zone;
    172}
    173
    174static inline sector_t zone_start_sector(u32 zone_number,
    175					 struct block_device *bdev)
    176{
    177	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
    178}
    179
    180static inline u64 zone_start_physical(u32 zone_number,
    181				      struct btrfs_zoned_device_info *zone_info)
    182{
    183	return (u64)zone_number << zone_info->zone_size_shift;
    184}
    185
    186/*
    187 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
    188 * device into static sized chunks and fake a conventional zone on each of
    189 * them.
    190 */
    191static int emulate_report_zones(struct btrfs_device *device, u64 pos,
    192				struct blk_zone *zones, unsigned int nr_zones)
    193{
    194	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
    195	sector_t bdev_size = bdev_nr_sectors(device->bdev);
    196	unsigned int i;
    197
    198	pos >>= SECTOR_SHIFT;
    199	for (i = 0; i < nr_zones; i++) {
    200		zones[i].start = i * zone_sectors + pos;
    201		zones[i].len = zone_sectors;
    202		zones[i].capacity = zone_sectors;
    203		zones[i].wp = zones[i].start + zone_sectors;
    204		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
    205		zones[i].cond = BLK_ZONE_COND_NOT_WP;
    206
    207		if (zones[i].wp >= bdev_size) {
    208			i++;
    209			break;
    210		}
    211	}
    212
    213	return i;
    214}
    215
    216static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
    217			       struct blk_zone *zones, unsigned int *nr_zones)
    218{
    219	struct btrfs_zoned_device_info *zinfo = device->zone_info;
    220	u32 zno;
    221	int ret;
    222
    223	if (!*nr_zones)
    224		return 0;
    225
    226	if (!bdev_is_zoned(device->bdev)) {
    227		ret = emulate_report_zones(device, pos, zones, *nr_zones);
    228		*nr_zones = ret;
    229		return 0;
    230	}
    231
    232	/* Check cache */
    233	if (zinfo->zone_cache) {
    234		unsigned int i;
    235
    236		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
    237		zno = pos >> zinfo->zone_size_shift;
    238		/*
    239		 * We cannot report zones beyond the zone end. So, it is OK to
    240		 * cap *nr_zones to at the end.
    241		 */
    242		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);
    243
    244		for (i = 0; i < *nr_zones; i++) {
    245			struct blk_zone *zone_info;
    246
    247			zone_info = &zinfo->zone_cache[zno + i];
    248			if (!zone_info->len)
    249				break;
    250		}
    251
    252		if (i == *nr_zones) {
    253			/* Cache hit on all the zones */
    254			memcpy(zones, zinfo->zone_cache + zno,
    255			       sizeof(*zinfo->zone_cache) * *nr_zones);
    256			return 0;
    257		}
    258	}
    259
    260	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
    261				  copy_zone_info_cb, zones);
    262	if (ret < 0) {
    263		btrfs_err_in_rcu(device->fs_info,
    264				 "zoned: failed to read zone %llu on %s (devid %llu)",
    265				 pos, rcu_str_deref(device->name),
    266				 device->devid);
    267		return ret;
    268	}
    269	*nr_zones = ret;
    270	if (!ret)
    271		return -EIO;
    272
    273	/* Populate cache */
    274	if (zinfo->zone_cache)
    275		memcpy(zinfo->zone_cache + zno, zones,
    276		       sizeof(*zinfo->zone_cache) * *nr_zones);
    277
    278	return 0;
    279}
    280
    281/* The emulated zone size is determined from the size of device extent */
    282static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
    283{
    284	struct btrfs_path *path;
    285	struct btrfs_root *root = fs_info->dev_root;
    286	struct btrfs_key key;
    287	struct extent_buffer *leaf;
    288	struct btrfs_dev_extent *dext;
    289	int ret = 0;
    290
    291	key.objectid = 1;
    292	key.type = BTRFS_DEV_EXTENT_KEY;
    293	key.offset = 0;
    294
    295	path = btrfs_alloc_path();
    296	if (!path)
    297		return -ENOMEM;
    298
    299	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    300	if (ret < 0)
    301		goto out;
    302
    303	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
    304		ret = btrfs_next_leaf(root, path);
    305		if (ret < 0)
    306			goto out;
    307		/* No dev extents at all? Not good */
    308		if (ret > 0) {
    309			ret = -EUCLEAN;
    310			goto out;
    311		}
    312	}
    313
    314	leaf = path->nodes[0];
    315	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
    316	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
    317	ret = 0;
    318
    319out:
    320	btrfs_free_path(path);
    321
    322	return ret;
    323}
    324
    325int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
    326{
    327	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    328	struct btrfs_device *device;
    329	int ret = 0;
    330
    331	/* fs_info->zone_size might not set yet. Use the incomapt flag here. */
    332	if (!btrfs_fs_incompat(fs_info, ZONED))
    333		return 0;
    334
    335	mutex_lock(&fs_devices->device_list_mutex);
    336	list_for_each_entry(device, &fs_devices->devices, dev_list) {
    337		/* We can skip reading of zone info for missing devices */
    338		if (!device->bdev)
    339			continue;
    340
    341		ret = btrfs_get_dev_zone_info(device, true);
    342		if (ret)
    343			break;
    344	}
    345	mutex_unlock(&fs_devices->device_list_mutex);
    346
    347	return ret;
    348}
    349
    350int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
    351{
    352	struct btrfs_fs_info *fs_info = device->fs_info;
    353	struct btrfs_zoned_device_info *zone_info = NULL;
    354	struct block_device *bdev = device->bdev;
    355	unsigned int max_active_zones;
    356	unsigned int nactive;
    357	sector_t nr_sectors;
    358	sector_t sector = 0;
    359	struct blk_zone *zones = NULL;
    360	unsigned int i, nreported = 0, nr_zones;
    361	sector_t zone_sectors;
    362	char *model, *emulated;
    363	int ret;
    364
    365	/*
    366	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
    367	 * yet be set.
    368	 */
    369	if (!btrfs_fs_incompat(fs_info, ZONED))
    370		return 0;
    371
    372	if (device->zone_info)
    373		return 0;
    374
    375	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
    376	if (!zone_info)
    377		return -ENOMEM;
    378
    379	device->zone_info = zone_info;
    380
    381	if (!bdev_is_zoned(bdev)) {
    382		if (!fs_info->zone_size) {
    383			ret = calculate_emulated_zone_size(fs_info);
    384			if (ret)
    385				goto out;
    386		}
    387
    388		ASSERT(fs_info->zone_size);
    389		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
    390	} else {
    391		zone_sectors = bdev_zone_sectors(bdev);
    392	}
    393
    394	/* Check if it's power of 2 (see is_power_of_2) */
    395	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
    396	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
    397
    398	/* We reject devices with a zone size larger than 8GB */
    399	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
    400		btrfs_err_in_rcu(fs_info,
    401		"zoned: %s: zone size %llu larger than supported maximum %llu",
    402				 rcu_str_deref(device->name),
    403				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
    404		ret = -EINVAL;
    405		goto out;
    406	} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
    407		btrfs_err_in_rcu(fs_info,
    408		"zoned: %s: zone size %llu smaller than supported minimum %u",
    409				 rcu_str_deref(device->name),
    410				 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
    411		ret = -EINVAL;
    412		goto out;
    413	}
    414
    415	nr_sectors = bdev_nr_sectors(bdev);
    416	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
    417	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
    418	if (!IS_ALIGNED(nr_sectors, zone_sectors))
    419		zone_info->nr_zones++;
    420
    421	max_active_zones = bdev_max_active_zones(bdev);
    422	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
    423		btrfs_err_in_rcu(fs_info,
    424"zoned: %s: max active zones %u is too small, need at least %u active zones",
    425				 rcu_str_deref(device->name), max_active_zones,
    426				 BTRFS_MIN_ACTIVE_ZONES);
    427		ret = -EINVAL;
    428		goto out;
    429	}
    430	zone_info->max_active_zones = max_active_zones;
    431
    432	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
    433	if (!zone_info->seq_zones) {
    434		ret = -ENOMEM;
    435		goto out;
    436	}
    437
    438	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
    439	if (!zone_info->empty_zones) {
    440		ret = -ENOMEM;
    441		goto out;
    442	}
    443
    444	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
    445	if (!zone_info->active_zones) {
    446		ret = -ENOMEM;
    447		goto out;
    448	}
    449
    450	zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
    451	if (!zones) {
    452		ret = -ENOMEM;
    453		goto out;
    454	}
    455
    456	/*
    457	 * Enable zone cache only for a zoned device. On a non-zoned device, we
    458	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
    459	 * use the cache.
    460	 */
    461	if (populate_cache && bdev_is_zoned(device->bdev)) {
    462		zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
    463						zone_info->nr_zones);
    464		if (!zone_info->zone_cache) {
    465			btrfs_err_in_rcu(device->fs_info,
    466				"zoned: failed to allocate zone cache for %s",
    467				rcu_str_deref(device->name));
    468			ret = -ENOMEM;
    469			goto out;
    470		}
    471	}
    472
    473	/* Get zones type */
    474	nactive = 0;
    475	while (sector < nr_sectors) {
    476		nr_zones = BTRFS_REPORT_NR_ZONES;
    477		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
    478					  &nr_zones);
    479		if (ret)
    480			goto out;
    481
    482		for (i = 0; i < nr_zones; i++) {
    483			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
    484				__set_bit(nreported, zone_info->seq_zones);
    485			switch (zones[i].cond) {
    486			case BLK_ZONE_COND_EMPTY:
    487				__set_bit(nreported, zone_info->empty_zones);
    488				break;
    489			case BLK_ZONE_COND_IMP_OPEN:
    490			case BLK_ZONE_COND_EXP_OPEN:
    491			case BLK_ZONE_COND_CLOSED:
    492				__set_bit(nreported, zone_info->active_zones);
    493				nactive++;
    494				break;
    495			}
    496			nreported++;
    497		}
    498		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
    499	}
    500
    501	if (nreported != zone_info->nr_zones) {
    502		btrfs_err_in_rcu(device->fs_info,
    503				 "inconsistent number of zones on %s (%u/%u)",
    504				 rcu_str_deref(device->name), nreported,
    505				 zone_info->nr_zones);
    506		ret = -EIO;
    507		goto out;
    508	}
    509
    510	if (max_active_zones) {
    511		if (nactive > max_active_zones) {
    512			btrfs_err_in_rcu(device->fs_info,
    513			"zoned: %u active zones on %s exceeds max_active_zones %u",
    514					 nactive, rcu_str_deref(device->name),
    515					 max_active_zones);
    516			ret = -EIO;
    517			goto out;
    518		}
    519		atomic_set(&zone_info->active_zones_left,
    520			   max_active_zones - nactive);
    521	}
    522
    523	/* Validate superblock log */
    524	nr_zones = BTRFS_NR_SB_LOG_ZONES;
    525	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
    526		u32 sb_zone;
    527		u64 sb_wp;
    528		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
    529
    530		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
    531		if (sb_zone + 1 >= zone_info->nr_zones)
    532			continue;
    533
    534		ret = btrfs_get_dev_zones(device,
    535					  zone_start_physical(sb_zone, zone_info),
    536					  &zone_info->sb_zones[sb_pos],
    537					  &nr_zones);
    538		if (ret)
    539			goto out;
    540
    541		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
    542			btrfs_err_in_rcu(device->fs_info,
    543	"zoned: failed to read super block log zone info at devid %llu zone %u",
    544					 device->devid, sb_zone);
    545			ret = -EUCLEAN;
    546			goto out;
    547		}
    548
    549		/*
    550		 * If zones[0] is conventional, always use the beginning of the
    551		 * zone to record superblock. No need to validate in that case.
    552		 */
    553		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
    554		    BLK_ZONE_TYPE_CONVENTIONAL)
    555			continue;
    556
    557		ret = sb_write_pointer(device->bdev,
    558				       &zone_info->sb_zones[sb_pos], &sb_wp);
    559		if (ret != -ENOENT && ret) {
    560			btrfs_err_in_rcu(device->fs_info,
    561			"zoned: super block log zone corrupted devid %llu zone %u",
    562					 device->devid, sb_zone);
    563			ret = -EUCLEAN;
    564			goto out;
    565		}
    566	}
    567
    568
    569	kfree(zones);
    570
    571	switch (bdev_zoned_model(bdev)) {
    572	case BLK_ZONED_HM:
    573		model = "host-managed zoned";
    574		emulated = "";
    575		break;
    576	case BLK_ZONED_HA:
    577		model = "host-aware zoned";
    578		emulated = "";
    579		break;
    580	case BLK_ZONED_NONE:
    581		model = "regular";
    582		emulated = "emulated ";
    583		break;
    584	default:
    585		/* Just in case */
    586		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
    587				 bdev_zoned_model(bdev),
    588				 rcu_str_deref(device->name));
    589		ret = -EOPNOTSUPP;
    590		goto out_free_zone_info;
    591	}
    592
    593	btrfs_info_in_rcu(fs_info,
    594		"%s block device %s, %u %szones of %llu bytes",
    595		model, rcu_str_deref(device->name), zone_info->nr_zones,
    596		emulated, zone_info->zone_size);
    597
    598	return 0;
    599
    600out:
    601	kfree(zones);
    602out_free_zone_info:
    603	btrfs_destroy_dev_zone_info(device);
    604
    605	return ret;
    606}
    607
    608void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
    609{
    610	struct btrfs_zoned_device_info *zone_info = device->zone_info;
    611
    612	if (!zone_info)
    613		return;
    614
    615	bitmap_free(zone_info->active_zones);
    616	bitmap_free(zone_info->seq_zones);
    617	bitmap_free(zone_info->empty_zones);
    618	vfree(zone_info->zone_cache);
    619	kfree(zone_info);
    620	device->zone_info = NULL;
    621}
    622
    623int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
    624		       struct blk_zone *zone)
    625{
    626	unsigned int nr_zones = 1;
    627	int ret;
    628
    629	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
    630	if (ret != 0 || !nr_zones)
    631		return ret ? ret : -EIO;
    632
    633	return 0;
    634}
    635
    636int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
    637{
    638	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
    639	struct btrfs_device *device;
    640	u64 zoned_devices = 0;
    641	u64 nr_devices = 0;
    642	u64 zone_size = 0;
    643	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
    644	int ret = 0;
    645
    646	/* Count zoned devices */
    647	list_for_each_entry(device, &fs_devices->devices, dev_list) {
    648		enum blk_zoned_model model;
    649
    650		if (!device->bdev)
    651			continue;
    652
    653		model = bdev_zoned_model(device->bdev);
    654		/*
    655		 * A Host-Managed zoned device must be used as a zoned device.
    656		 * A Host-Aware zoned device and a non-zoned devices can be
    657		 * treated as a zoned device, if ZONED flag is enabled in the
    658		 * superblock.
    659		 */
    660		if (model == BLK_ZONED_HM ||
    661		    (model == BLK_ZONED_HA && incompat_zoned) ||
    662		    (model == BLK_ZONED_NONE && incompat_zoned)) {
    663			struct btrfs_zoned_device_info *zone_info;
    664
    665			zone_info = device->zone_info;
    666			zoned_devices++;
    667			if (!zone_size) {
    668				zone_size = zone_info->zone_size;
    669			} else if (zone_info->zone_size != zone_size) {
    670				btrfs_err(fs_info,
    671		"zoned: unequal block device zone sizes: have %llu found %llu",
    672					  device->zone_info->zone_size,
    673					  zone_size);
    674				ret = -EINVAL;
    675				goto out;
    676			}
    677		}
    678		nr_devices++;
    679	}
    680
    681	if (!zoned_devices && !incompat_zoned)
    682		goto out;
    683
    684	if (!zoned_devices && incompat_zoned) {
    685		/* No zoned block device found on ZONED filesystem */
    686		btrfs_err(fs_info,
    687			  "zoned: no zoned devices found on a zoned filesystem");
    688		ret = -EINVAL;
    689		goto out;
    690	}
    691
    692	if (zoned_devices && !incompat_zoned) {
    693		btrfs_err(fs_info,
    694			  "zoned: mode not enabled but zoned device found");
    695		ret = -EINVAL;
    696		goto out;
    697	}
    698
    699	if (zoned_devices != nr_devices) {
    700		btrfs_err(fs_info,
    701			  "zoned: cannot mix zoned and regular devices");
    702		ret = -EINVAL;
    703		goto out;
    704	}
    705
    706	/*
    707	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
    708	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
    709	 * check the alignment here.
    710	 */
    711	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
    712		btrfs_err(fs_info,
    713			  "zoned: zone size %llu not aligned to stripe %u",
    714			  zone_size, BTRFS_STRIPE_LEN);
    715		ret = -EINVAL;
    716		goto out;
    717	}
    718
    719	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
    720		btrfs_err(fs_info, "zoned: mixed block groups not supported");
    721		ret = -EINVAL;
    722		goto out;
    723	}
    724
    725	fs_info->zone_size = zone_size;
    726	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
    727
    728	/*
    729	 * Check mount options here, because we might change fs_info->zoned
    730	 * from fs_info->zone_size.
    731	 */
    732	ret = btrfs_check_mountopts_zoned(fs_info);
    733	if (ret)
    734		goto out;
    735
    736	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
    737out:
    738	return ret;
    739}
    740
    741int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
    742{
    743	if (!btrfs_is_zoned(info))
    744		return 0;
    745
    746	/*
    747	 * Space cache writing is not COWed. Disable that to avoid write errors
    748	 * in sequential zones.
    749	 */
    750	if (btrfs_test_opt(info, SPACE_CACHE)) {
    751		btrfs_err(info, "zoned: space cache v1 is not supported");
    752		return -EINVAL;
    753	}
    754
    755	if (btrfs_test_opt(info, NODATACOW)) {
    756		btrfs_err(info, "zoned: NODATACOW not supported");
    757		return -EINVAL;
    758	}
    759
    760	return 0;
    761}
    762
    763static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
    764			   int rw, u64 *bytenr_ret)
    765{
    766	u64 wp;
    767	int ret;
    768
    769	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
    770		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
    771		return 0;
    772	}
    773
    774	ret = sb_write_pointer(bdev, zones, &wp);
    775	if (ret != -ENOENT && ret < 0)
    776		return ret;
    777
    778	if (rw == WRITE) {
    779		struct blk_zone *reset = NULL;
    780
    781		if (wp == zones[0].start << SECTOR_SHIFT)
    782			reset = &zones[0];
    783		else if (wp == zones[1].start << SECTOR_SHIFT)
    784			reset = &zones[1];
    785
    786		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
    787			ASSERT(sb_zone_is_full(reset));
    788
    789			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
    790					       reset->start, reset->len,
    791					       GFP_NOFS);
    792			if (ret)
    793				return ret;
    794
    795			reset->cond = BLK_ZONE_COND_EMPTY;
    796			reset->wp = reset->start;
    797		}
    798	} else if (ret != -ENOENT) {
    799		/*
    800		 * For READ, we want the previous one. Move write pointer to
    801		 * the end of a zone, if it is at the head of a zone.
    802		 */
    803		u64 zone_end = 0;
    804
    805		if (wp == zones[0].start << SECTOR_SHIFT)
    806			zone_end = zones[1].start + zones[1].capacity;
    807		else if (wp == zones[1].start << SECTOR_SHIFT)
    808			zone_end = zones[0].start + zones[0].capacity;
    809		if (zone_end)
    810			wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
    811					BTRFS_SUPER_INFO_SIZE);
    812
    813		wp -= BTRFS_SUPER_INFO_SIZE;
    814	}
    815
    816	*bytenr_ret = wp;
    817	return 0;
    818
    819}
    820
    821int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
    822			       u64 *bytenr_ret)
    823{
    824	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
    825	sector_t zone_sectors;
    826	u32 sb_zone;
    827	int ret;
    828	u8 zone_sectors_shift;
    829	sector_t nr_sectors;
    830	u32 nr_zones;
    831
    832	if (!bdev_is_zoned(bdev)) {
    833		*bytenr_ret = btrfs_sb_offset(mirror);
    834		return 0;
    835	}
    836
    837	ASSERT(rw == READ || rw == WRITE);
    838
    839	zone_sectors = bdev_zone_sectors(bdev);
    840	if (!is_power_of_2(zone_sectors))
    841		return -EINVAL;
    842	zone_sectors_shift = ilog2(zone_sectors);
    843	nr_sectors = bdev_nr_sectors(bdev);
    844	nr_zones = nr_sectors >> zone_sectors_shift;
    845
    846	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
    847	if (sb_zone + 1 >= nr_zones)
    848		return -ENOENT;
    849
    850	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
    851				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
    852				  zones);
    853	if (ret < 0)
    854		return ret;
    855	if (ret != BTRFS_NR_SB_LOG_ZONES)
    856		return -EIO;
    857
    858	return sb_log_location(bdev, zones, rw, bytenr_ret);
    859}
    860
    861int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
    862			  u64 *bytenr_ret)
    863{
    864	struct btrfs_zoned_device_info *zinfo = device->zone_info;
    865	u32 zone_num;
    866
    867	/*
    868	 * For a zoned filesystem on a non-zoned block device, use the same
    869	 * super block locations as regular filesystem. Doing so, the super
    870	 * block can always be retrieved and the zoned flag of the volume
    871	 * detected from the super block information.
    872	 */
    873	if (!bdev_is_zoned(device->bdev)) {
    874		*bytenr_ret = btrfs_sb_offset(mirror);
    875		return 0;
    876	}
    877
    878	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
    879	if (zone_num + 1 >= zinfo->nr_zones)
    880		return -ENOENT;
    881
    882	return sb_log_location(device->bdev,
    883			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
    884			       rw, bytenr_ret);
    885}
    886
    887static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
    888				  int mirror)
    889{
    890	u32 zone_num;
    891
    892	if (!zinfo)
    893		return false;
    894
    895	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
    896	if (zone_num + 1 >= zinfo->nr_zones)
    897		return false;
    898
    899	if (!test_bit(zone_num, zinfo->seq_zones))
    900		return false;
    901
    902	return true;
    903}
    904
    905int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
    906{
    907	struct btrfs_zoned_device_info *zinfo = device->zone_info;
    908	struct blk_zone *zone;
    909	int i;
    910
    911	if (!is_sb_log_zone(zinfo, mirror))
    912		return 0;
    913
    914	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
    915	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
    916		/* Advance the next zone */
    917		if (zone->cond == BLK_ZONE_COND_FULL) {
    918			zone++;
    919			continue;
    920		}
    921
    922		if (zone->cond == BLK_ZONE_COND_EMPTY)
    923			zone->cond = BLK_ZONE_COND_IMP_OPEN;
    924
    925		zone->wp += SUPER_INFO_SECTORS;
    926
    927		if (sb_zone_is_full(zone)) {
    928			/*
    929			 * No room left to write new superblock. Since
    930			 * superblock is written with REQ_SYNC, it is safe to
    931			 * finish the zone now.
    932			 *
    933			 * If the write pointer is exactly at the capacity,
    934			 * explicit ZONE_FINISH is not necessary.
    935			 */
    936			if (zone->wp != zone->start + zone->capacity) {
    937				int ret;
    938
    939				ret = blkdev_zone_mgmt(device->bdev,
    940						REQ_OP_ZONE_FINISH, zone->start,
    941						zone->len, GFP_NOFS);
    942				if (ret)
    943					return ret;
    944			}
    945
    946			zone->wp = zone->start + zone->len;
    947			zone->cond = BLK_ZONE_COND_FULL;
    948		}
    949		return 0;
    950	}
    951
    952	/* All the zones are FULL. Should not reach here. */
    953	ASSERT(0);
    954	return -EIO;
    955}
    956
    957int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
    958{
    959	sector_t zone_sectors;
    960	sector_t nr_sectors;
    961	u8 zone_sectors_shift;
    962	u32 sb_zone;
    963	u32 nr_zones;
    964
    965	zone_sectors = bdev_zone_sectors(bdev);
    966	zone_sectors_shift = ilog2(zone_sectors);
    967	nr_sectors = bdev_nr_sectors(bdev);
    968	nr_zones = nr_sectors >> zone_sectors_shift;
    969
    970	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
    971	if (sb_zone + 1 >= nr_zones)
    972		return -ENOENT;
    973
    974	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
    975				zone_start_sector(sb_zone, bdev),
    976				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
    977}
    978
    979/**
    980 * btrfs_find_allocatable_zones - find allocatable zones within a given region
    981 *
    982 * @device:	the device to allocate a region on
    983 * @hole_start: the position of the hole to allocate the region
    984 * @num_bytes:	size of wanted region
    985 * @hole_end:	the end of the hole
    986 * @return:	position of allocatable zones
    987 *
    988 * Allocatable region should not contain any superblock locations.
    989 */
    990u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
    991				 u64 hole_end, u64 num_bytes)
    992{
    993	struct btrfs_zoned_device_info *zinfo = device->zone_info;
    994	const u8 shift = zinfo->zone_size_shift;
    995	u64 nzones = num_bytes >> shift;
    996	u64 pos = hole_start;
    997	u64 begin, end;
    998	bool have_sb;
    999	int i;
   1000
   1001	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
   1002	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
   1003
   1004	while (pos < hole_end) {
   1005		begin = pos >> shift;
   1006		end = begin + nzones;
   1007
   1008		if (end > zinfo->nr_zones)
   1009			return hole_end;
   1010
   1011		/* Check if zones in the region are all empty */
   1012		if (btrfs_dev_is_sequential(device, pos) &&
   1013		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
   1014			pos += zinfo->zone_size;
   1015			continue;
   1016		}
   1017
   1018		have_sb = false;
   1019		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
   1020			u32 sb_zone;
   1021			u64 sb_pos;
   1022
   1023			sb_zone = sb_zone_number(shift, i);
   1024			if (!(end <= sb_zone ||
   1025			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
   1026				have_sb = true;
   1027				pos = zone_start_physical(
   1028					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
   1029				break;
   1030			}
   1031
   1032			/* We also need to exclude regular superblock positions */
   1033			sb_pos = btrfs_sb_offset(i);
   1034			if (!(pos + num_bytes <= sb_pos ||
   1035			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
   1036				have_sb = true;
   1037				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
   1038					    zinfo->zone_size);
   1039				break;
   1040			}
   1041		}
   1042		if (!have_sb)
   1043			break;
   1044	}
   1045
   1046	return pos;
   1047}
   1048
   1049static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
   1050{
   1051	struct btrfs_zoned_device_info *zone_info = device->zone_info;
   1052	unsigned int zno = (pos >> zone_info->zone_size_shift);
   1053
   1054	/* We can use any number of zones */
   1055	if (zone_info->max_active_zones == 0)
   1056		return true;
   1057
   1058	if (!test_bit(zno, zone_info->active_zones)) {
   1059		/* Active zone left? */
   1060		if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
   1061			return false;
   1062		if (test_and_set_bit(zno, zone_info->active_zones)) {
   1063			/* Someone already set the bit */
   1064			atomic_inc(&zone_info->active_zones_left);
   1065		}
   1066	}
   1067
   1068	return true;
   1069}
   1070
   1071static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
   1072{
   1073	struct btrfs_zoned_device_info *zone_info = device->zone_info;
   1074	unsigned int zno = (pos >> zone_info->zone_size_shift);
   1075
   1076	/* We can use any number of zones */
   1077	if (zone_info->max_active_zones == 0)
   1078		return;
   1079
   1080	if (test_and_clear_bit(zno, zone_info->active_zones))
   1081		atomic_inc(&zone_info->active_zones_left);
   1082}
   1083
   1084int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
   1085			    u64 length, u64 *bytes)
   1086{
   1087	int ret;
   1088
   1089	*bytes = 0;
   1090	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
   1091			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
   1092			       GFP_NOFS);
   1093	if (ret)
   1094		return ret;
   1095
   1096	*bytes = length;
   1097	while (length) {
   1098		btrfs_dev_set_zone_empty(device, physical);
   1099		btrfs_dev_clear_active_zone(device, physical);
   1100		physical += device->zone_info->zone_size;
   1101		length -= device->zone_info->zone_size;
   1102	}
   1103
   1104	return 0;
   1105}
   1106
   1107int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
   1108{
   1109	struct btrfs_zoned_device_info *zinfo = device->zone_info;
   1110	const u8 shift = zinfo->zone_size_shift;
   1111	unsigned long begin = start >> shift;
   1112	unsigned long end = (start + size) >> shift;
   1113	u64 pos;
   1114	int ret;
   1115
   1116	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
   1117	ASSERT(IS_ALIGNED(size, zinfo->zone_size));
   1118
   1119	if (end > zinfo->nr_zones)
   1120		return -ERANGE;
   1121
   1122	/* All the zones are conventional */
   1123	if (find_next_bit(zinfo->seq_zones, begin, end) == end)
   1124		return 0;
   1125
   1126	/* All the zones are sequential and empty */
   1127	if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end &&
   1128	    find_next_zero_bit(zinfo->empty_zones, begin, end) == end)
   1129		return 0;
   1130
   1131	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
   1132		u64 reset_bytes;
   1133
   1134		if (!btrfs_dev_is_sequential(device, pos) ||
   1135		    btrfs_dev_is_empty_zone(device, pos))
   1136			continue;
   1137
   1138		/* Free regions should be empty */
   1139		btrfs_warn_in_rcu(
   1140			device->fs_info,
   1141		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
   1142			rcu_str_deref(device->name), device->devid, pos >> shift);
   1143		WARN_ON_ONCE(1);
   1144
   1145		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
   1146					      &reset_bytes);
   1147		if (ret)
   1148			return ret;
   1149	}
   1150
   1151	return 0;
   1152}
   1153
   1154/*
   1155 * Calculate an allocation pointer from the extent allocation information
   1156 * for a block group consist of conventional zones. It is pointed to the
   1157 * end of the highest addressed extent in the block group as an allocation
   1158 * offset.
   1159 */
   1160static int calculate_alloc_pointer(struct btrfs_block_group *cache,
   1161				   u64 *offset_ret)
   1162{
   1163	struct btrfs_fs_info *fs_info = cache->fs_info;
   1164	struct btrfs_root *root;
   1165	struct btrfs_path *path;
   1166	struct btrfs_key key;
   1167	struct btrfs_key found_key;
   1168	int ret;
   1169	u64 length;
   1170
   1171	path = btrfs_alloc_path();
   1172	if (!path)
   1173		return -ENOMEM;
   1174
   1175	key.objectid = cache->start + cache->length;
   1176	key.type = 0;
   1177	key.offset = 0;
   1178
   1179	root = btrfs_extent_root(fs_info, key.objectid);
   1180	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
   1181	/* We should not find the exact match */
   1182	if (!ret)
   1183		ret = -EUCLEAN;
   1184	if (ret < 0)
   1185		goto out;
   1186
   1187	ret = btrfs_previous_extent_item(root, path, cache->start);
   1188	if (ret) {
   1189		if (ret == 1) {
   1190			ret = 0;
   1191			*offset_ret = 0;
   1192		}
   1193		goto out;
   1194	}
   1195
   1196	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
   1197
   1198	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
   1199		length = found_key.offset;
   1200	else
   1201		length = fs_info->nodesize;
   1202
   1203	if (!(found_key.objectid >= cache->start &&
   1204	       found_key.objectid + length <= cache->start + cache->length)) {
   1205		ret = -EUCLEAN;
   1206		goto out;
   1207	}
   1208	*offset_ret = found_key.objectid + length - cache->start;
   1209	ret = 0;
   1210
   1211out:
   1212	btrfs_free_path(path);
   1213	return ret;
   1214}
   1215
   1216int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
   1217{
   1218	struct btrfs_fs_info *fs_info = cache->fs_info;
   1219	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
   1220	struct extent_map *em;
   1221	struct map_lookup *map;
   1222	struct btrfs_device *device;
   1223	u64 logical = cache->start;
   1224	u64 length = cache->length;
   1225	int ret;
   1226	int i;
   1227	unsigned int nofs_flag;
   1228	u64 *alloc_offsets = NULL;
   1229	u64 *caps = NULL;
   1230	u64 *physical = NULL;
   1231	unsigned long *active = NULL;
   1232	u64 last_alloc = 0;
   1233	u32 num_sequential = 0, num_conventional = 0;
   1234
   1235	if (!btrfs_is_zoned(fs_info))
   1236		return 0;
   1237
   1238	/* Sanity check */
   1239	if (!IS_ALIGNED(length, fs_info->zone_size)) {
   1240		btrfs_err(fs_info,
   1241		"zoned: block group %llu len %llu unaligned to zone size %llu",
   1242			  logical, length, fs_info->zone_size);
   1243		return -EIO;
   1244	}
   1245
   1246	/* Get the chunk mapping */
   1247	read_lock(&em_tree->lock);
   1248	em = lookup_extent_mapping(em_tree, logical, length);
   1249	read_unlock(&em_tree->lock);
   1250
   1251	if (!em)
   1252		return -EINVAL;
   1253
   1254	map = em->map_lookup;
   1255
   1256	cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
   1257	if (!cache->physical_map) {
   1258		ret = -ENOMEM;
   1259		goto out;
   1260	}
   1261
   1262	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
   1263	if (!alloc_offsets) {
   1264		ret = -ENOMEM;
   1265		goto out;
   1266	}
   1267
   1268	caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
   1269	if (!caps) {
   1270		ret = -ENOMEM;
   1271		goto out;
   1272	}
   1273
   1274	physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
   1275	if (!physical) {
   1276		ret = -ENOMEM;
   1277		goto out;
   1278	}
   1279
   1280	active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
   1281	if (!active) {
   1282		ret = -ENOMEM;
   1283		goto out;
   1284	}
   1285
   1286	for (i = 0; i < map->num_stripes; i++) {
   1287		bool is_sequential;
   1288		struct blk_zone zone;
   1289		struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
   1290		int dev_replace_is_ongoing = 0;
   1291
   1292		device = map->stripes[i].dev;
   1293		physical[i] = map->stripes[i].physical;
   1294
   1295		if (device->bdev == NULL) {
   1296			alloc_offsets[i] = WP_MISSING_DEV;
   1297			continue;
   1298		}
   1299
   1300		is_sequential = btrfs_dev_is_sequential(device, physical[i]);
   1301		if (is_sequential)
   1302			num_sequential++;
   1303		else
   1304			num_conventional++;
   1305
   1306		if (!is_sequential) {
   1307			alloc_offsets[i] = WP_CONVENTIONAL;
   1308			continue;
   1309		}
   1310
   1311		/*
   1312		 * This zone will be used for allocation, so mark this zone
   1313		 * non-empty.
   1314		 */
   1315		btrfs_dev_clear_zone_empty(device, physical[i]);
   1316
   1317		down_read(&dev_replace->rwsem);
   1318		dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
   1319		if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
   1320			btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
   1321		up_read(&dev_replace->rwsem);
   1322
   1323		/*
   1324		 * The group is mapped to a sequential zone. Get the zone write
   1325		 * pointer to determine the allocation offset within the zone.
   1326		 */
   1327		WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
   1328		nofs_flag = memalloc_nofs_save();
   1329		ret = btrfs_get_dev_zone(device, physical[i], &zone);
   1330		memalloc_nofs_restore(nofs_flag);
   1331		if (ret == -EIO || ret == -EOPNOTSUPP) {
   1332			ret = 0;
   1333			alloc_offsets[i] = WP_MISSING_DEV;
   1334			continue;
   1335		} else if (ret) {
   1336			goto out;
   1337		}
   1338
   1339		if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
   1340			btrfs_err_in_rcu(fs_info,
   1341	"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
   1342				zone.start << SECTOR_SHIFT,
   1343				rcu_str_deref(device->name), device->devid);
   1344			ret = -EIO;
   1345			goto out;
   1346		}
   1347
   1348		caps[i] = (zone.capacity << SECTOR_SHIFT);
   1349
   1350		switch (zone.cond) {
   1351		case BLK_ZONE_COND_OFFLINE:
   1352		case BLK_ZONE_COND_READONLY:
   1353			btrfs_err(fs_info,
   1354		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
   1355				  physical[i] >> device->zone_info->zone_size_shift,
   1356				  rcu_str_deref(device->name), device->devid);
   1357			alloc_offsets[i] = WP_MISSING_DEV;
   1358			break;
   1359		case BLK_ZONE_COND_EMPTY:
   1360			alloc_offsets[i] = 0;
   1361			break;
   1362		case BLK_ZONE_COND_FULL:
   1363			alloc_offsets[i] = caps[i];
   1364			break;
   1365		default:
   1366			/* Partially used zone */
   1367			alloc_offsets[i] =
   1368					((zone.wp - zone.start) << SECTOR_SHIFT);
   1369			__set_bit(i, active);
   1370			break;
   1371		}
   1372
   1373		/*
   1374		 * Consider a zone as active if we can allow any number of
   1375		 * active zones.
   1376		 */
   1377		if (!device->zone_info->max_active_zones)
   1378			__set_bit(i, active);
   1379	}
   1380
   1381	if (num_sequential > 0)
   1382		cache->seq_zone = true;
   1383
   1384	if (num_conventional > 0) {
   1385		/*
   1386		 * Avoid calling calculate_alloc_pointer() for new BG. It
   1387		 * is no use for new BG. It must be always 0.
   1388		 *
   1389		 * Also, we have a lock chain of extent buffer lock ->
   1390		 * chunk mutex.  For new BG, this function is called from
   1391		 * btrfs_make_block_group() which is already taking the
   1392		 * chunk mutex. Thus, we cannot call
   1393		 * calculate_alloc_pointer() which takes extent buffer
   1394		 * locks to avoid deadlock.
   1395		 */
   1396
   1397		/* Zone capacity is always zone size in emulation */
   1398		cache->zone_capacity = cache->length;
   1399		if (new) {
   1400			cache->alloc_offset = 0;
   1401			goto out;
   1402		}
   1403		ret = calculate_alloc_pointer(cache, &last_alloc);
   1404		if (ret || map->num_stripes == num_conventional) {
   1405			if (!ret)
   1406				cache->alloc_offset = last_alloc;
   1407			else
   1408				btrfs_err(fs_info,
   1409			"zoned: failed to determine allocation offset of bg %llu",
   1410					  cache->start);
   1411			goto out;
   1412		}
   1413	}
   1414
   1415	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
   1416	case 0: /* single */
   1417		if (alloc_offsets[0] == WP_MISSING_DEV) {
   1418			btrfs_err(fs_info,
   1419			"zoned: cannot recover write pointer for zone %llu",
   1420				physical[0]);
   1421			ret = -EIO;
   1422			goto out;
   1423		}
   1424		cache->alloc_offset = alloc_offsets[0];
   1425		cache->zone_capacity = caps[0];
   1426		cache->zone_is_active = test_bit(0, active);
   1427		break;
   1428	case BTRFS_BLOCK_GROUP_DUP:
   1429		if (map->type & BTRFS_BLOCK_GROUP_DATA) {
   1430			btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
   1431			ret = -EINVAL;
   1432			goto out;
   1433		}
   1434		if (alloc_offsets[0] == WP_MISSING_DEV) {
   1435			btrfs_err(fs_info,
   1436			"zoned: cannot recover write pointer for zone %llu",
   1437				physical[0]);
   1438			ret = -EIO;
   1439			goto out;
   1440		}
   1441		if (alloc_offsets[1] == WP_MISSING_DEV) {
   1442			btrfs_err(fs_info,
   1443			"zoned: cannot recover write pointer for zone %llu",
   1444				physical[1]);
   1445			ret = -EIO;
   1446			goto out;
   1447		}
   1448		if (alloc_offsets[0] != alloc_offsets[1]) {
   1449			btrfs_err(fs_info,
   1450			"zoned: write pointer offset mismatch of zones in DUP profile");
   1451			ret = -EIO;
   1452			goto out;
   1453		}
   1454		if (test_bit(0, active) != test_bit(1, active)) {
   1455			if (!btrfs_zone_activate(cache)) {
   1456				ret = -EIO;
   1457				goto out;
   1458			}
   1459		} else {
   1460			cache->zone_is_active = test_bit(0, active);
   1461		}
   1462		cache->alloc_offset = alloc_offsets[0];
   1463		cache->zone_capacity = min(caps[0], caps[1]);
   1464		break;
   1465	case BTRFS_BLOCK_GROUP_RAID1:
   1466	case BTRFS_BLOCK_GROUP_RAID0:
   1467	case BTRFS_BLOCK_GROUP_RAID10:
   1468	case BTRFS_BLOCK_GROUP_RAID5:
   1469	case BTRFS_BLOCK_GROUP_RAID6:
   1470		/* non-single profiles are not supported yet */
   1471	default:
   1472		btrfs_err(fs_info, "zoned: profile %s not yet supported",
   1473			  btrfs_bg_type_to_raid_name(map->type));
   1474		ret = -EINVAL;
   1475		goto out;
   1476	}
   1477
   1478	if (cache->zone_is_active) {
   1479		btrfs_get_block_group(cache);
   1480		spin_lock(&fs_info->zone_active_bgs_lock);
   1481		list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
   1482		spin_unlock(&fs_info->zone_active_bgs_lock);
   1483	}
   1484
   1485out:
   1486	if (cache->alloc_offset > fs_info->zone_size) {
   1487		btrfs_err(fs_info,
   1488			"zoned: invalid write pointer %llu in block group %llu",
   1489			cache->alloc_offset, cache->start);
   1490		ret = -EIO;
   1491	}
   1492
   1493	if (cache->alloc_offset > cache->zone_capacity) {
   1494		btrfs_err(fs_info,
   1495"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
   1496			  cache->alloc_offset, cache->zone_capacity,
   1497			  cache->start);
   1498		ret = -EIO;
   1499	}
   1500
   1501	/* An extent is allocated after the write pointer */
   1502	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
   1503		btrfs_err(fs_info,
   1504			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
   1505			  logical, last_alloc, cache->alloc_offset);
   1506		ret = -EIO;
   1507	}
   1508
   1509	if (!ret)
   1510		cache->meta_write_pointer = cache->alloc_offset + cache->start;
   1511
   1512	if (ret) {
   1513		kfree(cache->physical_map);
   1514		cache->physical_map = NULL;
   1515	}
   1516	bitmap_free(active);
   1517	kfree(physical);
   1518	kfree(caps);
   1519	kfree(alloc_offsets);
   1520	free_extent_map(em);
   1521
   1522	return ret;
   1523}
   1524
   1525void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
   1526{
   1527	u64 unusable, free;
   1528
   1529	if (!btrfs_is_zoned(cache->fs_info))
   1530		return;
   1531
   1532	WARN_ON(cache->bytes_super != 0);
   1533	unusable = (cache->alloc_offset - cache->used) +
   1534		   (cache->length - cache->zone_capacity);
   1535	free = cache->zone_capacity - cache->alloc_offset;
   1536
   1537	/* We only need ->free_space in ALLOC_SEQ block groups */
   1538	cache->last_byte_to_unpin = (u64)-1;
   1539	cache->cached = BTRFS_CACHE_FINISHED;
   1540	cache->free_space_ctl->free_space = free;
   1541	cache->zone_unusable = unusable;
   1542}
   1543
   1544void btrfs_redirty_list_add(struct btrfs_transaction *trans,
   1545			    struct extent_buffer *eb)
   1546{
   1547	struct btrfs_fs_info *fs_info = eb->fs_info;
   1548
   1549	if (!btrfs_is_zoned(fs_info) ||
   1550	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
   1551	    !list_empty(&eb->release_list))
   1552		return;
   1553
   1554	set_extent_buffer_dirty(eb);
   1555	set_extent_bits_nowait(&trans->dirty_pages, eb->start,
   1556			       eb->start + eb->len - 1, EXTENT_DIRTY);
   1557	memzero_extent_buffer(eb, 0, eb->len);
   1558	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
   1559
   1560	spin_lock(&trans->releasing_ebs_lock);
   1561	list_add_tail(&eb->release_list, &trans->releasing_ebs);
   1562	spin_unlock(&trans->releasing_ebs_lock);
   1563	atomic_inc(&eb->refs);
   1564}
   1565
   1566void btrfs_free_redirty_list(struct btrfs_transaction *trans)
   1567{
   1568	spin_lock(&trans->releasing_ebs_lock);
   1569	while (!list_empty(&trans->releasing_ebs)) {
   1570		struct extent_buffer *eb;
   1571
   1572		eb = list_first_entry(&trans->releasing_ebs,
   1573				      struct extent_buffer, release_list);
   1574		list_del_init(&eb->release_list);
   1575		free_extent_buffer(eb);
   1576	}
   1577	spin_unlock(&trans->releasing_ebs_lock);
   1578}
   1579
   1580bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
   1581{
   1582	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   1583	struct btrfs_block_group *cache;
   1584	bool ret = false;
   1585
   1586	if (!btrfs_is_zoned(fs_info))
   1587		return false;
   1588
   1589	if (!is_data_inode(&inode->vfs_inode))
   1590		return false;
   1591
   1592	/*
   1593	 * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
   1594	 * extent layout the relocation code has.
   1595	 * Furthermore we have set aside own block-group from which only the
   1596	 * relocation "process" can allocate and make sure only one process at a
   1597	 * time can add pages to an extent that gets relocated, so it's safe to
   1598	 * use regular REQ_OP_WRITE for this special case.
   1599	 */
   1600	if (btrfs_is_data_reloc_root(inode->root))
   1601		return false;
   1602
   1603	cache = btrfs_lookup_block_group(fs_info, start);
   1604	ASSERT(cache);
   1605	if (!cache)
   1606		return false;
   1607
   1608	ret = cache->seq_zone;
   1609	btrfs_put_block_group(cache);
   1610
   1611	return ret;
   1612}
   1613
   1614void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
   1615				 struct bio *bio)
   1616{
   1617	struct btrfs_ordered_extent *ordered;
   1618	const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
   1619
   1620	if (bio_op(bio) != REQ_OP_ZONE_APPEND)
   1621		return;
   1622
   1623	ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
   1624	if (WARN_ON(!ordered))
   1625		return;
   1626
   1627	ordered->physical = physical;
   1628	ordered->bdev = bio->bi_bdev;
   1629
   1630	btrfs_put_ordered_extent(ordered);
   1631}
   1632
   1633void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
   1634{
   1635	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
   1636	struct btrfs_fs_info *fs_info = inode->root->fs_info;
   1637	struct extent_map_tree *em_tree;
   1638	struct extent_map *em;
   1639	struct btrfs_ordered_sum *sum;
   1640	u64 orig_logical = ordered->disk_bytenr;
   1641	u64 *logical = NULL;
   1642	int nr, stripe_len;
   1643
   1644	/* Zoned devices should not have partitions. So, we can assume it is 0 */
   1645	ASSERT(!bdev_is_partition(ordered->bdev));
   1646	if (WARN_ON(!ordered->bdev))
   1647		return;
   1648
   1649	if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
   1650				     ordered->physical, &logical, &nr,
   1651				     &stripe_len)))
   1652		goto out;
   1653
   1654	WARN_ON(nr != 1);
   1655
   1656	if (orig_logical == *logical)
   1657		goto out;
   1658
   1659	ordered->disk_bytenr = *logical;
   1660
   1661	em_tree = &inode->extent_tree;
   1662	write_lock(&em_tree->lock);
   1663	em = search_extent_mapping(em_tree, ordered->file_offset,
   1664				   ordered->num_bytes);
   1665	em->block_start = *logical;
   1666	free_extent_map(em);
   1667	write_unlock(&em_tree->lock);
   1668
   1669	list_for_each_entry(sum, &ordered->list, list) {
   1670		if (*logical < orig_logical)
   1671			sum->bytenr -= orig_logical - *logical;
   1672		else
   1673			sum->bytenr += *logical - orig_logical;
   1674	}
   1675
   1676out:
   1677	kfree(logical);
   1678}
   1679
   1680bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
   1681				    struct extent_buffer *eb,
   1682				    struct btrfs_block_group **cache_ret)
   1683{
   1684	struct btrfs_block_group *cache;
   1685	bool ret = true;
   1686
   1687	if (!btrfs_is_zoned(fs_info))
   1688		return true;
   1689
   1690	cache = btrfs_lookup_block_group(fs_info, eb->start);
   1691	if (!cache)
   1692		return true;
   1693
   1694	if (cache->meta_write_pointer != eb->start) {
   1695		btrfs_put_block_group(cache);
   1696		cache = NULL;
   1697		ret = false;
   1698	} else {
   1699		cache->meta_write_pointer = eb->start + eb->len;
   1700	}
   1701
   1702	*cache_ret = cache;
   1703
   1704	return ret;
   1705}
   1706
   1707void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
   1708				     struct extent_buffer *eb)
   1709{
   1710	if (!btrfs_is_zoned(eb->fs_info) || !cache)
   1711		return;
   1712
   1713	ASSERT(cache->meta_write_pointer == eb->start + eb->len);
   1714	cache->meta_write_pointer = eb->start;
   1715}
   1716
   1717int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
   1718{
   1719	if (!btrfs_dev_is_sequential(device, physical))
   1720		return -EOPNOTSUPP;
   1721
   1722	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
   1723				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
   1724}
   1725
   1726static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
   1727			  struct blk_zone *zone)
   1728{
   1729	struct btrfs_io_context *bioc = NULL;
   1730	u64 mapped_length = PAGE_SIZE;
   1731	unsigned int nofs_flag;
   1732	int nmirrors;
   1733	int i, ret;
   1734
   1735	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
   1736			       &mapped_length, &bioc);
   1737	if (ret || !bioc || mapped_length < PAGE_SIZE) {
   1738		btrfs_put_bioc(bioc);
   1739		return -EIO;
   1740	}
   1741
   1742	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
   1743		return -EINVAL;
   1744
   1745	nofs_flag = memalloc_nofs_save();
   1746	nmirrors = (int)bioc->num_stripes;
   1747	for (i = 0; i < nmirrors; i++) {
   1748		u64 physical = bioc->stripes[i].physical;
   1749		struct btrfs_device *dev = bioc->stripes[i].dev;
   1750
   1751		/* Missing device */
   1752		if (!dev->bdev)
   1753			continue;
   1754
   1755		ret = btrfs_get_dev_zone(dev, physical, zone);
   1756		/* Failing device */
   1757		if (ret == -EIO || ret == -EOPNOTSUPP)
   1758			continue;
   1759		break;
   1760	}
   1761	memalloc_nofs_restore(nofs_flag);
   1762
   1763	return ret;
   1764}
   1765
   1766/*
   1767 * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
   1768 * filling zeros between @physical_pos to a write pointer of dev-replace
   1769 * source device.
   1770 */
   1771int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
   1772				    u64 physical_start, u64 physical_pos)
   1773{
   1774	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
   1775	struct blk_zone zone;
   1776	u64 length;
   1777	u64 wp;
   1778	int ret;
   1779
   1780	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
   1781		return 0;
   1782
   1783	ret = read_zone_info(fs_info, logical, &zone);
   1784	if (ret)
   1785		return ret;
   1786
   1787	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
   1788
   1789	if (physical_pos == wp)
   1790		return 0;
   1791
   1792	if (physical_pos > wp)
   1793		return -EUCLEAN;
   1794
   1795	length = wp - physical_pos;
   1796	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
   1797}
   1798
   1799struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
   1800					    u64 logical, u64 length)
   1801{
   1802	struct btrfs_device *device;
   1803	struct extent_map *em;
   1804	struct map_lookup *map;
   1805
   1806	em = btrfs_get_chunk_map(fs_info, logical, length);
   1807	if (IS_ERR(em))
   1808		return ERR_CAST(em);
   1809
   1810	map = em->map_lookup;
   1811	/* We only support single profile for now */
   1812	device = map->stripes[0].dev;
   1813
   1814	free_extent_map(em);
   1815
   1816	return device;
   1817}
   1818
   1819/**
   1820 * Activate block group and underlying device zones
   1821 *
   1822 * @block_group: the block group to activate
   1823 *
   1824 * Return: true on success, false otherwise
   1825 */
   1826bool btrfs_zone_activate(struct btrfs_block_group *block_group)
   1827{
   1828	struct btrfs_fs_info *fs_info = block_group->fs_info;
   1829	struct map_lookup *map;
   1830	struct btrfs_device *device;
   1831	u64 physical;
   1832	bool ret;
   1833	int i;
   1834
   1835	if (!btrfs_is_zoned(block_group->fs_info))
   1836		return true;
   1837
   1838	map = block_group->physical_map;
   1839
   1840	spin_lock(&block_group->lock);
   1841	if (block_group->zone_is_active) {
   1842		ret = true;
   1843		goto out_unlock;
   1844	}
   1845
   1846	/* No space left */
   1847	if (btrfs_zoned_bg_is_full(block_group)) {
   1848		ret = false;
   1849		goto out_unlock;
   1850	}
   1851
   1852	for (i = 0; i < map->num_stripes; i++) {
   1853		device = map->stripes[i].dev;
   1854		physical = map->stripes[i].physical;
   1855
   1856		if (device->zone_info->max_active_zones == 0)
   1857			continue;
   1858
   1859		if (!btrfs_dev_set_active_zone(device, physical)) {
   1860			/* Cannot activate the zone */
   1861			ret = false;
   1862			goto out_unlock;
   1863		}
   1864	}
   1865
   1866	/* Successfully activated all the zones */
   1867	block_group->zone_is_active = 1;
   1868	spin_unlock(&block_group->lock);
   1869
   1870	/* For the active block group list */
   1871	btrfs_get_block_group(block_group);
   1872
   1873	spin_lock(&fs_info->zone_active_bgs_lock);
   1874	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
   1875	spin_unlock(&fs_info->zone_active_bgs_lock);
   1876
   1877	return true;
   1878
   1879out_unlock:
   1880	spin_unlock(&block_group->lock);
   1881	return ret;
   1882}
   1883
   1884static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
   1885{
   1886	struct btrfs_fs_info *fs_info = block_group->fs_info;
   1887	struct map_lookup *map;
   1888	bool need_zone_finish;
   1889	int ret = 0;
   1890	int i;
   1891
   1892	spin_lock(&block_group->lock);
   1893	if (!block_group->zone_is_active) {
   1894		spin_unlock(&block_group->lock);
   1895		return 0;
   1896	}
   1897
   1898	/* Check if we have unwritten allocated space */
   1899	if ((block_group->flags &
   1900	     (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
   1901	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
   1902		spin_unlock(&block_group->lock);
   1903		return -EAGAIN;
   1904	}
   1905
   1906	/*
   1907	 * If we are sure that the block group is full (= no more room left for
   1908	 * new allocation) and the IO for the last usable block is completed, we
   1909	 * don't need to wait for the other IOs. This holds because we ensure
   1910	 * the sequential IO submissions using the ZONE_APPEND command for data
   1911	 * and block_group->meta_write_pointer for metadata.
   1912	 */
   1913	if (!fully_written) {
   1914		spin_unlock(&block_group->lock);
   1915
   1916		ret = btrfs_inc_block_group_ro(block_group, false);
   1917		if (ret)
   1918			return ret;
   1919
   1920		/* Ensure all writes in this block group finish */
   1921		btrfs_wait_block_group_reservations(block_group);
   1922		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
   1923		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
   1924					 block_group->length);
   1925
   1926		spin_lock(&block_group->lock);
   1927
   1928		/*
   1929		 * Bail out if someone already deactivated the block group, or
   1930		 * allocated space is left in the block group.
   1931		 */
   1932		if (!block_group->zone_is_active) {
   1933			spin_unlock(&block_group->lock);
   1934			btrfs_dec_block_group_ro(block_group);
   1935			return 0;
   1936		}
   1937
   1938		if (block_group->reserved) {
   1939			spin_unlock(&block_group->lock);
   1940			btrfs_dec_block_group_ro(block_group);
   1941			return -EAGAIN;
   1942		}
   1943	}
   1944
   1945	/*
   1946	 * The block group is not fully allocated, so not fully written yet. We
   1947	 * need to send ZONE_FINISH command to free up an active zone.
   1948	 */
   1949	need_zone_finish = !btrfs_zoned_bg_is_full(block_group);
   1950
   1951	block_group->zone_is_active = 0;
   1952	block_group->alloc_offset = block_group->zone_capacity;
   1953	block_group->free_space_ctl->free_space = 0;
   1954	btrfs_clear_treelog_bg(block_group);
   1955	btrfs_clear_data_reloc_bg(block_group);
   1956	spin_unlock(&block_group->lock);
   1957
   1958	map = block_group->physical_map;
   1959	for (i = 0; i < map->num_stripes; i++) {
   1960		struct btrfs_device *device = map->stripes[i].dev;
   1961		const u64 physical = map->stripes[i].physical;
   1962
   1963		if (device->zone_info->max_active_zones == 0)
   1964			continue;
   1965
   1966		if (need_zone_finish) {
   1967			ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
   1968					       physical >> SECTOR_SHIFT,
   1969					       device->zone_info->zone_size >> SECTOR_SHIFT,
   1970					       GFP_NOFS);
   1971
   1972			if (ret)
   1973				return ret;
   1974		}
   1975
   1976		btrfs_dev_clear_active_zone(device, physical);
   1977	}
   1978
   1979	if (!fully_written)
   1980		btrfs_dec_block_group_ro(block_group);
   1981
   1982	spin_lock(&fs_info->zone_active_bgs_lock);
   1983	ASSERT(!list_empty(&block_group->active_bg_list));
   1984	list_del_init(&block_group->active_bg_list);
   1985	spin_unlock(&fs_info->zone_active_bgs_lock);
   1986
   1987	/* For active_bg_list */
   1988	btrfs_put_block_group(block_group);
   1989
   1990	return 0;
   1991}
   1992
   1993int btrfs_zone_finish(struct btrfs_block_group *block_group)
   1994{
   1995	if (!btrfs_is_zoned(block_group->fs_info))
   1996		return 0;
   1997
   1998	return do_zone_finish(block_group, false);
   1999}
   2000
   2001bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
   2002{
   2003	struct btrfs_fs_info *fs_info = fs_devices->fs_info;
   2004	struct btrfs_device *device;
   2005	bool ret = false;
   2006
   2007	if (!btrfs_is_zoned(fs_info))
   2008		return true;
   2009
   2010	/* Check if there is a device with active zones left */
   2011	mutex_lock(&fs_info->chunk_mutex);
   2012	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
   2013		struct btrfs_zoned_device_info *zinfo = device->zone_info;
   2014
   2015		if (!device->bdev)
   2016			continue;
   2017
   2018		if (!zinfo->max_active_zones ||
   2019		    atomic_read(&zinfo->active_zones_left)) {
   2020			ret = true;
   2021			break;
   2022		}
   2023	}
   2024	mutex_unlock(&fs_info->chunk_mutex);
   2025
   2026	return ret;
   2027}
   2028
   2029void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
   2030{
   2031	struct btrfs_block_group *block_group;
   2032	u64 min_alloc_bytes;
   2033
   2034	if (!btrfs_is_zoned(fs_info))
   2035		return;
   2036
   2037	block_group = btrfs_lookup_block_group(fs_info, logical);
   2038	ASSERT(block_group);
   2039
   2040	/* No MIXED_BG on zoned btrfs. */
   2041	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
   2042		min_alloc_bytes = fs_info->sectorsize;
   2043	else
   2044		min_alloc_bytes = fs_info->nodesize;
   2045
   2046	/* Bail out if we can allocate more data from this block group. */
   2047	if (logical + length + min_alloc_bytes <=
   2048	    block_group->start + block_group->zone_capacity)
   2049		goto out;
   2050
   2051	do_zone_finish(block_group, true);
   2052
   2053out:
   2054	btrfs_put_block_group(block_group);
   2055}
   2056
   2057static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
   2058{
   2059	struct btrfs_block_group *bg =
   2060		container_of(work, struct btrfs_block_group, zone_finish_work);
   2061
   2062	wait_on_extent_buffer_writeback(bg->last_eb);
   2063	free_extent_buffer(bg->last_eb);
   2064	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
   2065	btrfs_put_block_group(bg);
   2066}
   2067
   2068void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
   2069				   struct extent_buffer *eb)
   2070{
   2071	if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
   2072		return;
   2073
   2074	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
   2075		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
   2076			  bg->start);
   2077		return;
   2078	}
   2079
   2080	/* For the work */
   2081	btrfs_get_block_group(bg);
   2082	atomic_inc(&eb->refs);
   2083	bg->last_eb = eb;
   2084	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
   2085	queue_work(system_unbound_wq, &bg->zone_finish_work);
   2086}
   2087
   2088void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
   2089{
   2090	struct btrfs_fs_info *fs_info = bg->fs_info;
   2091
   2092	spin_lock(&fs_info->relocation_bg_lock);
   2093	if (fs_info->data_reloc_bg == bg->start)
   2094		fs_info->data_reloc_bg = 0;
   2095	spin_unlock(&fs_info->relocation_bg_lock);
   2096}
   2097
   2098void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
   2099{
   2100	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   2101	struct btrfs_device *device;
   2102
   2103	if (!btrfs_is_zoned(fs_info))
   2104		return;
   2105
   2106	mutex_lock(&fs_devices->device_list_mutex);
   2107	list_for_each_entry(device, &fs_devices->devices, dev_list) {
   2108		if (device->zone_info) {
   2109			vfree(device->zone_info->zone_cache);
   2110			device->zone_info->zone_cache = NULL;
   2111		}
   2112	}
   2113	mutex_unlock(&fs_devices->device_list_mutex);
   2114}
   2115
   2116bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
   2117{
   2118	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
   2119	struct btrfs_device *device;
   2120	u64 used = 0;
   2121	u64 total = 0;
   2122	u64 factor;
   2123
   2124	ASSERT(btrfs_is_zoned(fs_info));
   2125
   2126	if (fs_info->bg_reclaim_threshold == 0)
   2127		return false;
   2128
   2129	mutex_lock(&fs_devices->device_list_mutex);
   2130	list_for_each_entry(device, &fs_devices->devices, dev_list) {
   2131		if (!device->bdev)
   2132			continue;
   2133
   2134		total += device->disk_total_bytes;
   2135		used += device->bytes_used;
   2136	}
   2137	mutex_unlock(&fs_devices->device_list_mutex);
   2138
   2139	factor = div64_u64(used * 100, total);
   2140	return factor >= fs_info->bg_reclaim_threshold;
   2141}
   2142
   2143void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
   2144				       u64 length)
   2145{
   2146	struct btrfs_block_group *block_group;
   2147
   2148	if (!btrfs_is_zoned(fs_info))
   2149		return;
   2150
   2151	block_group = btrfs_lookup_block_group(fs_info, logical);
   2152	/* It should be called on a previous data relocation block group. */
   2153	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));
   2154
   2155	spin_lock(&block_group->lock);
   2156	if (!block_group->zoned_data_reloc_ongoing)
   2157		goto out;
   2158
   2159	/* All relocation extents are written. */
   2160	if (block_group->start + block_group->alloc_offset == logical + length) {
   2161		/* Now, release this block group for further allocations. */
   2162		block_group->zoned_data_reloc_ongoing = 0;
   2163	}
   2164
   2165out:
   2166	spin_unlock(&block_group->lock);
   2167	btrfs_put_block_group(block_group);
   2168}