super.c - cachepc-linux - Fork of AMDESE/linux with modifications for CachePC side-channel attack

	cachepc-linux Fork of AMDESE/linux with modifications for CachePC side-channel attack
	git clone https://git.sinitax.com/sinitax/cachepc-linux
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt
super.c (51472B)
      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Simple file system for zoned block devices exposing zones as files.
      4 *
      5 * Copyright (C) 2019 Western Digital Corporation or its affiliates.
      6 */
      7#include <linux/module.h>
      8#include <linux/pagemap.h>
      9#include <linux/magic.h>
     10#include <linux/iomap.h>
     11#include <linux/init.h>
     12#include <linux/slab.h>
     13#include <linux/blkdev.h>
     14#include <linux/statfs.h>
     15#include <linux/writeback.h>
     16#include <linux/quotaops.h>
     17#include <linux/seq_file.h>
     18#include <linux/parser.h>
     19#include <linux/uio.h>
     20#include <linux/mman.h>
     21#include <linux/sched/mm.h>
     22#include <linux/crc32.h>
     23#include <linux/task_io_accounting_ops.h>
     24
     25#include "zonefs.h"
     26
     27#define CREATE_TRACE_POINTS
     28#include "trace.h"
     29
     30/*
     31 * Manage the active zone count. Called with zi->i_truncate_mutex held.
     32 */
     33static void zonefs_account_active(struct inode *inode)
     34{
     35	struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
     36	struct zonefs_inode_info *zi = ZONEFS_I(inode);
     37
     38	lockdep_assert_held(&zi->i_truncate_mutex);
     39
     40	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
     41		return;
     42
     43	/*
     44	 * If the zone is active, that is, if it is explicitly open or
     45	 * partially written, check if it was already accounted as active.
     46	 */
     47	if ((zi->i_flags & ZONEFS_ZONE_OPEN) ||
     48	    (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) {
     49		if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) {
     50			zi->i_flags |= ZONEFS_ZONE_ACTIVE;
     51			atomic_inc(&sbi->s_active_seq_files);
     52		}
     53		return;
     54	}
     55
     56	/* The zone is not active. If it was, update the active count */
     57	if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
     58		zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
     59		atomic_dec(&sbi->s_active_seq_files);
     60	}
     61}
     62
     63static inline int zonefs_zone_mgmt(struct inode *inode,
     64				   enum req_opf op)
     65{
     66	struct zonefs_inode_info *zi = ZONEFS_I(inode);
     67	int ret;
     68
     69	lockdep_assert_held(&zi->i_truncate_mutex);
     70
     71	/*
     72	 * With ZNS drives, closing an explicitly open zone that has not been
     73	 * written will change the zone state to "closed", that is, the zone
     74	 * will remain active. Since this can then cause failure of explicit
     75	 * open operation on other zones if the drive active zone resources
     76	 * are exceeded, make sure that the zone does not remain active by
     77	 * resetting it.
     78	 */
     79	if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset)
     80		op = REQ_OP_ZONE_RESET;
     81
     82	trace_zonefs_zone_mgmt(inode, op);
     83	ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
     84			       zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
     85	if (ret) {
     86		zonefs_err(inode->i_sb,
     87			   "Zone management operation %s at %llu failed %d\n",
     88			   blk_op_str(op), zi->i_zsector, ret);
     89		return ret;
     90	}
     91
     92	return 0;
     93}
     94
     95static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
     96{
     97	struct zonefs_inode_info *zi = ZONEFS_I(inode);
     98
     99	i_size_write(inode, isize);
    100	/*
    101	 * A full zone is no longer open/active and does not need
    102	 * explicit closing.
    103	 */
    104	if (isize >= zi->i_max_size) {
    105		struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
    106
    107		if (zi->i_flags & ZONEFS_ZONE_ACTIVE)
    108			atomic_dec(&sbi->s_active_seq_files);
    109		zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
    110	}
    111}
    112
    113static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
    114				   loff_t length, unsigned int flags,
    115				   struct iomap *iomap, struct iomap *srcmap)
    116{
    117	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    118	struct super_block *sb = inode->i_sb;
    119	loff_t isize;
    120
    121	/*
    122	 * All blocks are always mapped below EOF. If reading past EOF,
    123	 * act as if there is a hole up to the file maximum size.
    124	 */
    125	mutex_lock(&zi->i_truncate_mutex);
    126	iomap->bdev = inode->i_sb->s_bdev;
    127	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
    128	isize = i_size_read(inode);
    129	if (iomap->offset >= isize) {
    130		iomap->type = IOMAP_HOLE;
    131		iomap->addr = IOMAP_NULL_ADDR;
    132		iomap->length = length;
    133	} else {
    134		iomap->type = IOMAP_MAPPED;
    135		iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
    136		iomap->length = isize - iomap->offset;
    137	}
    138	mutex_unlock(&zi->i_truncate_mutex);
    139
    140	trace_zonefs_iomap_begin(inode, iomap);
    141
    142	return 0;
    143}
    144
    145static const struct iomap_ops zonefs_read_iomap_ops = {
    146	.iomap_begin	= zonefs_read_iomap_begin,
    147};
    148
    149static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
    150				    loff_t length, unsigned int flags,
    151				    struct iomap *iomap, struct iomap *srcmap)
    152{
    153	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    154	struct super_block *sb = inode->i_sb;
    155	loff_t isize;
    156
    157	/* All write I/Os should always be within the file maximum size */
    158	if (WARN_ON_ONCE(offset + length > zi->i_max_size))
    159		return -EIO;
    160
    161	/*
    162	 * Sequential zones can only accept direct writes. This is already
    163	 * checked when writes are issued, so warn if we see a page writeback
    164	 * operation.
    165	 */
    166	if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
    167			 !(flags & IOMAP_DIRECT)))
    168		return -EIO;
    169
    170	/*
    171	 * For conventional zones, all blocks are always mapped. For sequential
    172	 * zones, all blocks after always mapped below the inode size (zone
    173	 * write pointer) and unwriten beyond.
    174	 */
    175	mutex_lock(&zi->i_truncate_mutex);
    176	iomap->bdev = inode->i_sb->s_bdev;
    177	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
    178	iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
    179	isize = i_size_read(inode);
    180	if (iomap->offset >= isize) {
    181		iomap->type = IOMAP_UNWRITTEN;
    182		iomap->length = zi->i_max_size - iomap->offset;
    183	} else {
    184		iomap->type = IOMAP_MAPPED;
    185		iomap->length = isize - iomap->offset;
    186	}
    187	mutex_unlock(&zi->i_truncate_mutex);
    188
    189	trace_zonefs_iomap_begin(inode, iomap);
    190
    191	return 0;
    192}
    193
    194static const struct iomap_ops zonefs_write_iomap_ops = {
    195	.iomap_begin	= zonefs_write_iomap_begin,
    196};
    197
    198static int zonefs_read_folio(struct file *unused, struct folio *folio)
    199{
    200	return iomap_read_folio(folio, &zonefs_read_iomap_ops);
    201}
    202
    203static void zonefs_readahead(struct readahead_control *rac)
    204{
    205	iomap_readahead(rac, &zonefs_read_iomap_ops);
    206}
    207
    208/*
    209 * Map blocks for page writeback. This is used only on conventional zone files,
    210 * which implies that the page range can only be within the fixed inode size.
    211 */
    212static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
    213				   struct inode *inode, loff_t offset)
    214{
    215	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    216
    217	if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
    218		return -EIO;
    219	if (WARN_ON_ONCE(offset >= i_size_read(inode)))
    220		return -EIO;
    221
    222	/* If the mapping is already OK, nothing needs to be done */
    223	if (offset >= wpc->iomap.offset &&
    224	    offset < wpc->iomap.offset + wpc->iomap.length)
    225		return 0;
    226
    227	return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
    228					IOMAP_WRITE, &wpc->iomap, NULL);
    229}
    230
    231static const struct iomap_writeback_ops zonefs_writeback_ops = {
    232	.map_blocks		= zonefs_write_map_blocks,
    233};
    234
    235static int zonefs_writepage(struct page *page, struct writeback_control *wbc)
    236{
    237	struct iomap_writepage_ctx wpc = { };
    238
    239	return iomap_writepage(page, wbc, &wpc, &zonefs_writeback_ops);
    240}
    241
    242static int zonefs_writepages(struct address_space *mapping,
    243			     struct writeback_control *wbc)
    244{
    245	struct iomap_writepage_ctx wpc = { };
    246
    247	return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
    248}
    249
    250static int zonefs_swap_activate(struct swap_info_struct *sis,
    251				struct file *swap_file, sector_t *span)
    252{
    253	struct inode *inode = file_inode(swap_file);
    254	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    255
    256	if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
    257		zonefs_err(inode->i_sb,
    258			   "swap file: not a conventional zone file\n");
    259		return -EINVAL;
    260	}
    261
    262	return iomap_swapfile_activate(sis, swap_file, span,
    263				       &zonefs_read_iomap_ops);
    264}
    265
/*
 * Address space operations table. Reads, writeback and swap activation use
 * the zonefs handlers defined above; the remaining entries point to iomap
 * and generic helpers.
 */
static const struct address_space_operations zonefs_file_aops = {
	.read_folio		= zonefs_read_folio,
	.readahead		= zonefs_readahead,
	.writepage		= zonefs_writepage,
	.writepages		= zonefs_writepages,
	.dirty_folio		= filemap_dirty_folio,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
	.migratepage		= iomap_migrate_page,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
	.direct_IO		= noop_direct_IO,
	.swap_activate		= zonefs_swap_activate,
};
    280
    281static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
    282{
    283	struct super_block *sb = inode->i_sb;
    284	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
    285	loff_t old_isize = i_size_read(inode);
    286	loff_t nr_blocks;
    287
    288	if (new_isize == old_isize)
    289		return;
    290
    291	spin_lock(&sbi->s_lock);
    292
    293	/*
    294	 * This may be called for an update after an IO error.
    295	 * So beware of the values seen.
    296	 */
    297	if (new_isize < old_isize) {
    298		nr_blocks = (old_isize - new_isize) >> sb->s_blocksize_bits;
    299		if (sbi->s_used_blocks > nr_blocks)
    300			sbi->s_used_blocks -= nr_blocks;
    301		else
    302			sbi->s_used_blocks = 0;
    303	} else {
    304		sbi->s_used_blocks +=
    305			(new_isize - old_isize) >> sb->s_blocksize_bits;
    306		if (sbi->s_used_blocks > sbi->s_blocks)
    307			sbi->s_used_blocks = sbi->s_blocks;
    308	}
    309
    310	spin_unlock(&sbi->s_lock);
    311}
    312
    313/*
    314 * Check a zone condition and adjust its file inode access permissions for
    315 * offline and readonly zones. Return the inode size corresponding to the
    316 * amount of readable data in the zone.
    317 */
    318static loff_t zonefs_check_zone_condition(struct inode *inode,
    319					  struct blk_zone *zone, bool warn,
    320					  bool mount)
    321{
    322	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    323
    324	switch (zone->cond) {
    325	case BLK_ZONE_COND_OFFLINE:
    326		/*
    327		 * Dead zone: make the inode immutable, disable all accesses
    328		 * and set the file size to 0 (zone wp set to zone start).
    329		 */
    330		if (warn)
    331			zonefs_warn(inode->i_sb, "inode %lu: offline zone\n",
    332				    inode->i_ino);
    333		inode->i_flags |= S_IMMUTABLE;
    334		inode->i_mode &= ~0777;
    335		zone->wp = zone->start;
    336		return 0;
    337	case BLK_ZONE_COND_READONLY:
    338		/*
    339		 * The write pointer of read-only zones is invalid. If such a
    340		 * zone is found during mount, the file size cannot be retrieved
    341		 * so we treat the zone as offline (mount == true case).
    342		 * Otherwise, keep the file size as it was when last updated
    343		 * so that the user can recover data. In both cases, writes are
    344		 * always disabled for the zone.
    345		 */
    346		if (warn)
    347			zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
    348				    inode->i_ino);
    349		inode->i_flags |= S_IMMUTABLE;
    350		if (mount) {
    351			zone->cond = BLK_ZONE_COND_OFFLINE;
    352			inode->i_mode &= ~0777;
    353			zone->wp = zone->start;
    354			return 0;
    355		}
    356		inode->i_mode &= ~0222;
    357		return i_size_read(inode);
    358	case BLK_ZONE_COND_FULL:
    359		/* The write pointer of full zones is invalid. */
    360		return zi->i_max_size;
    361	default:
    362		if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
    363			return zi->i_max_size;
    364		return (zone->wp - zone->start) << SECTOR_SHIFT;
    365	}
    366}
    367
/*
 * Context passed to zonefs_io_error_cb() through the report zones callback
 * data pointer.
 */
struct zonefs_ioerr_data {
	struct inode	*inode;	/* inode of the file that saw the IO error */
	bool		write;	/* true if the failed IO was a write */
};
    372
    373static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
    374			      void *data)
    375{
    376	struct zonefs_ioerr_data *err = data;
    377	struct inode *inode = err->inode;
    378	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    379	struct super_block *sb = inode->i_sb;
    380	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
    381	loff_t isize, data_size;
    382
    383	/*
    384	 * Check the zone condition: if the zone is not "bad" (offline or
    385	 * read-only), read errors are simply signaled to the IO issuer as long
    386	 * as there is no inconsistency between the inode size and the amount of
    387	 * data writen in the zone (data_size).
    388	 */
    389	data_size = zonefs_check_zone_condition(inode, zone, true, false);
    390	isize = i_size_read(inode);
    391	if (zone->cond != BLK_ZONE_COND_OFFLINE &&
    392	    zone->cond != BLK_ZONE_COND_READONLY &&
    393	    !err->write && isize == data_size)
    394		return 0;
    395
    396	/*
    397	 * At this point, we detected either a bad zone or an inconsistency
    398	 * between the inode size and the amount of data written in the zone.
    399	 * For the latter case, the cause may be a write IO error or an external
    400	 * action on the device. Two error patterns exist:
    401	 * 1) The inode size is lower than the amount of data in the zone:
    402	 *    a write operation partially failed and data was writen at the end
    403	 *    of the file. This can happen in the case of a large direct IO
    404	 *    needing several BIOs and/or write requests to be processed.
    405	 * 2) The inode size is larger than the amount of data in the zone:
    406	 *    this can happen with a deferred write error with the use of the
    407	 *    device side write cache after getting successful write IO
    408	 *    completions. Other possibilities are (a) an external corruption,
    409	 *    e.g. an application reset the zone directly, or (b) the device
    410	 *    has a serious problem (e.g. firmware bug).
    411	 *
    412	 * In all cases, warn about inode size inconsistency and handle the
    413	 * IO error according to the zone condition and to the mount options.
    414	 */
    415	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size)
    416		zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n",
    417			    inode->i_ino, isize, data_size);
    418
    419	/*
    420	 * First handle bad zones signaled by hardware. The mount options
    421	 * errors=zone-ro and errors=zone-offline result in changing the
    422	 * zone condition to read-only and offline respectively, as if the
    423	 * condition was signaled by the hardware.
    424	 */
    425	if (zone->cond == BLK_ZONE_COND_OFFLINE ||
    426	    sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) {
    427		zonefs_warn(sb, "inode %lu: read/write access disabled\n",
    428			    inode->i_ino);
    429		if (zone->cond != BLK_ZONE_COND_OFFLINE) {
    430			zone->cond = BLK_ZONE_COND_OFFLINE;
    431			data_size = zonefs_check_zone_condition(inode, zone,
    432								false, false);
    433		}
    434	} else if (zone->cond == BLK_ZONE_COND_READONLY ||
    435		   sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) {
    436		zonefs_warn(sb, "inode %lu: write access disabled\n",
    437			    inode->i_ino);
    438		if (zone->cond != BLK_ZONE_COND_READONLY) {
    439			zone->cond = BLK_ZONE_COND_READONLY;
    440			data_size = zonefs_check_zone_condition(inode, zone,
    441								false, false);
    442		}
    443	}
    444
    445	/*
    446	 * If the filesystem is mounted with the explicit-open mount option, we
    447	 * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to
    448	 * the read-only or offline condition, to avoid attempting an explicit
    449	 * close of the zone when the inode file is closed.
    450	 */
    451	if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
    452	    (zone->cond == BLK_ZONE_COND_OFFLINE ||
    453	     zone->cond == BLK_ZONE_COND_READONLY))
    454		zi->i_flags &= ~ZONEFS_ZONE_OPEN;
    455
    456	/*
    457	 * If error=remount-ro was specified, any error result in remounting
    458	 * the volume as read-only.
    459	 */
    460	if ((sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) && !sb_rdonly(sb)) {
    461		zonefs_warn(sb, "remounting filesystem read-only\n");
    462		sb->s_flags |= SB_RDONLY;
    463	}
    464
    465	/*
    466	 * Update block usage stats and the inode size  to prevent access to
    467	 * invalid data.
    468	 */
    469	zonefs_update_stats(inode, data_size);
    470	zonefs_i_size_write(inode, data_size);
    471	zi->i_wpoffset = data_size;
    472	zonefs_account_active(inode);
    473
    474	return 0;
    475}
    476
    477/*
    478 * When an file IO error occurs, check the file zone to see if there is a change
    479 * in the zone condition (e.g. offline or read-only). For a failed write to a
    480 * sequential zone, the zone write pointer position must also be checked to
    481 * eventually correct the file size and zonefs inode write pointer offset
    482 * (which can be out of sync with the drive due to partial write failures).
    483 */
    484static void __zonefs_io_error(struct inode *inode, bool write)
    485{
    486	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    487	struct super_block *sb = inode->i_sb;
    488	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
    489	unsigned int noio_flag;
    490	unsigned int nr_zones =
    491		zi->i_zone_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
    492	struct zonefs_ioerr_data err = {
    493		.inode = inode,
    494		.write = write,
    495	};
    496	int ret;
    497
    498	/*
    499	 * Memory allocations in blkdev_report_zones() can trigger a memory
    500	 * reclaim which may in turn cause a recursion into zonefs as well as
    501	 * struct request allocations for the same device. The former case may
    502	 * end up in a deadlock on the inode truncate mutex, while the latter
    503	 * may prevent IO forward progress. Executing the report zones under
    504	 * the GFP_NOIO context avoids both problems.
    505	 */
    506	noio_flag = memalloc_noio_save();
    507	ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones,
    508				  zonefs_io_error_cb, &err);
    509	if (ret != nr_zones)
    510		zonefs_err(sb, "Get inode %lu zone information failed %d\n",
    511			   inode->i_ino, ret);
    512	memalloc_noio_restore(noio_flag);
    513}
    514
    515static void zonefs_io_error(struct inode *inode, bool write)
    516{
    517	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    518
    519	mutex_lock(&zi->i_truncate_mutex);
    520	__zonefs_io_error(inode, write);
    521	mutex_unlock(&zi->i_truncate_mutex);
    522}
    523
    524static int zonefs_file_truncate(struct inode *inode, loff_t isize)
    525{
    526	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    527	loff_t old_isize;
    528	enum req_opf op;
    529	int ret = 0;
    530
    531	/*
    532	 * Only sequential zone files can be truncated and truncation is allowed
    533	 * only down to a 0 size, which is equivalent to a zone reset, and to
    534	 * the maximum file size, which is equivalent to a zone finish.
    535	 */
    536	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
    537		return -EPERM;
    538
    539	if (!isize)
    540		op = REQ_OP_ZONE_RESET;
    541	else if (isize == zi->i_max_size)
    542		op = REQ_OP_ZONE_FINISH;
    543	else
    544		return -EPERM;
    545
    546	inode_dio_wait(inode);
    547
    548	/* Serialize against page faults */
    549	filemap_invalidate_lock(inode->i_mapping);
    550
    551	/* Serialize against zonefs_iomap_begin() */
    552	mutex_lock(&zi->i_truncate_mutex);
    553
    554	old_isize = i_size_read(inode);
    555	if (isize == old_isize)
    556		goto unlock;
    557
    558	ret = zonefs_zone_mgmt(inode, op);
    559	if (ret)
    560		goto unlock;
    561
    562	/*
    563	 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
    564	 * take care of open zones.
    565	 */
    566	if (zi->i_flags & ZONEFS_ZONE_OPEN) {
    567		/*
    568		 * Truncating a zone to EMPTY or FULL is the equivalent of
    569		 * closing the zone. For a truncation to 0, we need to
    570		 * re-open the zone to ensure new writes can be processed.
    571		 * For a truncation to the maximum file size, the zone is
    572		 * closed and writes cannot be accepted anymore, so clear
    573		 * the open flag.
    574		 */
    575		if (!isize)
    576			ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
    577		else
    578			zi->i_flags &= ~ZONEFS_ZONE_OPEN;
    579	}
    580
    581	zonefs_update_stats(inode, isize);
    582	truncate_setsize(inode, isize);
    583	zi->i_wpoffset = isize;
    584	zonefs_account_active(inode);
    585
    586unlock:
    587	mutex_unlock(&zi->i_truncate_mutex);
    588	filemap_invalidate_unlock(inode->i_mapping);
    589
    590	return ret;
    591}
    592
    593static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
    594				struct dentry *dentry, struct iattr *iattr)
    595{
    596	struct inode *inode = d_inode(dentry);
    597	int ret;
    598
    599	if (unlikely(IS_IMMUTABLE(inode)))
    600		return -EPERM;
    601
    602	ret = setattr_prepare(&init_user_ns, dentry, iattr);
    603	if (ret)
    604		return ret;
    605
    606	/*
    607	 * Since files and directories cannot be created nor deleted, do not
    608	 * allow setting any write attributes on the sub-directories grouping
    609	 * files by zone type.
    610	 */
    611	if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) &&
    612	    (iattr->ia_mode & 0222))
    613		return -EPERM;
    614
    615	if (((iattr->ia_valid & ATTR_UID) &&
    616	     !uid_eq(iattr->ia_uid, inode->i_uid)) ||
    617	    ((iattr->ia_valid & ATTR_GID) &&
    618	     !gid_eq(iattr->ia_gid, inode->i_gid))) {
    619		ret = dquot_transfer(inode, iattr);
    620		if (ret)
    621			return ret;
    622	}
    623
    624	if (iattr->ia_valid & ATTR_SIZE) {
    625		ret = zonefs_file_truncate(inode, iattr->ia_size);
    626		if (ret)
    627			return ret;
    628	}
    629
    630	setattr_copy(&init_user_ns, inode, iattr);
    631
    632	return 0;
    633}
    634
    635static const struct inode_operations zonefs_file_inode_operations = {
    636	.setattr	= zonefs_inode_setattr,
    637};
    638
    639static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
    640			     int datasync)
    641{
    642	struct inode *inode = file_inode(file);
    643	int ret = 0;
    644
    645	if (unlikely(IS_IMMUTABLE(inode)))
    646		return -EPERM;
    647
    648	/*
    649	 * Since only direct writes are allowed in sequential files, page cache
    650	 * flush is needed only for conventional zone files.
    651	 */
    652	if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
    653		ret = file_write_and_wait_range(file, start, end);
    654	if (!ret)
    655		ret = blkdev_issue_flush(inode->i_sb->s_bdev);
    656
    657	if (ret)
    658		zonefs_io_error(inode, true);
    659
    660	return ret;
    661}
    662
    663static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
    664{
    665	struct inode *inode = file_inode(vmf->vma->vm_file);
    666	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    667	vm_fault_t ret;
    668
    669	if (unlikely(IS_IMMUTABLE(inode)))
    670		return VM_FAULT_SIGBUS;
    671
    672	/*
    673	 * Sanity check: only conventional zone files can have shared
    674	 * writeable mappings.
    675	 */
    676	if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
    677		return VM_FAULT_NOPAGE;
    678
    679	sb_start_pagefault(inode->i_sb);
    680	file_update_time(vmf->vma->vm_file);
    681
    682	/* Serialize against truncates */
    683	filemap_invalidate_lock_shared(inode->i_mapping);
    684	ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
    685	filemap_invalidate_unlock_shared(inode->i_mapping);
    686
    687	sb_end_pagefault(inode->i_sb);
    688	return ret;
    689}
    690
    691static const struct vm_operations_struct zonefs_file_vm_ops = {
    692	.fault		= filemap_fault,
    693	.map_pages	= filemap_map_pages,
    694	.page_mkwrite	= zonefs_filemap_page_mkwrite,
    695};
    696
    697static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
    698{
    699	/*
    700	 * Conventional zones accept random writes, so their files can support
    701	 * shared writable mappings. For sequential zone files, only read
    702	 * mappings are possible since there are no guarantees for write
    703	 * ordering between msync() and page cache writeback.
    704	 */
    705	if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ &&
    706	    (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
    707		return -EINVAL;
    708
    709	file_accessed(file);
    710	vma->vm_ops = &zonefs_file_vm_ops;
    711
    712	return 0;
    713}
    714
    715static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
    716{
    717	loff_t isize = i_size_read(file_inode(file));
    718
    719	/*
    720	 * Seeks are limited to below the zone size for conventional zones
    721	 * and below the zone write pointer for sequential zones. In both
    722	 * cases, this limit is the inode size.
    723	 */
    724	return generic_file_llseek_size(file, offset, whence, isize, isize);
    725}
    726
    727static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
    728					int error, unsigned int flags)
    729{
    730	struct inode *inode = file_inode(iocb->ki_filp);
    731	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    732
    733	if (error) {
    734		zonefs_io_error(inode, true);
    735		return error;
    736	}
    737
    738	if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) {
    739		/*
    740		 * Note that we may be seeing completions out of order,
    741		 * but that is not a problem since a write completed
    742		 * successfully necessarily means that all preceding writes
    743		 * were also successful. So we can safely increase the inode
    744		 * size to the write end location.
    745		 */
    746		mutex_lock(&zi->i_truncate_mutex);
    747		if (i_size_read(inode) < iocb->ki_pos + size) {
    748			zonefs_update_stats(inode, iocb->ki_pos + size);
    749			zonefs_i_size_write(inode, iocb->ki_pos + size);
    750		}
    751		mutex_unlock(&zi->i_truncate_mutex);
    752	}
    753
    754	return 0;
    755}
    756
    757static const struct iomap_dio_ops zonefs_write_dio_ops = {
    758	.end_io			= zonefs_file_write_dio_end_io,
    759};
    760
    761static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
    762{
    763	struct inode *inode = file_inode(iocb->ki_filp);
    764	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    765	struct block_device *bdev = inode->i_sb->s_bdev;
    766	unsigned int max = bdev_max_zone_append_sectors(bdev);
    767	struct bio *bio;
    768	ssize_t size;
    769	int nr_pages;
    770	ssize_t ret;
    771
    772	max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
    773	iov_iter_truncate(from, max);
    774
    775	nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
    776	if (!nr_pages)
    777		return 0;
    778
    779	bio = bio_alloc(bdev, nr_pages,
    780			REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
    781	bio->bi_iter.bi_sector = zi->i_zsector;
    782	bio->bi_ioprio = iocb->ki_ioprio;
    783	if (iocb->ki_flags & IOCB_DSYNC)
    784		bio->bi_opf |= REQ_FUA;
    785
    786	ret = bio_iov_iter_get_pages(bio, from);
    787	if (unlikely(ret))
    788		goto out_release;
    789
    790	size = bio->bi_iter.bi_size;
    791	task_io_account_write(size);
    792
    793	if (iocb->ki_flags & IOCB_HIPRI)
    794		bio_set_polled(bio, iocb);
    795
    796	ret = submit_bio_wait(bio);
    797
    798	zonefs_file_write_dio_end_io(iocb, size, ret, 0);
    799	trace_zonefs_file_dio_append(inode, size, ret);
    800
    801out_release:
    802	bio_release_pages(bio, false);
    803	bio_put(bio);
    804
    805	if (ret >= 0) {
    806		iocb->ki_pos += size;
    807		return size;
    808	}
    809
    810	return ret;
    811}
    812
    813/*
    814 * Do not exceed the LFS limits nor the file zone size. If pos is under the
    815 * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
    816 */
    817static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
    818					loff_t count)
    819{
    820	struct inode *inode = file_inode(file);
    821	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    822	loff_t limit = rlimit(RLIMIT_FSIZE);
    823	loff_t max_size = zi->i_max_size;
    824
    825	if (limit != RLIM_INFINITY) {
    826		if (pos >= limit) {
    827			send_sig(SIGXFSZ, current, 0);
    828			return -EFBIG;
    829		}
    830		count = min(count, limit - pos);
    831	}
    832
    833	if (!(file->f_flags & O_LARGEFILE))
    834		max_size = min_t(loff_t, MAX_NON_LFS, max_size);
    835
    836	if (unlikely(pos >= max_size))
    837		return -EFBIG;
    838
    839	return min(count, max_size - pos);
    840}
    841
    842static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
    843{
    844	struct file *file = iocb->ki_filp;
    845	struct inode *inode = file_inode(file);
    846	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    847	loff_t count;
    848
    849	if (IS_SWAPFILE(inode))
    850		return -ETXTBSY;
    851
    852	if (!iov_iter_count(from))
    853		return 0;
    854
    855	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
    856		return -EINVAL;
    857
    858	if (iocb->ki_flags & IOCB_APPEND) {
    859		if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
    860			return -EINVAL;
    861		mutex_lock(&zi->i_truncate_mutex);
    862		iocb->ki_pos = zi->i_wpoffset;
    863		mutex_unlock(&zi->i_truncate_mutex);
    864	}
    865
    866	count = zonefs_write_check_limits(file, iocb->ki_pos,
    867					  iov_iter_count(from));
    868	if (count < 0)
    869		return count;
    870
    871	iov_iter_truncate(from, count);
    872	return iov_iter_count(from);
    873}
    874
/*
 * Handle direct writes. For sequential zone files, this is the only possible
 * write path. For these files, check that the user is issuing writes
 * sequentially from the end of the file. This code assumes that the block layer
 * delivers write requests to the device in sequential order. This is always the
 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 * elevator feature is being used (e.g. mq-deadline). The block layer always
 * automatically selects such an elevator for zoned block devices during the
 * device initialization.
 *
 * The inode lock is held for the duration of the call. The return value is
 * either an error from the checks done here (-EOPNOTSUPP, -EAGAIN, -EINVAL or
 * whatever zonefs_write_checks() reports) or the value returned by
 * zonefs_file_dio_append() / iomap_dio_rw().
 */
static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	bool sync = is_sync_kiocb(iocb);
	bool append = false;
	ssize_t ret, count;

	/*
	 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
	 * as this can cause write reordering (e.g. the first aio gets EAGAIN
	 * on the inode lock but the second goes through but is now unaligned).
	 */
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync &&
	    (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	/* A zero or negative count means nothing to write or an error */
	count = zonefs_write_checks(iocb, from);
	if (count <= 0) {
		ret = count;
		goto inode_unlock;
	}

	/* Both the position and the length must be block size aligned */
	if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
		ret = -EINVAL;
		goto inode_unlock;
	}

	/* Enforce sequential writes (append only) in sequential zones */
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
		mutex_lock(&zi->i_truncate_mutex);
		if (iocb->ki_pos != zi->i_wpoffset) {
			mutex_unlock(&zi->i_truncate_mutex);
			ret = -EINVAL;
			goto inode_unlock;
		}
		mutex_unlock(&zi->i_truncate_mutex);
		/* Only synchronous writes take the zonefs_file_dio_append() path */
		append = sync;
	}

	if (append)
		ret = zonefs_file_dio_append(iocb, from);
	else
		ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
				   &zonefs_write_dio_ops, 0, NULL, 0);
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
	    (ret > 0 || ret == -EIOCBQUEUED)) {
		/*
		 * For a completed write, account the bytes actually written.
		 * For a queued async write (-EIOCBQUEUED), count still holds
		 * the full request size computed by zonefs_write_checks().
		 */
		if (ret > 0)
			count = ret;

		/*
		 * Update the zone write pointer offset assuming the write
		 * operation succeeded. If it did not, the error recovery path
		 * will correct it. Also do active seq file accounting.
		 */
		mutex_lock(&zi->i_truncate_mutex);
		zi->i_wpoffset += count;
		zonefs_account_active(inode);
		mutex_unlock(&zi->i_truncate_mutex);
	}

inode_unlock:
	inode_unlock(inode);

	return ret;
}
    959
    960static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
    961					  struct iov_iter *from)
    962{
    963	struct inode *inode = file_inode(iocb->ki_filp);
    964	struct zonefs_inode_info *zi = ZONEFS_I(inode);
    965	ssize_t ret;
    966
    967	/*
    968	 * Direct IO writes are mandatory for sequential zone files so that the
    969	 * write IO issuing order is preserved.
    970	 */
    971	if (zi->i_ztype != ZONEFS_ZTYPE_CNV)
    972		return -EIO;
    973
    974	if (iocb->ki_flags & IOCB_NOWAIT) {
    975		if (!inode_trylock(inode))
    976			return -EAGAIN;
    977	} else {
    978		inode_lock(inode);
    979	}
    980
    981	ret = zonefs_write_checks(iocb, from);
    982	if (ret <= 0)
    983		goto inode_unlock;
    984
    985	ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
    986	if (ret > 0)
    987		iocb->ki_pos += ret;
    988	else if (ret == -EIO)
    989		zonefs_io_error(inode, true);
    990
    991inode_unlock:
    992	inode_unlock(inode);
    993	if (ret > 0)
    994		ret = generic_write_sync(iocb, ret);
    995
    996	return ret;
    997}
    998
    999static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
   1000{
   1001	struct inode *inode = file_inode(iocb->ki_filp);
   1002
   1003	if (unlikely(IS_IMMUTABLE(inode)))
   1004		return -EPERM;
   1005
   1006	if (sb_rdonly(inode->i_sb))
   1007		return -EROFS;
   1008
   1009	/* Write operations beyond the zone size are not allowed */
   1010	if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
   1011		return -EFBIG;
   1012
   1013	if (iocb->ki_flags & IOCB_DIRECT) {
   1014		ssize_t ret = zonefs_file_dio_write(iocb, from);
   1015		if (ret != -ENOTBLK)
   1016			return ret;
   1017	}
   1018
   1019	return zonefs_file_buffered_write(iocb, from);
   1020}
   1021
   1022static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
   1023				       int error, unsigned int flags)
   1024{
   1025	if (error) {
   1026		zonefs_io_error(file_inode(iocb->ki_filp), false);
   1027		return error;
   1028	}
   1029
   1030	return 0;
   1031}
   1032
/* Direct IO operations passed to iomap_dio_rw() by zonefs_file_read_iter() */
static const struct iomap_dio_ops zonefs_read_dio_ops = {
	.end_io			= zonefs_file_read_dio_end_io,
};
   1036
   1037static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
   1038{
   1039	struct inode *inode = file_inode(iocb->ki_filp);
   1040	struct zonefs_inode_info *zi = ZONEFS_I(inode);
   1041	struct super_block *sb = inode->i_sb;
   1042	loff_t isize;
   1043	ssize_t ret;
   1044
   1045	/* Offline zones cannot be read */
   1046	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
   1047		return -EPERM;
   1048
   1049	if (iocb->ki_pos >= zi->i_max_size)
   1050		return 0;
   1051
   1052	if (iocb->ki_flags & IOCB_NOWAIT) {
   1053		if (!inode_trylock_shared(inode))
   1054			return -EAGAIN;
   1055	} else {
   1056		inode_lock_shared(inode);
   1057	}
   1058
   1059	/* Limit read operations to written data */
   1060	mutex_lock(&zi->i_truncate_mutex);
   1061	isize = i_size_read(inode);
   1062	if (iocb->ki_pos >= isize) {
   1063		mutex_unlock(&zi->i_truncate_mutex);
   1064		ret = 0;
   1065		goto inode_unlock;
   1066	}
   1067	iov_iter_truncate(to, isize - iocb->ki_pos);
   1068	mutex_unlock(&zi->i_truncate_mutex);
   1069
   1070	if (iocb->ki_flags & IOCB_DIRECT) {
   1071		size_t count = iov_iter_count(to);
   1072
   1073		if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
   1074			ret = -EINVAL;
   1075			goto inode_unlock;
   1076		}
   1077		file_accessed(iocb->ki_filp);
   1078		ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
   1079				   &zonefs_read_dio_ops, 0, NULL, 0);
   1080	} else {
   1081		ret = generic_file_read_iter(iocb, to);
   1082		if (ret == -EIO)
   1083			zonefs_io_error(inode, false);
   1084	}
   1085
   1086inode_unlock:
   1087	inode_unlock_shared(inode);
   1088
   1089	return ret;
   1090}
   1091
   1092/*
   1093 * Write open accounting is done only for sequential files.
   1094 */
   1095static inline bool zonefs_seq_file_need_wro(struct inode *inode,
   1096					    struct file *file)
   1097{
   1098	struct zonefs_inode_info *zi = ZONEFS_I(inode);
   1099
   1100	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
   1101		return false;
   1102
   1103	if (!(file->f_mode & FMODE_WRITE))
   1104		return false;
   1105
   1106	return true;
   1107}
   1108
/*
 * Account a new writer on a sequential zone file.
 *
 * The first writer of a file (i_wr_refcnt == 0) increments the superblock
 * count of write-open sequential files. With the explicit open mount option,
 * that first writer is refused with -EBUSY if the count exceeds a non-zero
 * s_max_wro_seq_files, and the zone is explicitly opened on the device if the
 * file is not yet full.
 *
 * Return: 0 on success, -EBUSY if the write-open limit is exceeded, or the
 * error returned by the zone open operation. On failure, neither the
 * superblock counter nor i_wr_refcnt is left incremented.
 */
static int zonefs_seq_file_write_open(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	if (!zi->i_wr_refcnt) {
		struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
		/* Counted unconditionally; the limit is only checked below */
		unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);

		if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {

			/* A limit of 0 means that there is no limit */
			if (sbi->s_max_wro_seq_files
			    && wro > sbi->s_max_wro_seq_files) {
				atomic_dec(&sbi->s_wro_seq_files);
				ret = -EBUSY;
				goto unlock;
			}

			/* A file at its maximum size has no zone to open */
			if (i_size_read(inode) < zi->i_max_size) {
				ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
				if (ret) {
					atomic_dec(&sbi->s_wro_seq_files);
					goto unlock;
				}
				zi->i_flags |= ZONEFS_ZONE_OPEN;
				zonefs_account_active(inode);
			}
		}
	}

	zi->i_wr_refcnt++;

unlock:
	mutex_unlock(&zi->i_truncate_mutex);

	return ret;
}
   1148
   1149static int zonefs_file_open(struct inode *inode, struct file *file)
   1150{
   1151	int ret;
   1152
   1153	ret = generic_file_open(inode, file);
   1154	if (ret)
   1155		return ret;
   1156
   1157	if (zonefs_seq_file_need_wro(inode, file))
   1158		return zonefs_seq_file_write_open(inode);
   1159
   1160	return 0;
   1161}
   1162
/*
 * Drop a writer of a sequential zone file. When the last writer goes away,
 * close the zone if it is still explicitly open and decrement the superblock
 * count of write-open sequential files. Failures are handled here and are not
 * reported to the caller.
 */
static void zonefs_seq_file_write_close(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	/* Nothing more to do while other writers remain */
	zi->i_wr_refcnt--;
	if (zi->i_wr_refcnt)
		goto unlock;

	/*
	 * The file zone may not be open anymore (e.g. the file was truncated to
	 * its maximum size or it was fully written). For this case, we only
	 * need to decrement the write open count.
	 */
	if (zi->i_flags & ZONEFS_ZONE_OPEN) {
		ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
		if (ret) {
			__zonefs_io_error(inode, false);
			/*
			 * Leaving zones explicitly open may lead to a state
			 * where most zones cannot be written (zone resources
			 * exhausted). So take preventive action by remounting
			 * read-only.
			 */
			if (zi->i_flags & ZONEFS_ZONE_OPEN &&
			    !(sb->s_flags & SB_RDONLY)) {
				zonefs_warn(sb,
					"closing zone at %llu failed %d\n",
					zi->i_zsector, ret);
				zonefs_warn(sb,
					"remounting filesystem read-only\n");
				sb->s_flags |= SB_RDONLY;
			}
			/* The write open count is not decremented on this path */
			goto unlock;
		}

		zi->i_flags &= ~ZONEFS_ZONE_OPEN;
		zonefs_account_active(inode);
	}

	atomic_dec(&sbi->s_wro_seq_files);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
}
   1212
   1213static int zonefs_file_release(struct inode *inode, struct file *file)
   1214{
   1215	/*
   1216	 * If we explicitly open a zone we must close it again as well, but the
   1217	 * zone management operation can fail (either due to an IO error or as
   1218	 * the zone has gone offline or read-only). Make sure we don't fail the
   1219	 * close(2) for user-space.
   1220	 */
   1221	if (zonefs_seq_file_need_wro(inode, file))
   1222		zonefs_seq_file_write_close(inode);
   1223
   1224	return 0;
   1225}
   1226
/*
 * File operations for zone files, installed as i_fop by
 * zonefs_init_file_inode().
 */
static const struct file_operations zonefs_file_operations = {
	.open		= zonefs_file_open,
	.release	= zonefs_file_release,
	.fsync		= zonefs_file_fsync,
	.mmap		= zonefs_file_mmap,
	.llseek		= zonefs_file_llseek,
	.read_iter	= zonefs_file_read_iter,
	.write_iter	= zonefs_file_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
};
   1239
/* Slab cache for struct zonefs_inode_info, see zonefs_init_inodecache() */
static struct kmem_cache *zonefs_inode_cachep;
   1241
   1242static struct inode *zonefs_alloc_inode(struct super_block *sb)
   1243{
   1244	struct zonefs_inode_info *zi;
   1245
   1246	zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL);
   1247	if (!zi)
   1248		return NULL;
   1249
   1250	inode_init_once(&zi->i_vnode);
   1251	mutex_init(&zi->i_truncate_mutex);
   1252	zi->i_wr_refcnt = 0;
   1253	zi->i_flags = 0;
   1254
   1255	return &zi->i_vnode;
   1256}
   1257
   1258static void zonefs_free_inode(struct inode *inode)
   1259{
   1260	kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode));
   1261}
   1262
   1263/*
   1264 * File system stat.
   1265 */
   1266static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
   1267{
   1268	struct super_block *sb = dentry->d_sb;
   1269	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
   1270	enum zonefs_ztype t;
   1271
   1272	buf->f_type = ZONEFS_MAGIC;
   1273	buf->f_bsize = sb->s_blocksize;
   1274	buf->f_namelen = ZONEFS_NAME_MAX;
   1275
   1276	spin_lock(&sbi->s_lock);
   1277
   1278	buf->f_blocks = sbi->s_blocks;
   1279	if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks))
   1280		buf->f_bfree = 0;
   1281	else
   1282		buf->f_bfree = buf->f_blocks - sbi->s_used_blocks;
   1283	buf->f_bavail = buf->f_bfree;
   1284
   1285	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
   1286		if (sbi->s_nr_files[t])
   1287			buf->f_files += sbi->s_nr_files[t] + 1;
   1288	}
   1289	buf->f_ffree = 0;
   1290
   1291	spin_unlock(&sbi->s_lock);
   1292
   1293	buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b);
   1294
   1295	return 0;
   1296}
   1297
/*
 * Mount option identifiers returned by match_token() in
 * zonefs_parse_options(). Opt_err marks the end of the token table.
 */
enum {
	Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
	Opt_explicit_open, Opt_err,
};
   1302
/* Mount option strings and the identifier each one maps to */
static const match_table_t tokens = {
	{ Opt_errors_ro,	"errors=remount-ro"},
	{ Opt_errors_zro,	"errors=zone-ro"},
	{ Opt_errors_zol,	"errors=zone-offline"},
	{ Opt_errors_repair,	"errors=repair"},
	{ Opt_explicit_open,	"explicit-open" },
	{ Opt_err,		NULL}
};
   1311
   1312static int zonefs_parse_options(struct super_block *sb, char *options)
   1313{
   1314	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
   1315	substring_t args[MAX_OPT_ARGS];
   1316	char *p;
   1317
   1318	if (!options)
   1319		return 0;
   1320
   1321	while ((p = strsep(&options, ",")) != NULL) {
   1322		int token;
   1323
   1324		if (!*p)
   1325			continue;
   1326
   1327		token = match_token(p, tokens, args);
   1328		switch (token) {
   1329		case Opt_errors_ro:
   1330			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
   1331			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO;
   1332			break;
   1333		case Opt_errors_zro:
   1334			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
   1335			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO;
   1336			break;
   1337		case Opt_errors_zol:
   1338			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
   1339			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL;
   1340			break;
   1341		case Opt_errors_repair:
   1342			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
   1343			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
   1344			break;
   1345		case Opt_explicit_open:
   1346			sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
   1347			break;
   1348		default:
   1349			return -EINVAL;
   1350		}
   1351	}
   1352
   1353	return 0;
   1354}
   1355
   1356static int zonefs_show_options(struct seq_file *seq, struct dentry *root)
   1357{
   1358	struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb);
   1359
   1360	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO)
   1361		seq_puts(seq, ",errors=remount-ro");
   1362	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)
   1363		seq_puts(seq, ",errors=zone-ro");
   1364	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)
   1365		seq_puts(seq, ",errors=zone-offline");
   1366	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR)
   1367		seq_puts(seq, ",errors=repair");
   1368
   1369	return 0;
   1370}
   1371
   1372static int zonefs_remount(struct super_block *sb, int *flags, char *data)
   1373{
   1374	sync_filesystem(sb);
   1375
   1376	return zonefs_parse_options(sb, data);
   1377}
   1378
/* Super block operations, installed as sb->s_op by zonefs_fill_super() */
static const struct super_operations zonefs_sops = {
	.alloc_inode	= zonefs_alloc_inode,
	.free_inode	= zonefs_free_inode,
	.statfs		= zonefs_statfs,
	.remount_fs	= zonefs_remount,
	.show_options	= zonefs_show_options,
};
   1386
/*
 * Inode operations for directories: used for the root directory and for the
 * zone group directories.
 */
static const struct inode_operations zonefs_dir_inode_operations = {
	.lookup		= simple_lookup,
	.setattr	= zonefs_inode_setattr,
};
   1391
   1392static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
   1393				  enum zonefs_ztype type)
   1394{
   1395	struct super_block *sb = parent->i_sb;
   1396
   1397	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1;
   1398	inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
   1399	inode->i_op = &zonefs_dir_inode_operations;
   1400	inode->i_fop = &simple_dir_operations;
   1401	set_nlink(inode, 2);
   1402	inc_nlink(parent);
   1403}
   1404
/*
 * Initialize @inode as the file of zone group @type backed by @zone, and add
 * the file to the superblock block usage counters.
 *
 * Return: 0 on success, or the error of the zone close operation issued for a
 * sequential zone found open.
 */
static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
				  enum zonefs_ztype type)
{
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	int ret = 0;

	/* The inode number is the zone start sector shifted by the zone size */
	inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
	inode->i_mode = S_IFREG | sbi->s_perm;

	zi->i_ztype = type;
	zi->i_zsector = zone->start;
	zi->i_zone_size = zone->len << SECTOR_SHIFT;

	/* The maximum file size is the zone capacity, capped to MAX_LFS_FILESIZE */
	zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
			       zone->capacity << SECTOR_SHIFT);
	/* NOTE(review): presumably may also alter the inode for bad zones - confirm */
	zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true);

	inode->i_uid = sbi->s_uid;
	inode->i_gid = sbi->s_gid;
	/* The file size is the write pointer offset within the zone */
	inode->i_size = zi->i_wpoffset;
	inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT;

	inode->i_op = &zonefs_file_inode_operations;
	inode->i_fop = &zonefs_file_operations;
	inode->i_mapping->a_ops = &zonefs_file_aops;

	/* s_maxbytes tracks the largest file maximum size seen so far */
	sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
	sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
	sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;

	mutex_lock(&zi->i_truncate_mutex);

	/*
	 * For sequential zones, make sure that any open zone is closed first
	 * to ensure that the initial number of open zones is 0, in sync with
	 * the open zone accounting done when the mount option
	 * ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
	 */
	if (type == ZONEFS_ZTYPE_SEQ &&
	    (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
	     zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
		ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
		if (ret)
			goto unlock;
	}

	zonefs_account_active(inode);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);

	return ret;
}
   1460
   1461static struct dentry *zonefs_create_inode(struct dentry *parent,
   1462					const char *name, struct blk_zone *zone,
   1463					enum zonefs_ztype type)
   1464{
   1465	struct inode *dir = d_inode(parent);
   1466	struct dentry *dentry;
   1467	struct inode *inode;
   1468	int ret;
   1469
   1470	dentry = d_alloc_name(parent, name);
   1471	if (!dentry)
   1472		return NULL;
   1473
   1474	inode = new_inode(parent->d_sb);
   1475	if (!inode)
   1476		goto dput;
   1477
   1478	inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
   1479	if (zone) {
   1480		ret = zonefs_init_file_inode(inode, zone, type);
   1481		if (ret) {
   1482			iput(inode);
   1483			goto dput;
   1484		}
   1485	} else {
   1486		zonefs_init_dir_inode(dir, inode, type);
   1487	}
   1488
   1489	d_add(dentry, inode);
   1490	dir->i_size++;
   1491
   1492	return dentry;
   1493
   1494dput:
   1495	dput(dentry);
   1496
   1497	return NULL;
   1498}
   1499
/*
 * Zone information gathered at mount time.
 */
struct zonefs_zone_data {
	/* Super block being mounted */
	struct super_block	*sb;
	/* Number of zones of each type, not counting the zone at index 0 */
	unsigned int		nr_zones[ZONEFS_ZTYPE_MAX];
	/* Copy of the reported zones, indexed by zone number */
	struct blk_zone		*zones;
};
   1505
/*
 * Create a zone group and populate it with zone files.
 *
 * The group directory is named "cnv" for conventional zones and "seq"
 * otherwise. Files are named after their number within the group, starting
 * at 0. Nothing is created for a group without any zone.
 *
 * Return: 0 on success, -ENOMEM if an allocation or an inode creation failed,
 * -EINVAL if aggregated conventional zones have a capacity different from
 * their length.
 */
static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
				enum zonefs_ztype type)
{
	struct super_block *sb = zd->sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	struct blk_zone *zone, *next, *end;
	const char *zgroup_name;
	char *file_name;
	struct dentry *dir;
	unsigned int n = 0;
	int ret;

	/* If the group is empty, there is nothing to do */
	if (!zd->nr_zones[type])
		return 0;

	/* Scratch buffer for the file names, released before returning */
	file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
	if (!file_name)
		return -ENOMEM;

	if (type == ZONEFS_ZTYPE_CNV)
		zgroup_name = "cnv";
	else
		zgroup_name = "seq";

	dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
	if (!dir) {
		ret = -ENOMEM;
		goto free;
	}

	/*
	 * The first zone contains the super block: skip it.
	 */
	end = zd->zones + blkdev_nr_zones(sb->s_bdev->bd_disk);
	for (zone = &zd->zones[1]; zone < end; zone = next) {

		next = zone + 1;
		if (zonefs_zone_type(zone) != type)
			continue;

		/*
		 * For conventional zones, contiguous zones can be aggregated
		 * together to form larger files. Note that this overwrites the
		 * length of the first zone of the set of contiguous zones
		 * aggregated together. If one offline or read-only zone is
		 * found, assume that all zones aggregated have the same
		 * condition.
		 */
		if (type == ZONEFS_ZTYPE_CNV &&
		    (sbi->s_features & ZONEFS_F_AGGRCNV)) {
			/* next ends up on the first zone after the aggregated set */
			for (; next < end; next++) {
				if (zonefs_zone_type(next) != type)
					break;
				zone->len += next->len;
				zone->capacity += next->capacity;
				/* An offline condition takes precedence over read-only */
				if (next->cond == BLK_ZONE_COND_READONLY &&
				    zone->cond != BLK_ZONE_COND_OFFLINE)
					zone->cond = BLK_ZONE_COND_READONLY;
				else if (next->cond == BLK_ZONE_COND_OFFLINE)
					zone->cond = BLK_ZONE_COND_OFFLINE;
			}
			if (zone->capacity != zone->len) {
				zonefs_err(sb, "Invalid conventional zone capacity\n");
				ret = -EINVAL;
				goto free;
			}
		}

		/*
		 * Use the file number within its group as file name.
		 */
		snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
		if (!zonefs_create_inode(dir, file_name, zone, type)) {
			ret = -ENOMEM;
			goto free;
		}

		n++;
	}

	zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
		    zgroup_name, n, n > 1 ? "s" : "");

	sbi->s_nr_files[type] = n;
	ret = 0;

free:
	kfree(file_name);

	return ret;
}
   1601
   1602static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx,
   1603				   void *data)
   1604{
   1605	struct zonefs_zone_data *zd = data;
   1606
   1607	/*
   1608	 * Count the number of usable zones: the first zone at index 0 contains
   1609	 * the super block and is ignored.
   1610	 */
   1611	switch (zone->type) {
   1612	case BLK_ZONE_TYPE_CONVENTIONAL:
   1613		zone->wp = zone->start + zone->len;
   1614		if (idx)
   1615			zd->nr_zones[ZONEFS_ZTYPE_CNV]++;
   1616		break;
   1617	case BLK_ZONE_TYPE_SEQWRITE_REQ:
   1618	case BLK_ZONE_TYPE_SEQWRITE_PREF:
   1619		if (idx)
   1620			zd->nr_zones[ZONEFS_ZTYPE_SEQ]++;
   1621		break;
   1622	default:
   1623		zonefs_err(zd->sb, "Unsupported zone type 0x%x\n",
   1624			   zone->type);
   1625		return -EIO;
   1626	}
   1627
   1628	memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone));
   1629
   1630	return 0;
   1631}
   1632
   1633static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
   1634{
   1635	struct block_device *bdev = zd->sb->s_bdev;
   1636	int ret;
   1637
   1638	zd->zones = kvcalloc(blkdev_nr_zones(bdev->bd_disk),
   1639			     sizeof(struct blk_zone), GFP_KERNEL);
   1640	if (!zd->zones)
   1641		return -ENOMEM;
   1642
   1643	/* Get zones information from the device */
   1644	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
   1645				  zonefs_get_zone_info_cb, zd);
   1646	if (ret < 0) {
   1647		zonefs_err(zd->sb, "Zone report failed %d\n", ret);
   1648		return ret;
   1649	}
   1650
   1651	if (ret != blkdev_nr_zones(bdev->bd_disk)) {
   1652		zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n",
   1653			   ret, blkdev_nr_zones(bdev->bd_disk));
   1654		return -EIO;
   1655	}
   1656
   1657	return 0;
   1658}
   1659
   1660static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd)
   1661{
   1662	kvfree(zd->zones);
   1663}
   1664
   1665/*
   1666 * Read super block information from the device.
   1667 */
   1668static int zonefs_read_super(struct super_block *sb)
   1669{
   1670	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
   1671	struct zonefs_super *super;
   1672	u32 crc, stored_crc;
   1673	struct page *page;
   1674	struct bio_vec bio_vec;
   1675	struct bio bio;
   1676	int ret;
   1677
   1678	page = alloc_page(GFP_KERNEL);
   1679	if (!page)
   1680		return -ENOMEM;
   1681
   1682	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
   1683	bio.bi_iter.bi_sector = 0;
   1684	bio_add_page(&bio, page, PAGE_SIZE, 0);
   1685
   1686	ret = submit_bio_wait(&bio);
   1687	if (ret)
   1688		goto free_page;
   1689
   1690	super = kmap(page);
   1691
   1692	ret = -EINVAL;
   1693	if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
   1694		goto unmap;
   1695
   1696	stored_crc = le32_to_cpu(super->s_crc);
   1697	super->s_crc = 0;
   1698	crc = crc32(~0U, (unsigned char *)super, sizeof(struct zonefs_super));
   1699	if (crc != stored_crc) {
   1700		zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
   1701			   crc, stored_crc);
   1702		goto unmap;
   1703	}
   1704
   1705	sbi->s_features = le64_to_cpu(super->s_features);
   1706	if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
   1707		zonefs_err(sb, "Unknown features set 0x%llx\n",
   1708			   sbi->s_features);
   1709		goto unmap;
   1710	}
   1711
   1712	if (sbi->s_features & ZONEFS_F_UID) {
   1713		sbi->s_uid = make_kuid(current_user_ns(),
   1714				       le32_to_cpu(super->s_uid));
   1715		if (!uid_valid(sbi->s_uid)) {
   1716			zonefs_err(sb, "Invalid UID feature\n");
   1717			goto unmap;
   1718		}
   1719	}
   1720
   1721	if (sbi->s_features & ZONEFS_F_GID) {
   1722		sbi->s_gid = make_kgid(current_user_ns(),
   1723				       le32_to_cpu(super->s_gid));
   1724		if (!gid_valid(sbi->s_gid)) {
   1725			zonefs_err(sb, "Invalid GID feature\n");
   1726			goto unmap;
   1727		}
   1728	}
   1729
   1730	if (sbi->s_features & ZONEFS_F_PERM)
   1731		sbi->s_perm = le32_to_cpu(super->s_perm);
   1732
   1733	if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
   1734		zonefs_err(sb, "Reserved area is being used\n");
   1735		goto unmap;
   1736	}
   1737
   1738	import_uuid(&sbi->s_uuid, super->s_uuid);
   1739	ret = 0;
   1740
   1741unmap:
   1742	kunmap(page);
   1743free_page:
   1744	__free_page(page);
   1745
   1746	return ret;
   1747}
   1748
   1749/*
   1750 * Check that the device is zoned. If it is, get the list of zones and create
   1751 * sub-directories and files according to the device zone configuration and
   1752 * format options.
   1753 */
   1754static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
   1755{
   1756	struct zonefs_zone_data zd;
   1757	struct zonefs_sb_info *sbi;
   1758	struct inode *inode;
   1759	enum zonefs_ztype t;
   1760	int ret;
   1761
   1762	if (!bdev_is_zoned(sb->s_bdev)) {
   1763		zonefs_err(sb, "Not a zoned block device\n");
   1764		return -EINVAL;
   1765	}
   1766
   1767	/*
   1768	 * Initialize super block information: the maximum file size is updated
   1769	 * when the zone files are created so that the format option
   1770	 * ZONEFS_F_AGGRCNV which increases the maximum file size of a file
   1771	 * beyond the zone size is taken into account.
   1772	 */
   1773	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
   1774	if (!sbi)
   1775		return -ENOMEM;
   1776
   1777	spin_lock_init(&sbi->s_lock);
   1778	sb->s_fs_info = sbi;
   1779	sb->s_magic = ZONEFS_MAGIC;
   1780	sb->s_maxbytes = 0;
   1781	sb->s_op = &zonefs_sops;
   1782	sb->s_time_gran	= 1;
   1783
   1784	/*
   1785	 * The block size is set to the device zone write granularity to ensure
   1786	 * that write operations are always aligned according to the device
   1787	 * interface constraints.
   1788	 */
   1789	sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev));
   1790	sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev));
   1791	sbi->s_uid = GLOBAL_ROOT_UID;
   1792	sbi->s_gid = GLOBAL_ROOT_GID;
   1793	sbi->s_perm = 0640;
   1794	sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
   1795
   1796	atomic_set(&sbi->s_wro_seq_files, 0);
   1797	sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
   1798	atomic_set(&sbi->s_active_seq_files, 0);
   1799	sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);
   1800
   1801	ret = zonefs_read_super(sb);
   1802	if (ret)
   1803		return ret;
   1804
   1805	ret = zonefs_parse_options(sb, data);
   1806	if (ret)
   1807		return ret;
   1808
   1809	memset(&zd, 0, sizeof(struct zonefs_zone_data));
   1810	zd.sb = sb;
   1811	ret = zonefs_get_zone_info(&zd);
   1812	if (ret)
   1813		goto cleanup;
   1814
   1815	ret = zonefs_sysfs_register(sb);
   1816	if (ret)
   1817		goto cleanup;
   1818
   1819	zonefs_info(sb, "Mounting %u zones",
   1820		    blkdev_nr_zones(sb->s_bdev->bd_disk));
   1821
   1822	if (!sbi->s_max_wro_seq_files &&
   1823	    !sbi->s_max_active_seq_files &&
   1824	    sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
   1825		zonefs_info(sb,
   1826			"No open and active zone limits. Ignoring explicit_open mount option\n");
   1827		sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
   1828	}
   1829
   1830	/* Create root directory inode */
   1831	ret = -ENOMEM;
   1832	inode = new_inode(sb);
   1833	if (!inode)
   1834		goto cleanup;
   1835
   1836	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk);
   1837	inode->i_mode = S_IFDIR | 0555;
   1838	inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);
   1839	inode->i_op = &zonefs_dir_inode_operations;
   1840	inode->i_fop = &simple_dir_operations;
   1841	set_nlink(inode, 2);
   1842
   1843	sb->s_root = d_make_root(inode);
   1844	if (!sb->s_root)
   1845		goto cleanup;
   1846
   1847	/* Create and populate files in zone groups directories */
   1848	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
   1849		ret = zonefs_create_zgroup(&zd, t);
   1850		if (ret)
   1851			break;
   1852	}
   1853
   1854cleanup:
   1855	zonefs_cleanup_zone_info(&zd);
   1856
   1857	return ret;
   1858}
   1859
   1860static struct dentry *zonefs_mount(struct file_system_type *fs_type,
   1861				   int flags, const char *dev_name, void *data)
   1862{
   1863	return mount_bdev(fs_type, flags, dev_name, data, zonefs_fill_super);
   1864}
   1865
/*
 * kill_sb operation: tear down the dentry tree, the sysfs entry and the super
 * block, then free the super block info allocated by zonefs_fill_super().
 */
static void zonefs_kill_super(struct super_block *sb)
{
	/* Fetched up front, before kill_block_super() is called on sb */
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);

	/* s_root is not set if the mount failed before creating the root */
	if (sb->s_root)
		d_genocide(sb->s_root);

	zonefs_sysfs_unregister(sb);
	kill_block_super(sb);
	kfree(sbi);
}
   1877
/*
 * File system definition and registration.
 *
 * The type is registered under the name "zonefs" and has the
 * FS_REQUIRES_DEV flag set.
 */
static struct file_system_type zonefs_type = {
	.owner		= THIS_MODULE,
	.name		= "zonefs",
	.mount		= zonefs_mount,
	.kill_sb	= zonefs_kill_super,
	.fs_flags	= FS_REQUIRES_DEV,
};
   1888
   1889static int __init zonefs_init_inodecache(void)
   1890{
   1891	zonefs_inode_cachep = kmem_cache_create("zonefs_inode_cache",
   1892			sizeof(struct zonefs_inode_info), 0,
   1893			(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
   1894			NULL);
   1895	if (zonefs_inode_cachep == NULL)
   1896		return -ENOMEM;
   1897	return 0;
   1898}
   1899
/*
 * Destroy the inode cache created by zonefs_init_inodecache().
 */
static void zonefs_destroy_inodecache(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy the inode cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(zonefs_inode_cachep);
}
   1909
   1910static int __init zonefs_init(void)
   1911{
   1912	int ret;
   1913
   1914	BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE);
   1915
   1916	ret = zonefs_init_inodecache();
   1917	if (ret)
   1918		return ret;
   1919
   1920	ret = register_filesystem(&zonefs_type);
   1921	if (ret)
   1922		goto destroy_inodecache;
   1923
   1924	ret = zonefs_sysfs_init();
   1925	if (ret)
   1926		goto unregister_fs;
   1927
   1928	return 0;
   1929
   1930unregister_fs:
   1931	unregister_filesystem(&zonefs_type);
   1932destroy_inodecache:
   1933	zonefs_destroy_inodecache();
   1934
   1935	return ret;
   1936}
   1937
   1938static void __exit zonefs_exit(void)
   1939{
   1940	zonefs_sysfs_exit();
   1941	zonefs_destroy_inodecache();
   1942	unregister_filesystem(&zonefs_type);
   1943}
   1944
/* Module metadata and entry points */
MODULE_AUTHOR("Damien Le Moal");
MODULE_DESCRIPTION("Zone file system for zoned block devices");
MODULE_LICENSE("GPL");
MODULE_ALIAS_FS("zonefs");
module_init(zonefs_init);
module_exit(zonefs_exit);