cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

inode.c (183492B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 *  linux/fs/ext4/inode.c
      4 *
      5 * Copyright (C) 1992, 1993, 1994, 1995
      6 * Remy Card (card@masi.ibp.fr)
      7 * Laboratoire MASI - Institut Blaise Pascal
      8 * Universite Pierre et Marie Curie (Paris VI)
      9 *
     10 *  from
     11 *
     12 *  linux/fs/minix/inode.c
     13 *
     14 *  Copyright (C) 1991, 1992  Linus Torvalds
     15 *
     16 *  64-bit file support on 64-bit platforms by Jakub Jelinek
     17 *	(jj@sunsite.ms.mff.cuni.cz)
     18 *
     19 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
     20 */
     21
     22#include <linux/fs.h>
     23#include <linux/mount.h>
     24#include <linux/time.h>
     25#include <linux/highuid.h>
     26#include <linux/pagemap.h>
     27#include <linux/dax.h>
     28#include <linux/quotaops.h>
     29#include <linux/string.h>
     30#include <linux/buffer_head.h>
     31#include <linux/writeback.h>
     32#include <linux/pagevec.h>
     33#include <linux/mpage.h>
     34#include <linux/namei.h>
     35#include <linux/uio.h>
     36#include <linux/bio.h>
     37#include <linux/workqueue.h>
     38#include <linux/kernel.h>
     39#include <linux/printk.h>
     40#include <linux/slab.h>
     41#include <linux/bitops.h>
     42#include <linux/iomap.h>
     43#include <linux/iversion.h>
     44
     45#include "ext4_jbd2.h"
     46#include "xattr.h"
     47#include "acl.h"
     48#include "truncate.h"
     49
     50#include <trace/events/ext4.h>
     51
     52static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
     53			      struct ext4_inode_info *ei)
     54{
     55	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
     56	__u32 csum;
     57	__u16 dummy_csum = 0;
     58	int offset = offsetof(struct ext4_inode, i_checksum_lo);
     59	unsigned int csum_size = sizeof(dummy_csum);
     60
     61	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
     62	csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
     63	offset += csum_size;
     64	csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
     65			   EXT4_GOOD_OLD_INODE_SIZE - offset);
     66
     67	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
     68		offset = offsetof(struct ext4_inode, i_checksum_hi);
     69		csum = ext4_chksum(sbi, csum, (__u8 *)raw +
     70				   EXT4_GOOD_OLD_INODE_SIZE,
     71				   offset - EXT4_GOOD_OLD_INODE_SIZE);
     72		if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
     73			csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
     74					   csum_size);
     75			offset += csum_size;
     76		}
     77		csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
     78				   EXT4_INODE_SIZE(inode->i_sb) - offset);
     79	}
     80
     81	return csum;
     82}
     83
     84static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
     85				  struct ext4_inode_info *ei)
     86{
     87	__u32 provided, calculated;
     88
     89	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
     90	    cpu_to_le32(EXT4_OS_LINUX) ||
     91	    !ext4_has_metadata_csum(inode->i_sb))
     92		return 1;
     93
     94	provided = le16_to_cpu(raw->i_checksum_lo);
     95	calculated = ext4_inode_csum(inode, raw, ei);
     96	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
     97	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
     98		provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
     99	else
    100		calculated &= 0xFFFF;
    101
    102	return provided == calculated;
    103}
    104
    105void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
    106			 struct ext4_inode_info *ei)
    107{
    108	__u32 csum;
    109
    110	if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
    111	    cpu_to_le32(EXT4_OS_LINUX) ||
    112	    !ext4_has_metadata_csum(inode->i_sb))
    113		return;
    114
    115	csum = ext4_inode_csum(inode, raw, ei);
    116	raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
    117	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
    118	    EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
    119		raw->i_checksum_hi = cpu_to_le16(csum >> 16);
    120}
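
The checksum computed above is a full 32-bit value, but the classic 128-byte inode only has room for the 16-bit i_checksum_lo field; the upper half goes into i_checksum_hi only when the larger inode can hold it, which is why ext4_inode_csum_verify() masks the calculated value to 16 bits otherwise. A minimal user-space sketch of that split and recombination (struct toy_inode and its fields are illustrative stand-ins, not the on-disk ext4_inode layout):

#include <stdint.h>

struct toy_inode {
    uint16_t checksum_lo;
    uint16_t checksum_hi;
    int has_hi;              /* plays the role of EXT4_FITS_IN_INODE() */
};

void store_csum(struct toy_inode *ti, uint32_t csum)
{
    ti->checksum_lo = (uint16_t)(csum & 0xFFFF);
    if (ti->has_hi)
        ti->checksum_hi = (uint16_t)(csum >> 16);
}

int verify_csum(const struct toy_inode *ti, uint32_t calculated)
{
    uint32_t provided = ti->checksum_lo;

    if (ti->has_hi)
        provided |= (uint32_t)ti->checksum_hi << 16;
    else
        calculated &= 0xFFFF;   /* only 16 bits were ever stored */
    return provided == calculated;
}
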
    121
    122static inline int ext4_begin_ordered_truncate(struct inode *inode,
    123					      loff_t new_size)
    124{
    125	trace_ext4_begin_ordered_truncate(inode, new_size);
    126	/*
    127	 * If jinode is zero, then we never opened the file for
    128	 * writing, so there's no need to call
    129	 * jbd2_journal_begin_ordered_truncate() since there's no
    130	 * outstanding writes we need to flush.
    131	 */
    132	if (!EXT4_I(inode)->jinode)
    133		return 0;
    134	return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode),
    135						   EXT4_I(inode)->jinode,
    136						   new_size);
    137}
    138
    139static int __ext4_journalled_writepage(struct page *page, unsigned int len);
    140static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
    141				  int pextents);
    142
    143/*
    144 * Test whether an inode is a fast symlink.
    145 * A fast symlink has its symlink data stored in ext4_inode_info->i_data.
    146 */
    147int ext4_inode_is_fast_symlink(struct inode *inode)
    148{
    149	if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
    150		int ea_blocks = EXT4_I(inode)->i_file_acl ?
    151				EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
    152
    153		if (ext4_has_inline_data(inode))
    154			return 0;
    155
    156		return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
    157	}
    158	return S_ISLNK(inode->i_mode) && inode->i_size &&
    159	       (inode->i_size < EXT4_N_BLOCKS * 4);
    160}
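
The i_size test above works because a fast symlink keeps its target string in the inode's i_data array, which holds EXT4_N_BLOCKS (15 in mainline ext4) 32-bit block pointers, i.e. 60 bytes. A tiny sketch of the same fits-inline check with the constant spelled out (TOY_N_BLOCKS is an illustrative stand-in):

#include <string.h>

#define TOY_N_BLOCKS 15   /* 12 direct + 1 indirect + 1 double + 1 triple */

/* A target shorter than 15 * 4 = 60 bytes fits in the inode's i_data. */
int target_fits_inline(const char *target)
{
    return strlen(target) < TOY_N_BLOCKS * 4;
}
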
    161
    162/*
    163 * Called at the last iput() if i_nlink is zero.
    164 */
    165void ext4_evict_inode(struct inode *inode)
    166{
    167	handle_t *handle;
    168	int err;
    169	/*
    170	 * Credits for final inode cleanup and freeing:
    171	 * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
    172	 * (xattr block freeing), bitmap, group descriptor (inode freeing)
    173	 */
    174	int extra_credits = 6;
    175	struct ext4_xattr_inode_array *ea_inode_array = NULL;
    176	bool freeze_protected = false;
    177
    178	trace_ext4_evict_inode(inode);
    179
    180	if (inode->i_nlink) {
    181		/*
    182		 * When journalling data dirty buffers are tracked only in the
    183		 * journal. So although mm thinks everything is clean and
     184		 * ready for reaping, the inode might still have some pages to
    185		 * write in the running transaction or waiting to be
    186		 * checkpointed. Thus calling jbd2_journal_invalidate_folio()
    187		 * (via truncate_inode_pages()) to discard these buffers can
    188		 * cause data loss. Also even if we did not discard these
    189		 * buffers, we would have no way to find them after the inode
    190		 * is reaped and thus user could see stale data if he tries to
    191		 * read them before the transaction is checkpointed. So be
    192		 * careful and force everything to disk here... We use
    193		 * ei->i_datasync_tid to store the newest transaction
    194		 * containing inode's data.
    195		 *
    196		 * Note that directories do not have this problem because they
    197		 * don't use page cache.
    198		 */
    199		if (inode->i_ino != EXT4_JOURNAL_INO &&
    200		    ext4_should_journal_data(inode) &&
    201		    S_ISREG(inode->i_mode) && inode->i_data.nrpages) {
    202			journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
    203			tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
    204
    205			jbd2_complete_transaction(journal, commit_tid);
    206			filemap_write_and_wait(&inode->i_data);
    207		}
    208		truncate_inode_pages_final(&inode->i_data);
    209
    210		goto no_delete;
    211	}
    212
    213	if (is_bad_inode(inode))
    214		goto no_delete;
    215	dquot_initialize(inode);
    216
    217	if (ext4_should_order_data(inode))
    218		ext4_begin_ordered_truncate(inode, 0);
    219	truncate_inode_pages_final(&inode->i_data);
    220
    221	/*
    222	 * For inodes with journalled data, transaction commit could have
    223	 * dirtied the inode. Flush worker is ignoring it because of I_FREEING
    224	 * flag but we still need to remove the inode from the writeback lists.
    225	 */
    226	if (!list_empty_careful(&inode->i_io_list)) {
    227		WARN_ON_ONCE(!ext4_should_journal_data(inode));
    228		inode_io_list_del(inode);
    229	}
    230
    231	/*
    232	 * Protect us against freezing - iput() caller didn't have to have any
    233	 * protection against it. When we are in a running transaction though,
    234	 * we are already protected against freezing and we cannot grab further
    235	 * protection due to lock ordering constraints.
    236	 */
    237	if (!ext4_journal_current_handle()) {
    238		sb_start_intwrite(inode->i_sb);
    239		freeze_protected = true;
    240	}
    241
    242	if (!IS_NOQUOTA(inode))
    243		extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
    244
    245	/*
    246	 * Block bitmap, group descriptor, and inode are accounted in both
    247	 * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
    248	 */
    249	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
    250			 ext4_blocks_for_truncate(inode) + extra_credits - 3);
    251	if (IS_ERR(handle)) {
    252		ext4_std_error(inode->i_sb, PTR_ERR(handle));
    253		/*
    254		 * If we're going to skip the normal cleanup, we still need to
    255		 * make sure that the in-core orphan linked list is properly
    256		 * cleaned up.
    257		 */
    258		ext4_orphan_del(NULL, inode);
    259		if (freeze_protected)
    260			sb_end_intwrite(inode->i_sb);
    261		goto no_delete;
    262	}
    263
    264	if (IS_SYNC(inode))
    265		ext4_handle_sync(handle);
    266
    267	/*
    268	 * Set inode->i_size to 0 before calling ext4_truncate(). We need
    269	 * special handling of symlinks here because i_size is used to
    270	 * determine whether ext4_inode_info->i_data contains symlink data or
    271	 * block mappings. Setting i_size to 0 will remove its fast symlink
    272	 * status. Erase i_data so that it becomes a valid empty block map.
    273	 */
    274	if (ext4_inode_is_fast_symlink(inode))
    275		memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
    276	inode->i_size = 0;
    277	err = ext4_mark_inode_dirty(handle, inode);
    278	if (err) {
    279		ext4_warning(inode->i_sb,
    280			     "couldn't mark inode dirty (err %d)", err);
    281		goto stop_handle;
    282	}
    283	if (inode->i_blocks) {
    284		err = ext4_truncate(inode);
    285		if (err) {
    286			ext4_error_err(inode->i_sb, -err,
    287				       "couldn't truncate inode %lu (err %d)",
    288				       inode->i_ino, err);
    289			goto stop_handle;
    290		}
    291	}
    292
    293	/* Remove xattr references. */
    294	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
    295				      extra_credits);
    296	if (err) {
    297		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
    298stop_handle:
    299		ext4_journal_stop(handle);
    300		ext4_orphan_del(NULL, inode);
    301		if (freeze_protected)
    302			sb_end_intwrite(inode->i_sb);
    303		ext4_xattr_inode_array_free(ea_inode_array);
    304		goto no_delete;
    305	}
    306
    307	/*
    308	 * Kill off the orphan record which ext4_truncate created.
    309	 * AKPM: I think this can be inside the above `if'.
    310	 * Note that ext4_orphan_del() has to be able to cope with the
    311	 * deletion of a non-existent orphan - this is because we don't
    312	 * know if ext4_truncate() actually created an orphan record.
    313	 * (Well, we could do this if we need to, but heck - it works)
    314	 */
    315	ext4_orphan_del(handle, inode);
    316	EXT4_I(inode)->i_dtime	= (__u32)ktime_get_real_seconds();
    317
    318	/*
    319	 * One subtle ordering requirement: if anything has gone wrong
    320	 * (transaction abort, IO errors, whatever), then we can still
    321	 * do these next steps (the fs will already have been marked as
    322	 * having errors), but we can't free the inode if the mark_dirty
    323	 * fails.
    324	 */
    325	if (ext4_mark_inode_dirty(handle, inode))
    326		/* If that failed, just do the required in-core inode clear. */
    327		ext4_clear_inode(inode);
    328	else
    329		ext4_free_inode(handle, inode);
    330	ext4_journal_stop(handle);
    331	if (freeze_protected)
    332		sb_end_intwrite(inode->i_sb);
    333	ext4_xattr_inode_array_free(ea_inode_array);
    334	return;
    335no_delete:
    336	if (!list_empty(&EXT4_I(inode)->i_fc_list))
    337		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
    338	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
    339}
    340
    341#ifdef CONFIG_QUOTA
    342qsize_t *ext4_get_reserved_space(struct inode *inode)
    343{
    344	return &EXT4_I(inode)->i_reserved_quota;
    345}
    346#endif
    347
    348/*
    349 * Called with i_data_sem down, which is important since we can call
    350 * ext4_discard_preallocations() from here.
    351 */
    352void ext4_da_update_reserve_space(struct inode *inode,
    353					int used, int quota_claim)
    354{
    355	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
    356	struct ext4_inode_info *ei = EXT4_I(inode);
    357
    358	spin_lock(&ei->i_block_reservation_lock);
    359	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
    360	if (unlikely(used > ei->i_reserved_data_blocks)) {
    361		ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
    362			 "with only %d reserved data blocks",
    363			 __func__, inode->i_ino, used,
    364			 ei->i_reserved_data_blocks);
    365		WARN_ON(1);
    366		used = ei->i_reserved_data_blocks;
    367	}
    368
    369	/* Update per-inode reservations */
    370	ei->i_reserved_data_blocks -= used;
    371	percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);
    372
    373	spin_unlock(&ei->i_block_reservation_lock);
    374
    375	/* Update quota subsystem for data blocks */
    376	if (quota_claim)
    377		dquot_claim_block(inode, EXT4_C2B(sbi, used));
    378	else {
    379		/*
    380		 * We did fallocate with an offset that is already delayed
    381		 * allocated. So on delayed allocated writeback we should
    382		 * not re-claim the quota for fallocated blocks.
    383		 */
    384		dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
    385	}
    386
    387	/*
    388	 * If we have done all the pending block allocations and if
    389	 * there aren't any writers on the inode, we can discard the
    390	 * inode's preallocations.
    391	 */
    392	if ((ei->i_reserved_data_blocks == 0) &&
    393	    !inode_is_open_for_write(inode))
    394		ext4_discard_preallocations(inode, 0);
    395}
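
All of the updates above are made in filesystem blocks, so cluster counts are converted with EXT4_C2B(); on a bigalloc file system one cluster spans 2^s_cluster_bits blocks. A small sketch of that conversion (the names and the round-up rule here are illustrative, not the kernel macros):

/* cluster_bits is log2(blocks per cluster). */
unsigned long long clusters_to_blocks(unsigned long long clusters,
                                      unsigned int cluster_bits)
{
    return clusters << cluster_bits;
}

unsigned long long blocks_to_clusters(unsigned long long blocks,
                                      unsigned int cluster_bits)
{
    /* Round up: a partially used cluster still occupies a whole cluster. */
    return (blocks + (1ULL << cluster_bits) - 1) >> cluster_bits;
}
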
    396
    397static int __check_block_validity(struct inode *inode, const char *func,
    398				unsigned int line,
    399				struct ext4_map_blocks *map)
    400{
    401	if (ext4_has_feature_journal(inode->i_sb) &&
    402	    (inode->i_ino ==
    403	     le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
    404		return 0;
    405	if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
    406		ext4_error_inode(inode, func, line, map->m_pblk,
    407				 "lblock %lu mapped to illegal pblock %llu "
    408				 "(length %d)", (unsigned long) map->m_lblk,
    409				 map->m_pblk, map->m_len);
    410		return -EFSCORRUPTED;
    411	}
    412	return 0;
    413}
    414
    415int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
    416		       ext4_lblk_t len)
    417{
    418	int ret;
    419
    420	if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
    421		return fscrypt_zeroout_range(inode, lblk, pblk, len);
    422
    423	ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
    424	if (ret > 0)
    425		ret = 0;
    426
    427	return ret;
    428}
    429
    430#define check_block_validity(inode, map)	\
    431	__check_block_validity((inode), __func__, __LINE__, (map))
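
check_block_validity() exists so that every error report names the calling function and line rather than __check_block_validity() itself. The same idiom outside the kernel looks roughly like this (all names below are invented for illustration):

#include <stdio.h>

int __check_range(const char *func, unsigned int line,
                  long start, long len, long max)
{
    if (start < 0 || len < 0 || start + len > max) {
        fprintf(stderr, "%s:%u: bad range %ld+%ld (max %ld)\n",
                func, line, start, len, max);
        return -1;
    }
    return 0;
}

/* Callers report their own location, as check_block_validity() does. */
#define check_range(start, len, max) \
    __check_range(__func__, __LINE__, (start), (len), (max))
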
    432
    433#ifdef ES_AGGRESSIVE_TEST
    434static void ext4_map_blocks_es_recheck(handle_t *handle,
    435				       struct inode *inode,
    436				       struct ext4_map_blocks *es_map,
    437				       struct ext4_map_blocks *map,
    438				       int flags)
    439{
    440	int retval;
    441
    442	map->m_flags = 0;
    443	/*
     444	 * There is a race window in which the result may not be the same,
     445	 * e.g. xfstests #223 when dioread_nolock is enabled.  The reason
     446	 * is that we look up a block mapping in the extent status tree
     447	 * without taking i_data_sem, so by that time the unwritten
     448	 * extent could have been converted.
    449	 */
    450	down_read(&EXT4_I(inode)->i_data_sem);
    451	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
    452		retval = ext4_ext_map_blocks(handle, inode, map, 0);
    453	} else {
    454		retval = ext4_ind_map_blocks(handle, inode, map, 0);
    455	}
    456	up_read((&EXT4_I(inode)->i_data_sem));
    457
    458	/*
     459	 * We don't check m_len because the extent will be collapsed in the
     460	 * status tree, so the m_len values might not be equal.
    461	 */
    462	if (es_map->m_lblk != map->m_lblk ||
    463	    es_map->m_flags != map->m_flags ||
    464	    es_map->m_pblk != map->m_pblk) {
    465		printk("ES cache assertion failed for inode: %lu "
    466		       "es_cached ex [%d/%d/%llu/%x] != "
    467		       "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
    468		       inode->i_ino, es_map->m_lblk, es_map->m_len,
    469		       es_map->m_pblk, es_map->m_flags, map->m_lblk,
    470		       map->m_len, map->m_pblk, map->m_flags,
    471		       retval, flags);
    472	}
    473}
    474#endif /* ES_AGGRESSIVE_TEST */
    475
    476/*
     477 * The ext4_map_blocks() function tries to look up the requested blocks,
     478 * and returns the current mapping if the blocks are already mapped.
     479 *
     480 * Otherwise it takes the write lock of i_data_sem, allocates blocks,
     481 * stores the allocated blocks in the result buffer head and marks it
     482 * mapped.
     483 *
     484 * If the file is extent based, ext4_ext_map_blocks() is called;
     485 * otherwise ext4_ind_map_blocks() handles indirect-mapping based files.
     486 *
     487 * On success, it returns the number of blocks being mapped or allocated.
     488 * If create == 0 and the blocks are pre-allocated and unwritten, the
     489 * resulting @map is marked as unwritten. If create == 1, it will mark
     490 * @map as mapped.
     491 *
     492 * It returns 0 if a plain lookup failed (blocks have not been allocated);
     493 * in that case @map is returned as unmapped but we still fill in
     494 * map->m_len to indicate the length of a hole starting at map->m_lblk.
     495 *
     496 * It returns a negative error code in case of allocation failure.
    497 */
    498int ext4_map_blocks(handle_t *handle, struct inode *inode,
    499		    struct ext4_map_blocks *map, int flags)
    500{
    501	struct extent_status es;
    502	int retval;
    503	int ret = 0;
    504#ifdef ES_AGGRESSIVE_TEST
    505	struct ext4_map_blocks orig_map;
    506
    507	memcpy(&orig_map, map, sizeof(*map));
    508#endif
    509
    510	map->m_flags = 0;
    511	ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
    512		  flags, map->m_len, (unsigned long) map->m_lblk);
    513
    514	/*
    515	 * ext4_map_blocks returns an int, and m_len is an unsigned int
    516	 */
    517	if (unlikely(map->m_len > INT_MAX))
    518		map->m_len = INT_MAX;
    519
     520	/* We can only handle block numbers less than EXT_MAX_BLOCKS */
    521	if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
    522		return -EFSCORRUPTED;
    523
    524	/* Lookup extent status tree firstly */
    525	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
    526	    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
    527		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
    528			map->m_pblk = ext4_es_pblock(&es) +
    529					map->m_lblk - es.es_lblk;
    530			map->m_flags |= ext4_es_is_written(&es) ?
    531					EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
    532			retval = es.es_len - (map->m_lblk - es.es_lblk);
    533			if (retval > map->m_len)
    534				retval = map->m_len;
    535			map->m_len = retval;
    536		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
    537			map->m_pblk = 0;
    538			retval = es.es_len - (map->m_lblk - es.es_lblk);
    539			if (retval > map->m_len)
    540				retval = map->m_len;
    541			map->m_len = retval;
    542			retval = 0;
    543		} else {
    544			BUG();
    545		}
    546
    547		if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
    548			return retval;
    549#ifdef ES_AGGRESSIVE_TEST
    550		ext4_map_blocks_es_recheck(handle, inode, map,
    551					   &orig_map, flags);
    552#endif
    553		goto found;
    554	}
    555	/*
     556	 * In the no-wait cached-lookup mode, there is nothing more we can
     557	 * do if the extent cannot be found in the cache.
    558	 */
    559	if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
    560		return 0;
    561
    562	/*
    563	 * Try to see if we can get the block without requesting a new
    564	 * file system block.
    565	 */
    566	down_read(&EXT4_I(inode)->i_data_sem);
    567	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
    568		retval = ext4_ext_map_blocks(handle, inode, map, 0);
    569	} else {
    570		retval = ext4_ind_map_blocks(handle, inode, map, 0);
    571	}
    572	if (retval > 0) {
    573		unsigned int status;
    574
    575		if (unlikely(retval != map->m_len)) {
    576			ext4_warning(inode->i_sb,
    577				     "ES len assertion failed for inode "
    578				     "%lu: retval %d != map->m_len %d",
    579				     inode->i_ino, retval, map->m_len);
    580			WARN_ON(1);
    581		}
    582
    583		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
    584				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
    585		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
    586		    !(status & EXTENT_STATUS_WRITTEN) &&
    587		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
    588				       map->m_lblk + map->m_len - 1))
    589			status |= EXTENT_STATUS_DELAYED;
    590		ret = ext4_es_insert_extent(inode, map->m_lblk,
    591					    map->m_len, map->m_pblk, status);
    592		if (ret < 0)
    593			retval = ret;
    594	}
    595	up_read((&EXT4_I(inode)->i_data_sem));
    596
    597found:
    598	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
    599		ret = check_block_validity(inode, map);
    600		if (ret != 0)
    601			return ret;
    602	}
    603
    604	/* If it is only a block(s) look up */
    605	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
    606		return retval;
    607
    608	/*
     609	 * Return if the blocks have already been allocated.
     610	 *
     611	 * Note that if blocks have been preallocated,
     612	 * ext4_ext_get_block() returns with create = 0
     613	 * and the buffer head unmapped.
    614	 */
    615	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
    616		/*
    617		 * If we need to convert extent to unwritten
    618		 * we continue and do the actual work in
    619		 * ext4_ext_map_blocks()
    620		 */
    621		if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
    622			return retval;
    623
    624	/*
     625	 * Here we clear m_flags because after allocating a new extent,
    626	 * it will be set again.
    627	 */
    628	map->m_flags &= ~EXT4_MAP_FLAGS;
    629
    630	/*
     631	 * New block allocation and/or writing to an unwritten extent
    632	 * will possibly result in updating i_data, so we take
    633	 * the write lock of i_data_sem, and call get_block()
    634	 * with create == 1 flag.
    635	 */
    636	down_write(&EXT4_I(inode)->i_data_sem);
    637
    638	/*
    639	 * We need to check for EXT4 here because migrate
    640	 * could have changed the inode type in between
    641	 */
    642	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
    643		retval = ext4_ext_map_blocks(handle, inode, map, flags);
    644	} else {
    645		retval = ext4_ind_map_blocks(handle, inode, map, flags);
    646
    647		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
    648			/*
    649			 * We allocated new blocks which will result in
    650			 * i_data's format changing.  Force the migrate
    651			 * to fail by clearing migrate flags
    652			 */
    653			ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
    654		}
    655
    656		/*
    657		 * Update reserved blocks/metadata blocks after successful
    658		 * block allocation which had been deferred till now. We don't
    659		 * support fallocate for non extent files. So we can update
    660		 * reserve space here.
    661		 */
    662		if ((retval > 0) &&
    663			(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
    664			ext4_da_update_reserve_space(inode, retval, 1);
    665	}
    666
    667	if (retval > 0) {
    668		unsigned int status;
    669
    670		if (unlikely(retval != map->m_len)) {
    671			ext4_warning(inode->i_sb,
    672				     "ES len assertion failed for inode "
    673				     "%lu: retval %d != map->m_len %d",
    674				     inode->i_ino, retval, map->m_len);
    675			WARN_ON(1);
    676		}
    677
    678		/*
    679		 * We have to zeroout blocks before inserting them into extent
    680		 * status tree. Otherwise someone could look them up there and
    681		 * use them before they are really zeroed. We also have to
    682		 * unmap metadata before zeroing as otherwise writeback can
    683		 * overwrite zeros with stale data from block device.
    684		 */
    685		if (flags & EXT4_GET_BLOCKS_ZERO &&
    686		    map->m_flags & EXT4_MAP_MAPPED &&
    687		    map->m_flags & EXT4_MAP_NEW) {
    688			ret = ext4_issue_zeroout(inode, map->m_lblk,
    689						 map->m_pblk, map->m_len);
    690			if (ret) {
    691				retval = ret;
    692				goto out_sem;
    693			}
    694		}
    695
    696		/*
    697		 * If the extent has been zeroed out, we don't need to update
    698		 * extent status tree.
    699		 */
    700		if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
    701		    ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
    702			if (ext4_es_is_written(&es))
    703				goto out_sem;
    704		}
    705		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
    706				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
    707		if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) &&
    708		    !(status & EXTENT_STATUS_WRITTEN) &&
    709		    ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
    710				       map->m_lblk + map->m_len - 1))
    711			status |= EXTENT_STATUS_DELAYED;
    712		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
    713					    map->m_pblk, status);
    714		if (ret < 0) {
    715			retval = ret;
    716			goto out_sem;
    717		}
    718	}
    719
    720out_sem:
    721	up_write((&EXT4_I(inode)->i_data_sem));
    722	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
    723		ret = check_block_validity(inode, map);
    724		if (ret != 0)
    725			return ret;
    726
    727		/*
    728		 * Inodes with freshly allocated blocks where contents will be
    729		 * visible after transaction commit must be on transaction's
    730		 * ordered data list.
    731		 */
    732		if (map->m_flags & EXT4_MAP_NEW &&
    733		    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
    734		    !(flags & EXT4_GET_BLOCKS_ZERO) &&
    735		    !ext4_is_quota_file(inode) &&
    736		    ext4_should_order_data(inode)) {
    737			loff_t start_byte =
    738				(loff_t)map->m_lblk << inode->i_blkbits;
    739			loff_t length = (loff_t)map->m_len << inode->i_blkbits;
    740
    741			if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
    742				ret = ext4_jbd2_inode_add_wait(handle, inode,
    743						start_byte, length);
    744			else
    745				ret = ext4_jbd2_inode_add_write(handle, inode,
    746						start_byte, length);
    747			if (ret)
    748				return ret;
    749		}
    750	}
    751	if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
    752				map->m_flags & EXT4_MAP_MAPPED))
    753		ext4_fc_track_range(handle, inode, map->m_lblk,
    754					map->m_lblk + map->m_len - 1);
    755	if (retval < 0)
    756		ext_debug(inode, "failed with err %d\n", retval);
    757	return retval;
    758}
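
The return convention documented above (a positive count of contiguous blocks, 0 for a hole with map->m_len set to the hole length, or a negative error) lets a caller cover a file in large steps instead of probing block by block. A hedged user-space sketch of such a walk against a toy mapper (struct toy_map and toy_map_blocks() only mimic the contract; they are not the kernel API):

#include <stdio.h>

struct toy_map {
    unsigned long m_lblk;   /* in: first logical block to map */
    unsigned int  m_len;    /* in: max blocks wanted; out: extent/hole length */
    unsigned long m_pblk;   /* out: first physical block when mapped */
};

/* Same contract as described above: >0 mapped, 0 hole, <0 error. */
int toy_map_blocks(struct toy_map *map)
{
    unsigned int len = 8 - (map->m_lblk % 8);   /* fake 8-block runs */

    if (len > map->m_len)
        len = map->m_len;
    map->m_len = len;
    if ((map->m_lblk / 8) % 2)
        return 0;                               /* pretend this run is a hole */
    map->m_pblk = 1000 + map->m_lblk;           /* pretend mapping */
    return (int)len;
}

int main(void)
{
    struct toy_map map = { .m_lblk = 0 };
    unsigned long nr_blocks = 32;

    while (map.m_lblk < nr_blocks) {
        int ret;

        map.m_len = (unsigned int)(nr_blocks - map.m_lblk);
        ret = toy_map_blocks(&map);
        if (ret < 0)
            break;                              /* error */
        if (ret > 0)
            printf("lblk %lu: %d blocks at pblk %lu\n",
                   map.m_lblk, ret, map.m_pblk);
        else
            printf("lblk %lu: hole of %u blocks\n",
                   map.m_lblk, map.m_len);
        map.m_lblk += ret ? (unsigned int)ret : map.m_len;
    }
    return 0;
}
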
    759
    760/*
    761 * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages
    762 * we have to be careful as someone else may be manipulating b_state as well.
    763 */
    764static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
    765{
    766	unsigned long old_state;
    767	unsigned long new_state;
    768
    769	flags &= EXT4_MAP_FLAGS;
    770
    771	/* Dummy buffer_head? Set non-atomically. */
    772	if (!bh->b_page) {
    773		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
    774		return;
    775	}
    776	/*
    777	 * Someone else may be modifying b_state. Be careful! This is ugly but
    778	 * once we get rid of using bh as a container for mapping information
    779	 * to pass to / from get_block functions, this can go away.
    780	 */
    781	do {
    782		old_state = READ_ONCE(bh->b_state);
    783		new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
    784	} while (unlikely(
    785		 cmpxchg(&bh->b_state, old_state, new_state) != old_state));
    786}
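
The loop above retries cmpxchg() because other code may be flipping unrelated bits of b_state concurrently; only the EXT4_MAP_FLAGS bits belong to this updater. A user-space sketch of the same read-modify-write retry using the GCC/Clang __atomic builtins (MAP_FLAGS_MASK is an illustrative constant, not the kernel's EXT4_MAP_FLAGS):

#define MAP_FLAGS_MASK 0x0fUL   /* the bits this updater is allowed to change */

/* Replace only the MAP_FLAGS_MASK bits of *state, leaving the rest intact. */
void update_state_bits(unsigned long *state, unsigned long flags)
{
    unsigned long old_state, new_state;

    flags &= MAP_FLAGS_MASK;
    old_state = __atomic_load_n(state, __ATOMIC_RELAXED);
    do {
        new_state = (old_state & ~MAP_FLAGS_MASK) | flags;
        /* On failure, old_state is reloaded with the current value. */
    } while (!__atomic_compare_exchange_n(state, &old_state, new_state,
                                          0, __ATOMIC_RELAXED,
                                          __ATOMIC_RELAXED));
}
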
    787
    788static int _ext4_get_block(struct inode *inode, sector_t iblock,
    789			   struct buffer_head *bh, int flags)
    790{
    791	struct ext4_map_blocks map;
    792	int ret = 0;
    793
    794	if (ext4_has_inline_data(inode))
    795		return -ERANGE;
    796
    797	map.m_lblk = iblock;
    798	map.m_len = bh->b_size >> inode->i_blkbits;
    799
    800	ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
    801			      flags);
    802	if (ret > 0) {
    803		map_bh(bh, inode->i_sb, map.m_pblk);
    804		ext4_update_bh_state(bh, map.m_flags);
    805		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
    806		ret = 0;
    807	} else if (ret == 0) {
    808		/* hole case, need to fill in bh->b_size */
    809		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
    810	}
    811	return ret;
    812}
    813
    814int ext4_get_block(struct inode *inode, sector_t iblock,
    815		   struct buffer_head *bh, int create)
    816{
    817	return _ext4_get_block(inode, iblock, bh,
    818			       create ? EXT4_GET_BLOCKS_CREATE : 0);
    819}
    820
    821/*
    822 * Get block function used when preparing for buffered write if we require
    823 * creating an unwritten extent if blocks haven't been allocated.  The extent
    824 * will be converted to written after the IO is complete.
    825 */
    826int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
    827			     struct buffer_head *bh_result, int create)
    828{
    829	ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
    830		   inode->i_ino, create);
    831	return _ext4_get_block(inode, iblock, bh_result,
    832			       EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
    833}
    834
    835/* Maximum number of blocks we map for direct IO at once. */
    836#define DIO_MAX_BLOCKS 4096
    837
    838/*
    839 * `handle' can be NULL if create is zero
    840 */
    841struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
    842				ext4_lblk_t block, int map_flags)
    843{
    844	struct ext4_map_blocks map;
    845	struct buffer_head *bh;
    846	int create = map_flags & EXT4_GET_BLOCKS_CREATE;
    847	bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT;
    848	int err;
    849
    850	ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
    851		    || handle != NULL || create == 0);
    852	ASSERT(create == 0 || !nowait);
    853
    854	map.m_lblk = block;
    855	map.m_len = 1;
    856	err = ext4_map_blocks(handle, inode, &map, map_flags);
    857
    858	if (err == 0)
    859		return create ? ERR_PTR(-ENOSPC) : NULL;
    860	if (err < 0)
    861		return ERR_PTR(err);
    862
    863	if (nowait)
    864		return sb_find_get_block(inode->i_sb, map.m_pblk);
    865
    866	bh = sb_getblk(inode->i_sb, map.m_pblk);
    867	if (unlikely(!bh))
    868		return ERR_PTR(-ENOMEM);
    869	if (map.m_flags & EXT4_MAP_NEW) {
    870		ASSERT(create != 0);
    871		ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
    872			    || (handle != NULL));
    873
    874		/*
    875		 * Now that we do not always journal data, we should
    876		 * keep in mind whether this should always journal the
    877		 * new buffer as metadata.  For now, regular file
    878		 * writes use ext4_get_block instead, so it's not a
    879		 * problem.
    880		 */
    881		lock_buffer(bh);
    882		BUFFER_TRACE(bh, "call get_create_access");
    883		err = ext4_journal_get_create_access(handle, inode->i_sb, bh,
    884						     EXT4_JTR_NONE);
    885		if (unlikely(err)) {
    886			unlock_buffer(bh);
    887			goto errout;
    888		}
    889		if (!buffer_uptodate(bh)) {
    890			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
    891			set_buffer_uptodate(bh);
    892		}
    893		unlock_buffer(bh);
    894		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
    895		err = ext4_handle_dirty_metadata(handle, inode, bh);
    896		if (unlikely(err))
    897			goto errout;
    898	} else
    899		BUFFER_TRACE(bh, "not a new buffer");
    900	return bh;
    901errout:
    902	brelse(bh);
    903	return ERR_PTR(err);
    904}
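
ext4_getblk() has a three-way result: an ERR_PTR()-encoded error, NULL when there is nothing to return (a hole with create == 0, or a missed nowait lookup), or a valid buffer head; ext4_bread() below checks exactly these cases. A small user-space rendering of the ERR_PTR idiom and a caller handling all three outcomes (toy_getblk() is a stand-in):

#include <errno.h>
#include <stdio.h>

/* User-space rendering of the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() idiom. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-4095;
}

/* Stand-in: error pointer, NULL for "no block here", or a real buffer. */
void *toy_getblk(int want_create)
{
    return want_create ? ERR_PTR(-ENOSPC) : NULL;
}

int main(void)
{
    void *bh = toy_getblk(0);

    if (IS_ERR(bh))
        fprintf(stderr, "error %ld\n", PTR_ERR(bh));
    else if (!bh)
        puts("hole: nothing to read");
    else
        puts("got a buffer");
    return 0;
}
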
    905
    906struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
    907			       ext4_lblk_t block, int map_flags)
    908{
    909	struct buffer_head *bh;
    910	int ret;
    911
    912	bh = ext4_getblk(handle, inode, block, map_flags);
    913	if (IS_ERR(bh))
    914		return bh;
    915	if (!bh || ext4_buffer_uptodate(bh))
    916		return bh;
    917
    918	ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
    919	if (ret) {
    920		put_bh(bh);
    921		return ERR_PTR(ret);
    922	}
    923	return bh;
    924}
    925
    926/* Read a contiguous batch of blocks. */
    927int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
    928		     bool wait, struct buffer_head **bhs)
    929{
    930	int i, err;
    931
    932	for (i = 0; i < bh_count; i++) {
    933		bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */);
    934		if (IS_ERR(bhs[i])) {
    935			err = PTR_ERR(bhs[i]);
    936			bh_count = i;
    937			goto out_brelse;
    938		}
    939	}
    940
    941	for (i = 0; i < bh_count; i++)
    942		/* Note that NULL bhs[i] is valid because of holes. */
    943		if (bhs[i] && !ext4_buffer_uptodate(bhs[i]))
    944			ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false);
    945
    946	if (!wait)
    947		return 0;
    948
    949	for (i = 0; i < bh_count; i++)
    950		if (bhs[i])
    951			wait_on_buffer(bhs[i]);
    952
    953	for (i = 0; i < bh_count; i++) {
    954		if (bhs[i] && !buffer_uptodate(bhs[i])) {
    955			err = -EIO;
    956			goto out_brelse;
    957		}
    958	}
    959	return 0;
    960
    961out_brelse:
    962	for (i = 0; i < bh_count; i++) {
    963		brelse(bhs[i]);
    964		bhs[i] = NULL;
    965	}
    966	return err;
    967}
    968
    969int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
    970			   struct buffer_head *head,
    971			   unsigned from,
    972			   unsigned to,
    973			   int *partial,
    974			   int (*fn)(handle_t *handle, struct inode *inode,
    975				     struct buffer_head *bh))
    976{
    977	struct buffer_head *bh;
    978	unsigned block_start, block_end;
    979	unsigned blocksize = head->b_size;
    980	int err, ret = 0;
    981	struct buffer_head *next;
    982
    983	for (bh = head, block_start = 0;
    984	     ret == 0 && (bh != head || !block_start);
    985	     block_start = block_end, bh = next) {
    986		next = bh->b_this_page;
    987		block_end = block_start + blocksize;
    988		if (block_end <= from || block_start >= to) {
    989			if (partial && !buffer_uptodate(bh))
    990				*partial = 1;
    991			continue;
    992		}
    993		err = (*fn)(handle, inode, bh);
    994		if (!ret)
    995			ret = err;
    996	}
    997	return ret;
    998}
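
The loop condition "ret == 0 && (bh != head || !block_start)" is what lets the walk start at head and still stop once it wraps around the page's circular buffer list. A generic sketch of the same circular-list traversal trick (struct node is illustrative):

struct node {
    struct node *next;   /* circular: the last node points back to the head */
    int value;
};

/* Visit every node exactly once, starting (and finishing) at head. */
int sum_ring(const struct node *head)
{
    const struct node *n;
    int started = 0, sum = 0;

    /* "n != head || !started" mirrors "bh != head || !block_start" above. */
    for (n = head; n != head || !started; n = n->next, started = 1)
        sum += n->value;
    return sum;
}
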
    999
   1000/*
   1001 * To preserve ordering, it is essential that the hole instantiation and
   1002 * the data write be encapsulated in a single transaction.  We cannot
   1003 * close off a transaction and start a new one between the ext4_get_block()
   1004 * and the commit_write().  So doing the jbd2_journal_start at the start of
   1005 * prepare_write() is the right place.
   1006 *
   1007 * Also, this function can nest inside ext4_writepage().  In that case, we
   1008 * *know* that ext4_writepage() has generated enough buffer credits to do the
   1009 * whole page.  So we won't block on the journal in that case, which is good,
   1010 * because the caller may be PF_MEMALLOC.
   1011 *
   1012 * By accident, ext4 can be reentered when a transaction is open via
   1013 * quota file writes.  If we were to commit the transaction while thus
   1014 * reentered, there can be a deadlock - we would be holding a quota
   1015 * lock, and the commit would never complete if another thread had a
   1016 * transaction open and was blocking on the quota lock - a ranking
   1017 * violation.
   1018 *
   1019 * So what we do is to rely on the fact that jbd2_journal_stop/journal_start
   1020 * will _not_ run commit under these circumstances because handle->h_ref
   1021 * is elevated.  We'll still have enough credits for the tiny quotafile
   1022 * write.
   1023 */
   1024int do_journal_get_write_access(handle_t *handle, struct inode *inode,
   1025				struct buffer_head *bh)
   1026{
   1027	int dirty = buffer_dirty(bh);
   1028	int ret;
   1029
   1030	if (!buffer_mapped(bh) || buffer_freed(bh))
   1031		return 0;
   1032	/*
   1033	 * __block_write_begin() could have dirtied some buffers. Clean
   1034	 * the dirty bit as jbd2_journal_get_write_access() could complain
   1035	 * otherwise about fs integrity issues. Setting of the dirty bit
   1036	 * by __block_write_begin() isn't a real problem here as we clear
   1037	 * the bit before releasing a page lock and thus writeback cannot
   1038	 * ever write the buffer.
   1039	 */
   1040	if (dirty)
   1041		clear_buffer_dirty(bh);
   1042	BUFFER_TRACE(bh, "get write access");
   1043	ret = ext4_journal_get_write_access(handle, inode->i_sb, bh,
   1044					    EXT4_JTR_NONE);
   1045	if (!ret && dirty)
   1046		ret = ext4_handle_dirty_metadata(handle, NULL, bh);
   1047	return ret;
   1048}
   1049
   1050#ifdef CONFIG_FS_ENCRYPTION
   1051static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
   1052				  get_block_t *get_block)
   1053{
   1054	unsigned from = pos & (PAGE_SIZE - 1);
   1055	unsigned to = from + len;
   1056	struct inode *inode = page->mapping->host;
   1057	unsigned block_start, block_end;
   1058	sector_t block;
   1059	int err = 0;
   1060	unsigned blocksize = inode->i_sb->s_blocksize;
   1061	unsigned bbits;
   1062	struct buffer_head *bh, *head, *wait[2];
   1063	int nr_wait = 0;
   1064	int i;
   1065
   1066	BUG_ON(!PageLocked(page));
   1067	BUG_ON(from > PAGE_SIZE);
   1068	BUG_ON(to > PAGE_SIZE);
   1069	BUG_ON(from > to);
   1070
   1071	if (!page_has_buffers(page))
   1072		create_empty_buffers(page, blocksize, 0);
   1073	head = page_buffers(page);
   1074	bbits = ilog2(blocksize);
   1075	block = (sector_t)page->index << (PAGE_SHIFT - bbits);
   1076
   1077	for (bh = head, block_start = 0; bh != head || !block_start;
   1078	    block++, block_start = block_end, bh = bh->b_this_page) {
   1079		block_end = block_start + blocksize;
   1080		if (block_end <= from || block_start >= to) {
   1081			if (PageUptodate(page)) {
   1082				set_buffer_uptodate(bh);
   1083			}
   1084			continue;
   1085		}
   1086		if (buffer_new(bh))
   1087			clear_buffer_new(bh);
   1088		if (!buffer_mapped(bh)) {
   1089			WARN_ON(bh->b_size != blocksize);
   1090			err = get_block(inode, block, bh, 1);
   1091			if (err)
   1092				break;
   1093			if (buffer_new(bh)) {
   1094				if (PageUptodate(page)) {
   1095					clear_buffer_new(bh);
   1096					set_buffer_uptodate(bh);
   1097					mark_buffer_dirty(bh);
   1098					continue;
   1099				}
   1100				if (block_end > to || block_start < from)
   1101					zero_user_segments(page, to, block_end,
   1102							   block_start, from);
   1103				continue;
   1104			}
   1105		}
   1106		if (PageUptodate(page)) {
   1107			set_buffer_uptodate(bh);
   1108			continue;
   1109		}
   1110		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
   1111		    !buffer_unwritten(bh) &&
   1112		    (block_start < from || block_end > to)) {
   1113			ext4_read_bh_lock(bh, 0, false);
   1114			wait[nr_wait++] = bh;
   1115		}
   1116	}
   1117	/*
   1118	 * If we issued read requests, let them complete.
   1119	 */
   1120	for (i = 0; i < nr_wait; i++) {
   1121		wait_on_buffer(wait[i]);
   1122		if (!buffer_uptodate(wait[i]))
   1123			err = -EIO;
   1124	}
   1125	if (unlikely(err)) {
   1126		page_zero_new_buffers(page, from, to);
   1127	} else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
   1128		for (i = 0; i < nr_wait; i++) {
   1129			int err2;
   1130
   1131			err2 = fscrypt_decrypt_pagecache_blocks(page, blocksize,
   1132								bh_offset(wait[i]));
   1133			if (err2) {
   1134				clear_buffer_uptodate(wait[i]);
   1135				err = err2;
   1136			}
   1137		}
   1138	}
   1139
   1140	return err;
   1141}
   1142#endif
   1143
   1144static int ext4_write_begin(struct file *file, struct address_space *mapping,
   1145			    loff_t pos, unsigned len,
   1146			    struct page **pagep, void **fsdata)
   1147{
   1148	struct inode *inode = mapping->host;
   1149	int ret, needed_blocks;
   1150	handle_t *handle;
   1151	int retries = 0;
   1152	struct page *page;
   1153	pgoff_t index;
   1154	unsigned from, to;
   1155
   1156	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
   1157		return -EIO;
   1158
   1159	trace_ext4_write_begin(inode, pos, len);
   1160	/*
   1161	 * Reserve one block more for addition to orphan list in case
   1162	 * we allocate blocks but write fails for some reason
   1163	 */
   1164	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
   1165	index = pos >> PAGE_SHIFT;
   1166	from = pos & (PAGE_SIZE - 1);
   1167	to = from + len;
   1168
   1169	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
   1170		ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
   1171						    pagep);
   1172		if (ret < 0)
   1173			return ret;
   1174		if (ret == 1)
   1175			return 0;
   1176	}
   1177
   1178	/*
   1179	 * grab_cache_page_write_begin() can take a long time if the
   1180	 * system is thrashing due to memory pressure, or if the page
   1181	 * is being written back.  So grab it first before we start
   1182	 * the transaction handle.  This also allows us to allocate
   1183	 * the page (if needed) without using GFP_NOFS.
   1184	 */
   1185retry_grab:
   1186	page = grab_cache_page_write_begin(mapping, index);
   1187	if (!page)
   1188		return -ENOMEM;
   1189	unlock_page(page);
   1190
   1191retry_journal:
   1192	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
   1193	if (IS_ERR(handle)) {
   1194		put_page(page);
   1195		return PTR_ERR(handle);
   1196	}
   1197
   1198	lock_page(page);
   1199	if (page->mapping != mapping) {
   1200		/* The page got truncated from under us */
   1201		unlock_page(page);
   1202		put_page(page);
   1203		ext4_journal_stop(handle);
   1204		goto retry_grab;
   1205	}
   1206	/* In case writeback began while the page was unlocked */
   1207	wait_for_stable_page(page);
   1208
   1209#ifdef CONFIG_FS_ENCRYPTION
   1210	if (ext4_should_dioread_nolock(inode))
   1211		ret = ext4_block_write_begin(page, pos, len,
   1212					     ext4_get_block_unwritten);
   1213	else
   1214		ret = ext4_block_write_begin(page, pos, len,
   1215					     ext4_get_block);
   1216#else
   1217	if (ext4_should_dioread_nolock(inode))
   1218		ret = __block_write_begin(page, pos, len,
   1219					  ext4_get_block_unwritten);
   1220	else
   1221		ret = __block_write_begin(page, pos, len, ext4_get_block);
   1222#endif
   1223	if (!ret && ext4_should_journal_data(inode)) {
   1224		ret = ext4_walk_page_buffers(handle, inode,
   1225					     page_buffers(page), from, to, NULL,
   1226					     do_journal_get_write_access);
   1227	}
   1228
   1229	if (ret) {
   1230		bool extended = (pos + len > inode->i_size) &&
   1231				!ext4_verity_in_progress(inode);
   1232
   1233		unlock_page(page);
   1234		/*
   1235		 * __block_write_begin may have instantiated a few blocks
   1236		 * outside i_size.  Trim these off again. Don't need
   1237		 * i_size_read because we hold i_rwsem.
   1238		 *
   1239		 * Add inode to orphan list in case we crash before
   1240		 * truncate finishes
   1241		 */
   1242		if (extended && ext4_can_truncate(inode))
   1243			ext4_orphan_add(handle, inode);
   1244
   1245		ext4_journal_stop(handle);
   1246		if (extended) {
   1247			ext4_truncate_failed_write(inode);
   1248			/*
   1249			 * If truncate failed early the inode might
   1250			 * still be on the orphan list; we need to
   1251			 * make sure the inode is removed from the
   1252			 * orphan list in that case.
   1253			 */
   1254			if (inode->i_nlink)
   1255				ext4_orphan_del(NULL, inode);
   1256		}
   1257
   1258		if (ret == -ENOSPC &&
   1259		    ext4_should_retry_alloc(inode->i_sb, &retries))
   1260			goto retry_journal;
   1261		put_page(page);
   1262		return ret;
   1263	}
   1264	*pagep = page;
   1265	return ret;
   1266}
   1267
   1268/* For write_end() in data=journal mode */
   1269static int write_end_fn(handle_t *handle, struct inode *inode,
   1270			struct buffer_head *bh)
   1271{
   1272	int ret;
   1273	if (!buffer_mapped(bh) || buffer_freed(bh))
   1274		return 0;
   1275	set_buffer_uptodate(bh);
   1276	ret = ext4_handle_dirty_metadata(handle, NULL, bh);
   1277	clear_buffer_meta(bh);
   1278	clear_buffer_prio(bh);
   1279	return ret;
   1280}
   1281
   1282/*
    1283 * We need to pick up the new inode size which generic_commit_write gave us.
   1284 * `file' can be NULL - eg, when called from page_symlink().
   1285 *
   1286 * ext4 never places buffers on inode->i_mapping->private_list.  metadata
   1287 * buffers are managed internally.
   1288 */
   1289static int ext4_write_end(struct file *file,
   1290			  struct address_space *mapping,
   1291			  loff_t pos, unsigned len, unsigned copied,
   1292			  struct page *page, void *fsdata)
   1293{
   1294	handle_t *handle = ext4_journal_current_handle();
   1295	struct inode *inode = mapping->host;
   1296	loff_t old_size = inode->i_size;
   1297	int ret = 0, ret2;
   1298	int i_size_changed = 0;
   1299	bool verity = ext4_verity_in_progress(inode);
   1300
   1301	trace_ext4_write_end(inode, pos, len, copied);
   1302
   1303	if (ext4_has_inline_data(inode))
   1304		return ext4_write_inline_data_end(inode, pos, len, copied, page);
   1305
   1306	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
   1307	/*
   1308	 * it's important to update i_size while still holding page lock:
   1309	 * page writeout could otherwise come in and zero beyond i_size.
   1310	 *
   1311	 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
   1312	 * blocks are being written past EOF, so skip the i_size update.
   1313	 */
   1314	if (!verity)
   1315		i_size_changed = ext4_update_inode_size(inode, pos + copied);
   1316	unlock_page(page);
   1317	put_page(page);
   1318
   1319	if (old_size < pos && !verity)
   1320		pagecache_isize_extended(inode, old_size, pos);
   1321	/*
   1322	 * Don't mark the inode dirty under page lock. First, it unnecessarily
   1323	 * makes the holding time of page lock longer. Second, it forces lock
   1324	 * ordering of page lock and transaction start for journaling
   1325	 * filesystems.
   1326	 */
   1327	if (i_size_changed)
   1328		ret = ext4_mark_inode_dirty(handle, inode);
   1329
   1330	if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
    1331		/* If we have allocated more blocks than we have copied,
    1332		 * we will have blocks allocated outside
    1333		 * inode->i_size, so truncate them.
    1334		 */
   1335		ext4_orphan_add(handle, inode);
   1336
   1337	ret2 = ext4_journal_stop(handle);
   1338	if (!ret)
   1339		ret = ret2;
   1340
   1341	if (pos + len > inode->i_size && !verity) {
   1342		ext4_truncate_failed_write(inode);
   1343		/*
   1344		 * If truncate failed early the inode might still be
   1345		 * on the orphan list; we need to make sure the inode
   1346		 * is removed from the orphan list in that case.
   1347		 */
   1348		if (inode->i_nlink)
   1349			ext4_orphan_del(NULL, inode);
   1350	}
   1351
   1352	return ret ? ret : copied;
   1353}
   1354
   1355/*
   1356 * This is a private version of page_zero_new_buffers() which doesn't
   1357 * set the buffer to be dirty, since in data=journalled mode we need
   1358 * to call ext4_handle_dirty_metadata() instead.
   1359 */
   1360static void ext4_journalled_zero_new_buffers(handle_t *handle,
   1361					    struct inode *inode,
   1362					    struct page *page,
   1363					    unsigned from, unsigned to)
   1364{
   1365	unsigned int block_start = 0, block_end;
   1366	struct buffer_head *head, *bh;
   1367
   1368	bh = head = page_buffers(page);
   1369	do {
   1370		block_end = block_start + bh->b_size;
   1371		if (buffer_new(bh)) {
   1372			if (block_end > from && block_start < to) {
   1373				if (!PageUptodate(page)) {
   1374					unsigned start, size;
   1375
   1376					start = max(from, block_start);
   1377					size = min(to, block_end) - start;
   1378
   1379					zero_user(page, start, size);
   1380					write_end_fn(handle, inode, bh);
   1381				}
   1382				clear_buffer_new(bh);
   1383			}
   1384		}
   1385		block_start = block_end;
   1386		bh = bh->b_this_page;
   1387	} while (bh != head);
   1388}
   1389
   1390static int ext4_journalled_write_end(struct file *file,
   1391				     struct address_space *mapping,
   1392				     loff_t pos, unsigned len, unsigned copied,
   1393				     struct page *page, void *fsdata)
   1394{
   1395	handle_t *handle = ext4_journal_current_handle();
   1396	struct inode *inode = mapping->host;
   1397	loff_t old_size = inode->i_size;
   1398	int ret = 0, ret2;
   1399	int partial = 0;
   1400	unsigned from, to;
   1401	int size_changed = 0;
   1402	bool verity = ext4_verity_in_progress(inode);
   1403
   1404	trace_ext4_journalled_write_end(inode, pos, len, copied);
   1405	from = pos & (PAGE_SIZE - 1);
   1406	to = from + len;
   1407
   1408	BUG_ON(!ext4_handle_valid(handle));
   1409
   1410	if (ext4_has_inline_data(inode))
   1411		return ext4_write_inline_data_end(inode, pos, len, copied, page);
   1412
   1413	if (unlikely(copied < len) && !PageUptodate(page)) {
   1414		copied = 0;
   1415		ext4_journalled_zero_new_buffers(handle, inode, page, from, to);
   1416	} else {
   1417		if (unlikely(copied < len))
   1418			ext4_journalled_zero_new_buffers(handle, inode, page,
   1419							 from + copied, to);
   1420		ret = ext4_walk_page_buffers(handle, inode, page_buffers(page),
   1421					     from, from + copied, &partial,
   1422					     write_end_fn);
   1423		if (!partial)
   1424			SetPageUptodate(page);
   1425	}
   1426	if (!verity)
   1427		size_changed = ext4_update_inode_size(inode, pos + copied);
   1428	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
   1429	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
   1430	unlock_page(page);
   1431	put_page(page);
   1432
   1433	if (old_size < pos && !verity)
   1434		pagecache_isize_extended(inode, old_size, pos);
   1435
   1436	if (size_changed) {
   1437		ret2 = ext4_mark_inode_dirty(handle, inode);
   1438		if (!ret)
   1439			ret = ret2;
   1440	}
   1441
   1442	if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
    1443		/* If we have allocated more blocks than we have copied,
    1444		 * we will have blocks allocated outside
    1445		 * inode->i_size, so truncate them.
    1446		 */
   1447		ext4_orphan_add(handle, inode);
   1448
   1449	ret2 = ext4_journal_stop(handle);
   1450	if (!ret)
   1451		ret = ret2;
   1452	if (pos + len > inode->i_size && !verity) {
   1453		ext4_truncate_failed_write(inode);
   1454		/*
   1455		 * If truncate failed early the inode might still be
   1456		 * on the orphan list; we need to make sure the inode
   1457		 * is removed from the orphan list in that case.
   1458		 */
   1459		if (inode->i_nlink)
   1460			ext4_orphan_del(NULL, inode);
   1461	}
   1462
   1463	return ret ? ret : copied;
   1464}
   1465
   1466/*
   1467 * Reserve space for a single cluster
   1468 */
   1469static int ext4_da_reserve_space(struct inode *inode)
   1470{
   1471	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   1472	struct ext4_inode_info *ei = EXT4_I(inode);
   1473	int ret;
   1474
   1475	/*
   1476	 * We will charge metadata quota at writeout time; this saves
   1477	 * us from metadata over-estimation, though we may go over by
   1478	 * a small amount in the end.  Here we just reserve for data.
   1479	 */
   1480	ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1));
   1481	if (ret)
   1482		return ret;
   1483
   1484	spin_lock(&ei->i_block_reservation_lock);
   1485	if (ext4_claim_free_clusters(sbi, 1, 0)) {
   1486		spin_unlock(&ei->i_block_reservation_lock);
   1487		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
   1488		return -ENOSPC;
   1489	}
   1490	ei->i_reserved_data_blocks++;
   1491	trace_ext4_da_reserve_space(inode);
   1492	spin_unlock(&ei->i_block_reservation_lock);
   1493
   1494	return 0;       /* success */
   1495}
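
Note the ordering in ext4_da_reserve_space(): the quota reservation is taken first, the free-cluster claim is then made under the spinlock, and the quota reservation is released again if the claim fails. A hedged user-space sketch of the same "reserve A, try B, undo A on failure" shape (the helpers and the mutex are stand-ins for the quota calls and i_block_reservation_lock):

#include <errno.h>
#include <pthread.h>

int  quota_reserve(int nblocks);    /* 0 on success */
void quota_unreserve(int nblocks);
int  claim_free_cluster(void);      /* non-zero when no space is left */

static pthread_mutex_t reservation_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int reserved_data_blocks;

/* Reserve quota first, then claim a cluster; undo the quota on failure. */
int reserve_one_cluster(void)
{
    int ret = quota_reserve(1);

    if (ret)
        return ret;

    pthread_mutex_lock(&reservation_lock);
    if (claim_free_cluster()) {
        pthread_mutex_unlock(&reservation_lock);
        quota_unreserve(1);         /* roll back the first step */
        return -ENOSPC;
    }
    reserved_data_blocks++;
    pthread_mutex_unlock(&reservation_lock);
    return 0;
}
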
   1496
   1497void ext4_da_release_space(struct inode *inode, int to_free)
   1498{
   1499	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   1500	struct ext4_inode_info *ei = EXT4_I(inode);
   1501
   1502	if (!to_free)
   1503		return;		/* Nothing to release, exit */
   1504
   1505	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
   1506
   1507	trace_ext4_da_release_space(inode, to_free);
   1508	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
   1509		/*
   1510		 * if there aren't enough reserved blocks, then the
   1511		 * counter is messed up somewhere.  Since this
   1512		 * function is called from invalidate page, it's
   1513		 * harmless to return without any action.
   1514		 */
   1515		ext4_warning(inode->i_sb, "ext4_da_release_space: "
   1516			 "ino %lu, to_free %d with only %d reserved "
   1517			 "data blocks", inode->i_ino, to_free,
   1518			 ei->i_reserved_data_blocks);
   1519		WARN_ON(1);
   1520		to_free = ei->i_reserved_data_blocks;
   1521	}
   1522	ei->i_reserved_data_blocks -= to_free;
   1523
   1524	/* update fs dirty data blocks counter */
   1525	percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
   1526
   1527	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
   1528
   1529	dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
   1530}
   1531
   1532/*
   1533 * Delayed allocation stuff
   1534 */
   1535
   1536struct mpage_da_data {
   1537	struct inode *inode;
   1538	struct writeback_control *wbc;
   1539
   1540	pgoff_t first_page;	/* The first page to write */
   1541	pgoff_t next_page;	/* Current page to examine */
   1542	pgoff_t last_page;	/* Last page to examine */
   1543	/*
   1544	 * Extent to map - this can be after first_page because that can be
   1545	 * fully mapped. We somewhat abuse m_flags to store whether the extent
   1546	 * is delalloc or unwritten.
   1547	 */
   1548	struct ext4_map_blocks map;
   1549	struct ext4_io_submit io_submit;	/* IO submission data */
   1550	unsigned int do_map:1;
   1551	unsigned int scanned_until_end:1;
   1552};
   1553
   1554static void mpage_release_unused_pages(struct mpage_da_data *mpd,
   1555				       bool invalidate)
   1556{
   1557	int nr_pages, i;
   1558	pgoff_t index, end;
   1559	struct pagevec pvec;
   1560	struct inode *inode = mpd->inode;
   1561	struct address_space *mapping = inode->i_mapping;
   1562
   1563	/* This is necessary when next_page == 0. */
   1564	if (mpd->first_page >= mpd->next_page)
   1565		return;
   1566
   1567	mpd->scanned_until_end = 0;
   1568	index = mpd->first_page;
   1569	end   = mpd->next_page - 1;
   1570	if (invalidate) {
   1571		ext4_lblk_t start, last;
   1572		start = index << (PAGE_SHIFT - inode->i_blkbits);
   1573		last = end << (PAGE_SHIFT - inode->i_blkbits);
   1574		ext4_es_remove_extent(inode, start, last - start + 1);
   1575	}
   1576
   1577	pagevec_init(&pvec);
   1578	while (index <= end) {
   1579		nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
   1580		if (nr_pages == 0)
   1581			break;
   1582		for (i = 0; i < nr_pages; i++) {
   1583			struct page *page = pvec.pages[i];
   1584			struct folio *folio = page_folio(page);
   1585
   1586			BUG_ON(!folio_test_locked(folio));
   1587			BUG_ON(folio_test_writeback(folio));
   1588			if (invalidate) {
   1589				if (folio_mapped(folio))
   1590					folio_clear_dirty_for_io(folio);
   1591				block_invalidate_folio(folio, 0,
   1592						folio_size(folio));
   1593				folio_clear_uptodate(folio);
   1594			}
   1595			folio_unlock(folio);
   1596		}
   1597		pagevec_release(&pvec);
   1598	}
   1599}
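
/*
 * Illustration of the index-to-block conversion used above: a page index is
 * turned into a logical block number by shifting left by
 * (PAGE_SHIFT - inode->i_blkbits).  For example, with 4 KiB pages and a
 * 1 KiB block size the shift is 2, so page index 5 starts at logical
 * block 20.
 */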
   1600
   1601static void ext4_print_free_blocks(struct inode *inode)
   1602{
   1603	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   1604	struct super_block *sb = inode->i_sb;
   1605	struct ext4_inode_info *ei = EXT4_I(inode);
   1606
   1607	ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
   1608	       EXT4_C2B(EXT4_SB(inode->i_sb),
   1609			ext4_count_free_clusters(sb)));
   1610	ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
   1611	ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
   1612	       (long long) EXT4_C2B(EXT4_SB(sb),
   1613		percpu_counter_sum(&sbi->s_freeclusters_counter)));
   1614	ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
   1615	       (long long) EXT4_C2B(EXT4_SB(sb),
   1616		percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
   1617	ext4_msg(sb, KERN_CRIT, "Block reservation details");
   1618	ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
   1619		 ei->i_reserved_data_blocks);
   1620	return;
   1621}
   1622
   1623static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode,
   1624				      struct buffer_head *bh)
   1625{
   1626	return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
   1627}
   1628
   1629/*
   1630 * ext4_insert_delayed_block - adds a delayed block to the extents status
   1631 *                             tree, incrementing the reserved cluster/block
   1632 *                             count or making a pending reservation
   1633 *                             where needed
   1634 *
   1635 * @inode - file containing the newly added block
   1636 * @lblk - logical block to be added
   1637 *
   1638 * Returns 0 on success, negative error code on failure.
   1639 */
   1640static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
   1641{
   1642	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   1643	int ret;
   1644	bool allocated = false;
   1645	bool reserved = false;
   1646
   1647	/*
   1648	 * If the cluster containing lblk is shared with a delayed,
   1649	 * written, or unwritten extent in a bigalloc file system, it's
   1650	 * already been accounted for and does not need to be reserved.
   1651	 * A pending reservation must be made for the cluster if it's
   1652	 * shared with a written or unwritten extent and doesn't already
   1653	 * have one.  Written and unwritten extents can be purged from the
   1654	 * extents status tree if the system is under memory pressure, so
   1655	 * it's necessary to examine the extent tree if a search of the
   1656	 * extents status tree doesn't get a match.
   1657	 */
   1658	if (sbi->s_cluster_ratio == 1) {
   1659		ret = ext4_da_reserve_space(inode);
   1660		if (ret != 0)   /* ENOSPC */
   1661			goto errout;
   1662		reserved = true;
   1663	} else {   /* bigalloc */
   1664		if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
   1665			if (!ext4_es_scan_clu(inode,
   1666					      &ext4_es_is_mapped, lblk)) {
   1667				ret = ext4_clu_mapped(inode,
   1668						      EXT4_B2C(sbi, lblk));
   1669				if (ret < 0)
   1670					goto errout;
   1671				if (ret == 0) {
   1672					ret = ext4_da_reserve_space(inode);
   1673					if (ret != 0)   /* ENOSPC */
   1674						goto errout;
   1675					reserved = true;
   1676				} else {
   1677					allocated = true;
   1678				}
   1679			} else {
   1680				allocated = true;
   1681			}
   1682		}
   1683	}
   1684
   1685	ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
   1686	if (ret && reserved)
   1687		ext4_da_release_space(inode, 1);
   1688
   1689errout:
   1690	return ret;
   1691}
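
/*
 * Summary of the reservation decision above (a sketch derived from the code):
 * with a cluster ratio of 1 every delayed block reserves one cluster.  On a
 * bigalloc filesystem, a cluster that already holds a delayed extent costs
 * nothing extra; a cluster found mapped (in the extents status tree or via
 * ext4_clu_mapped()) is inserted with "allocated" set so that a pending
 * reservation is made instead; only an entirely unaccounted cluster triggers
 * ext4_da_reserve_space().
 */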
   1692
   1693/*
    1694 * This function grabs code from the very beginning of
    1695 * ext4_map_blocks, but assumes that the caller is in the delayed
    1696 * write path. It looks up the requested blocks and sets the
   1697 * buffer delay bit under the protection of i_data_sem.
   1698 */
   1699static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
   1700			      struct ext4_map_blocks *map,
   1701			      struct buffer_head *bh)
   1702{
   1703	struct extent_status es;
   1704	int retval;
   1705	sector_t invalid_block = ~((sector_t) 0xffff);
   1706#ifdef ES_AGGRESSIVE_TEST
   1707	struct ext4_map_blocks orig_map;
   1708
   1709	memcpy(&orig_map, map, sizeof(*map));
   1710#endif
   1711
   1712	if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
   1713		invalid_block = ~0;
   1714
   1715	map->m_flags = 0;
   1716	ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
   1717		  (unsigned long) map->m_lblk);
   1718
   1719	/* Lookup extent status tree firstly */
   1720	if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
   1721		if (ext4_es_is_hole(&es)) {
   1722			retval = 0;
   1723			down_read(&EXT4_I(inode)->i_data_sem);
   1724			goto add_delayed;
   1725		}
   1726
   1727		/*
   1728		 * Delayed extent could be allocated by fallocate.
   1729		 * So we need to check it.
   1730		 */
   1731		if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) {
   1732			map_bh(bh, inode->i_sb, invalid_block);
   1733			set_buffer_new(bh);
   1734			set_buffer_delay(bh);
   1735			return 0;
   1736		}
   1737
   1738		map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk;
   1739		retval = es.es_len - (iblock - es.es_lblk);
   1740		if (retval > map->m_len)
   1741			retval = map->m_len;
   1742		map->m_len = retval;
   1743		if (ext4_es_is_written(&es))
   1744			map->m_flags |= EXT4_MAP_MAPPED;
   1745		else if (ext4_es_is_unwritten(&es))
   1746			map->m_flags |= EXT4_MAP_UNWRITTEN;
   1747		else
   1748			BUG();
   1749
   1750#ifdef ES_AGGRESSIVE_TEST
   1751		ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
   1752#endif
   1753		return retval;
   1754	}
   1755
   1756	/*
   1757	 * Try to see if we can get the block without requesting a new
   1758	 * file system block.
   1759	 */
   1760	down_read(&EXT4_I(inode)->i_data_sem);
   1761	if (ext4_has_inline_data(inode))
   1762		retval = 0;
   1763	else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   1764		retval = ext4_ext_map_blocks(NULL, inode, map, 0);
   1765	else
   1766		retval = ext4_ind_map_blocks(NULL, inode, map, 0);
   1767
   1768add_delayed:
   1769	if (retval == 0) {
   1770		int ret;
   1771
   1772		/*
   1773		 * XXX: __block_prepare_write() unmaps passed block,
   1774		 * is it OK?
   1775		 */
   1776
   1777		ret = ext4_insert_delayed_block(inode, map->m_lblk);
   1778		if (ret != 0) {
   1779			retval = ret;
   1780			goto out_unlock;
   1781		}
   1782
   1783		map_bh(bh, inode->i_sb, invalid_block);
   1784		set_buffer_new(bh);
   1785		set_buffer_delay(bh);
   1786	} else if (retval > 0) {
   1787		int ret;
   1788		unsigned int status;
   1789
   1790		if (unlikely(retval != map->m_len)) {
   1791			ext4_warning(inode->i_sb,
   1792				     "ES len assertion failed for inode "
   1793				     "%lu: retval %d != map->m_len %d",
   1794				     inode->i_ino, retval, map->m_len);
   1795			WARN_ON(1);
   1796		}
   1797
   1798		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
   1799				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
   1800		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
   1801					    map->m_pblk, status);
   1802		if (ret != 0)
   1803			retval = ret;
   1804	}
   1805
   1806out_unlock:
   1807	up_read((&EXT4_I(inode)->i_data_sem));
   1808
   1809	return retval;
   1810}
   1811
   1812/*
   1813 * This is a special get_block_t callback which is used by
    1814 * ext4_da_write_begin().  It will either return a mapped block or
    1815 * reserve space for a single block.
    1816 *
    1817 * For a delayed buffer_head we have BH_Mapped, BH_New and BH_Delay set.
    1818 * We also have b_blocknr = -1 and b_bdev initialized properly.
    1819 *
    1820 * For an unwritten buffer_head we have BH_Mapped, BH_New and BH_Unwritten set.
    1821 * We also have b_blocknr set to the physical block backing the unwritten
    1822 * extent and b_bdev initialized properly.
   1823 */
   1824int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
   1825			   struct buffer_head *bh, int create)
   1826{
   1827	struct ext4_map_blocks map;
   1828	int ret = 0;
   1829
   1830	BUG_ON(create == 0);
   1831	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
   1832
   1833	map.m_lblk = iblock;
   1834	map.m_len = 1;
   1835
   1836	/*
    1837	 * First, we need to know whether the block is already allocated;
    1838	 * preallocated blocks are unmapped but should be treated
    1839	 * the same as allocated blocks.
   1840	 */
   1841	ret = ext4_da_map_blocks(inode, iblock, &map, bh);
   1842	if (ret <= 0)
   1843		return ret;
   1844
   1845	map_bh(bh, inode->i_sb, map.m_pblk);
   1846	ext4_update_bh_state(bh, map.m_flags);
   1847
   1848	if (buffer_unwritten(bh)) {
   1849		/* A delayed write to unwritten bh should be marked
   1850		 * new and mapped.  Mapped ensures that we don't do
   1851		 * get_block multiple times when we write to the same
    1852		 * offset, and new ensures that we do a proper zero-out
    1853		 * for a partial write.
   1854		 */
   1855		set_buffer_new(bh);
   1856		set_buffer_mapped(bh);
   1857	}
   1858	return 0;
   1859}
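
/*
 * Minimal sketch of how this callback is consumed (see ext4_da_write_begin()
 * later in this file):
 *
 *	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
 *
 * The generic helper calls back into ext4_da_get_block_prep() for each
 * unmapped buffer in the range being written.
 */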
   1860
   1861static int __ext4_journalled_writepage(struct page *page,
   1862				       unsigned int len)
   1863{
   1864	struct address_space *mapping = page->mapping;
   1865	struct inode *inode = mapping->host;
   1866	handle_t *handle = NULL;
   1867	int ret = 0, err = 0;
   1868	int inline_data = ext4_has_inline_data(inode);
   1869	struct buffer_head *inode_bh = NULL;
   1870	loff_t size;
   1871
   1872	ClearPageChecked(page);
   1873
   1874	if (inline_data) {
   1875		BUG_ON(page->index != 0);
   1876		BUG_ON(len > ext4_get_max_inline_size(inode));
   1877		inode_bh = ext4_journalled_write_inline_data(inode, len, page);
   1878		if (inode_bh == NULL)
   1879			goto out;
   1880	}
   1881	/*
   1882	 * We need to release the page lock before we start the
   1883	 * journal, so grab a reference so the page won't disappear
   1884	 * out from under us.
   1885	 */
   1886	get_page(page);
   1887	unlock_page(page);
   1888
   1889	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
   1890				    ext4_writepage_trans_blocks(inode));
   1891	if (IS_ERR(handle)) {
   1892		ret = PTR_ERR(handle);
   1893		put_page(page);
   1894		goto out_no_pagelock;
   1895	}
   1896	BUG_ON(!ext4_handle_valid(handle));
   1897
   1898	lock_page(page);
   1899	put_page(page);
   1900	size = i_size_read(inode);
   1901	if (page->mapping != mapping || page_offset(page) > size) {
   1902		/* The page got truncated from under us */
   1903		ext4_journal_stop(handle);
   1904		ret = 0;
   1905		goto out;
   1906	}
   1907
   1908	if (inline_data) {
   1909		ret = ext4_mark_inode_dirty(handle, inode);
   1910	} else {
   1911		struct buffer_head *page_bufs = page_buffers(page);
   1912
   1913		if (page->index == size >> PAGE_SHIFT)
   1914			len = size & ~PAGE_MASK;
   1915		else
   1916			len = PAGE_SIZE;
   1917
   1918		ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
   1919					     NULL, do_journal_get_write_access);
   1920
   1921		err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
   1922					     NULL, write_end_fn);
   1923	}
   1924	if (ret == 0)
   1925		ret = err;
   1926	err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
   1927	if (ret == 0)
   1928		ret = err;
   1929	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
   1930	err = ext4_journal_stop(handle);
   1931	if (!ret)
   1932		ret = err;
   1933
   1934	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
   1935out:
   1936	unlock_page(page);
   1937out_no_pagelock:
   1938	brelse(inode_bh);
   1939	return ret;
   1940}
   1941
   1942/*
   1943 * Note that we don't need to start a transaction unless we're journaling data
    1944 * because we should have holes filled from ext4_page_mkwrite(). We don't even
    1945 * need to add the inode to the transaction's list in ordered mode because if
   1946 * we are writing back data added by write(), the inode is already there and if
   1947 * we are writing back data modified via mmap(), no one guarantees in which
   1948 * transaction the data will hit the disk. In case we are journaling data, we
   1949 * cannot start transaction directly because transaction start ranks above page
   1950 * lock so we have to do some magic.
   1951 *
   1952 * This function can get called via...
   1953 *   - ext4_writepages after taking page lock (have journal handle)
   1954 *   - journal_submit_inode_data_buffers (no journal handle)
   1955 *   - shrink_page_list via the kswapd/direct reclaim (no journal handle)
   1956 *   - grab_page_cache when doing write_begin (have journal handle)
   1957 *
    1958 * We don't do any block allocation in this function. If we have a page with
    1959 * multiple blocks we need to write those buffer_heads that are mapped. This
    1960 * is important for mmap-based writes. So if, with a 1K block size, we do
    1961 * truncate(f, 1024);
    1962 * a = mmap(f, 0, 4096);
    1963 * a[0] = 'a';
    1964 * truncate(f, 4096);
    1965 * we have in the page the first buffer_head mapped via the page_mkwrite
    1966 * callback, while the other buffer_heads are unmapped but dirty (dirtied via
    1967 * do_wp_page). So writepage should write the first block. If we modify
   1968 * the mmap area beyond 1024 we will again get a page_fault and the
   1969 * page_mkwrite callback will do the block allocation and mark the
   1970 * buffer_heads mapped.
   1971 *
    1972 * We redirty the page if it has any buffer_heads that are either delayed or
    1973 * unwritten.
   1974 *
    1975 * We can get recursively called as shown below.
   1976 *
   1977 *	ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
   1978 *		ext4_writepage()
   1979 *
   1980 * But since we don't do any block allocation we should not deadlock.
    1981 * The page also has the dirty flag cleared, so we don't get a recursive page_lock.
   1982 */
   1983static int ext4_writepage(struct page *page,
   1984			  struct writeback_control *wbc)
   1985{
   1986	struct folio *folio = page_folio(page);
   1987	int ret = 0;
   1988	loff_t size;
   1989	unsigned int len;
   1990	struct buffer_head *page_bufs = NULL;
   1991	struct inode *inode = page->mapping->host;
   1992	struct ext4_io_submit io_submit;
   1993	bool keep_towrite = false;
   1994
   1995	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
   1996		folio_invalidate(folio, 0, folio_size(folio));
   1997		folio_unlock(folio);
   1998		return -EIO;
   1999	}
   2000
   2001	trace_ext4_writepage(page);
   2002	size = i_size_read(inode);
   2003	if (page->index == size >> PAGE_SHIFT &&
   2004	    !ext4_verity_in_progress(inode))
   2005		len = size & ~PAGE_MASK;
   2006	else
   2007		len = PAGE_SIZE;
   2008
   2009	/* Should never happen but for bugs in other kernel subsystems */
   2010	if (!page_has_buffers(page)) {
   2011		ext4_warning_inode(inode,
   2012		   "page %lu does not have buffers attached", page->index);
   2013		ClearPageDirty(page);
   2014		unlock_page(page);
   2015		return 0;
   2016	}
   2017
   2018	page_bufs = page_buffers(page);
   2019	/*
   2020	 * We cannot do block allocation or other extent handling in this
   2021	 * function. If there are buffers needing that, we have to redirty
   2022	 * the page. But we may reach here when we do a journal commit via
   2023	 * journal_submit_inode_data_buffers() and in that case we must write
   2024	 * allocated buffers to achieve data=ordered mode guarantees.
   2025	 *
   2026	 * Also, if there is only one buffer per page (the fs block
   2027	 * size == the page size), if one buffer needs block
   2028	 * allocation or needs to modify the extent tree to clear the
   2029	 * unwritten flag, we know that the page can't be written at
   2030	 * all, so we might as well refuse the write immediately.
   2031	 * Unfortunately if the block size != page size, we can't as
   2032	 * easily detect this case using ext4_walk_page_buffers(), but
   2033	 * for the extremely common case, this is an optimization that
   2034	 * skips a useless round trip through ext4_bio_write_page().
   2035	 */
   2036	if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL,
   2037				   ext4_bh_delay_or_unwritten)) {
   2038		redirty_page_for_writepage(wbc, page);
   2039		if ((current->flags & PF_MEMALLOC) ||
   2040		    (inode->i_sb->s_blocksize == PAGE_SIZE)) {
   2041			/*
   2042			 * For memory cleaning there's no point in writing only
   2043			 * some buffers. So just bail out. Warn if we came here
   2044			 * from direct reclaim.
   2045			 */
   2046			WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
   2047							== PF_MEMALLOC);
   2048			unlock_page(page);
   2049			return 0;
   2050		}
   2051		keep_towrite = true;
   2052	}
   2053
   2054	if (PageChecked(page) && ext4_should_journal_data(inode))
   2055		/*
   2056		 * It's mmapped pagecache.  Add buffers and journal it.  There
   2057		 * doesn't seem much point in redirtying the page here.
   2058		 */
   2059		return __ext4_journalled_writepage(page, len);
   2060
   2061	ext4_io_submit_init(&io_submit, wbc);
   2062	io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
   2063	if (!io_submit.io_end) {
   2064		redirty_page_for_writepage(wbc, page);
   2065		unlock_page(page);
   2066		return -ENOMEM;
   2067	}
   2068	ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite);
   2069	ext4_io_submit(&io_submit);
   2070	/* Drop io_end reference we got from init */
   2071	ext4_put_io_end_defer(io_submit.io_end);
   2072	return ret;
   2073}
   2074
   2075static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
   2076{
   2077	int len;
   2078	loff_t size;
   2079	int err;
   2080
   2081	BUG_ON(page->index != mpd->first_page);
   2082	clear_page_dirty_for_io(page);
   2083	/*
   2084	 * We have to be very careful here!  Nothing protects writeback path
   2085	 * against i_size changes and the page can be writeably mapped into
   2086	 * page tables. So an application can be growing i_size and writing
   2087	 * data through mmap while writeback runs. clear_page_dirty_for_io()
   2088	 * write-protects our page in page tables and the page cannot get
   2089	 * written to again until we release page lock. So only after
   2090	 * clear_page_dirty_for_io() we are safe to sample i_size for
   2091	 * ext4_bio_write_page() to zero-out tail of the written page. We rely
   2092	 * on the barrier provided by TestClearPageDirty in
   2093	 * clear_page_dirty_for_io() to make sure i_size is really sampled only
   2094	 * after page tables are updated.
   2095	 */
   2096	size = i_size_read(mpd->inode);
   2097	if (page->index == size >> PAGE_SHIFT &&
   2098	    !ext4_verity_in_progress(mpd->inode))
   2099		len = size & ~PAGE_MASK;
   2100	else
   2101		len = PAGE_SIZE;
   2102	err = ext4_bio_write_page(&mpd->io_submit, page, len, false);
   2103	if (!err)
   2104		mpd->wbc->nr_to_write--;
   2105	mpd->first_page++;
   2106
   2107	return err;
   2108}
   2109
   2110#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))
   2111
   2112/*
   2113 * mballoc gives us at most this number of blocks...
   2114 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
   2115 * The rest of mballoc seems to handle chunks up to full group size.
   2116 */
   2117#define MAX_WRITEPAGES_EXTENT_LEN 2048
   2118
   2119/*
   2120 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
   2121 *
   2122 * @mpd - extent of blocks
   2123 * @lblk - logical number of the block in the file
   2124 * @bh - buffer head we want to add to the extent
   2125 *
    2126 * The function is used to collect contiguous blocks in the same state. If the
   2127 * buffer doesn't require mapping for writeback and we haven't started the
   2128 * extent of buffers to map yet, the function returns 'true' immediately - the
   2129 * caller can write the buffer right away. Otherwise the function returns true
   2130 * if the block has been added to the extent, false if the block couldn't be
   2131 * added.
   2132 */
   2133static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
   2134				   struct buffer_head *bh)
   2135{
   2136	struct ext4_map_blocks *map = &mpd->map;
   2137
   2138	/* Buffer that doesn't need mapping for writeback? */
   2139	if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
   2140	    (!buffer_delay(bh) && !buffer_unwritten(bh))) {
   2141		/* So far no extent to map => we write the buffer right away */
   2142		if (map->m_len == 0)
   2143			return true;
   2144		return false;
   2145	}
   2146
   2147	/* First block in the extent? */
   2148	if (map->m_len == 0) {
   2149		/* We cannot map unless handle is started... */
   2150		if (!mpd->do_map)
   2151			return false;
   2152		map->m_lblk = lblk;
   2153		map->m_len = 1;
   2154		map->m_flags = bh->b_state & BH_FLAGS;
   2155		return true;
   2156	}
   2157
   2158	/* Don't go larger than mballoc is willing to allocate */
   2159	if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
   2160		return false;
   2161
   2162	/* Can we merge the block to our big extent? */
   2163	if (lblk == map->m_lblk + map->m_len &&
   2164	    (bh->b_state & BH_FLAGS) == map->m_flags) {
   2165		map->m_len++;
   2166		return true;
   2167	}
   2168	return false;
   2169}
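
/*
 * Example of the merge rule above: if the accumulated extent is
 * {m_lblk = 100, m_len = 1} and the next dirty delayed buffer sits at
 * lblk 101 with the same BH_Delay/BH_Unwritten state, the extent simply
 * grows to m_len = 2; a buffer at any other lblk, or with different state
 * bits, ends the extent and the function returns false.
 */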
   2170
   2171/*
   2172 * mpage_process_page_bufs - submit page buffers for IO or add them to extent
   2173 *
   2174 * @mpd - extent of blocks for mapping
   2175 * @head - the first buffer in the page
   2176 * @bh - buffer we should start processing from
   2177 * @lblk - logical number of the block in the file corresponding to @bh
   2178 *
   2179 * Walk through page buffers from @bh upto @head (exclusive) and either submit
   2180 * the page for IO if all buffers in this page were mapped and there's no
   2181 * accumulated extent of buffers to map or add buffers in the page to the
   2182 * extent of buffers to map. The function returns 1 if the caller can continue
   2183 * by processing the next page, 0 if it should stop adding buffers to the
    2184 * extent to map because we cannot extend it anymore. It can also return a
    2185 * value < 0 in case of an error during IO submission.
   2186 */
   2187static int mpage_process_page_bufs(struct mpage_da_data *mpd,
   2188				   struct buffer_head *head,
   2189				   struct buffer_head *bh,
   2190				   ext4_lblk_t lblk)
   2191{
   2192	struct inode *inode = mpd->inode;
   2193	int err;
   2194	ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1)
   2195							>> inode->i_blkbits;
   2196
   2197	if (ext4_verity_in_progress(inode))
   2198		blocks = EXT_MAX_BLOCKS;
   2199
   2200	do {
   2201		BUG_ON(buffer_locked(bh));
   2202
   2203		if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
   2204			/* Found extent to map? */
   2205			if (mpd->map.m_len)
   2206				return 0;
   2207			/* Buffer needs mapping and handle is not started? */
   2208			if (!mpd->do_map)
   2209				return 0;
   2210			/* Everything mapped so far and we hit EOF */
   2211			break;
   2212		}
   2213	} while (lblk++, (bh = bh->b_this_page) != head);
   2214	/* So far everything mapped? Submit the page for IO. */
   2215	if (mpd->map.m_len == 0) {
   2216		err = mpage_submit_page(mpd, head->b_page);
   2217		if (err < 0)
   2218			return err;
   2219	}
   2220	if (lblk >= blocks) {
   2221		mpd->scanned_until_end = 1;
   2222		return 0;
   2223	}
   2224	return 1;
   2225}
   2226
   2227/*
   2228 * mpage_process_page - update page buffers corresponding to changed extent and
   2229 *		       may submit fully mapped page for IO
   2230 *
   2231 * @mpd		- description of extent to map, on return next extent to map
   2232 * @m_lblk	- logical block mapping.
   2233 * @m_pblk	- corresponding physical mapping.
   2234 * @map_bh	- determines on return whether this page requires any further
   2235 *		  mapping or not.
   2236 * Scan given page buffers corresponding to changed extent and update buffer
   2237 * state according to new extent state.
   2238 * We map delalloc buffers to their physical location, clear unwritten bits.
   2239 * If the given page is not fully mapped, we update @map to the next extent in
   2240 * the given page that needs mapping & return @map_bh as true.
   2241 */
   2242static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
   2243			      ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
   2244			      bool *map_bh)
   2245{
   2246	struct buffer_head *head, *bh;
   2247	ext4_io_end_t *io_end = mpd->io_submit.io_end;
   2248	ext4_lblk_t lblk = *m_lblk;
   2249	ext4_fsblk_t pblock = *m_pblk;
   2250	int err = 0;
   2251	int blkbits = mpd->inode->i_blkbits;
   2252	ssize_t io_end_size = 0;
   2253	struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
   2254
   2255	bh = head = page_buffers(page);
   2256	do {
   2257		if (lblk < mpd->map.m_lblk)
   2258			continue;
   2259		if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
   2260			/*
   2261			 * Buffer after end of mapped extent.
   2262			 * Find next buffer in the page to map.
   2263			 */
   2264			mpd->map.m_len = 0;
   2265			mpd->map.m_flags = 0;
   2266			io_end_vec->size += io_end_size;
   2267
   2268			err = mpage_process_page_bufs(mpd, head, bh, lblk);
   2269			if (err > 0)
   2270				err = 0;
   2271			if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
   2272				io_end_vec = ext4_alloc_io_end_vec(io_end);
   2273				if (IS_ERR(io_end_vec)) {
   2274					err = PTR_ERR(io_end_vec);
   2275					goto out;
   2276				}
   2277				io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
   2278			}
   2279			*map_bh = true;
   2280			goto out;
   2281		}
   2282		if (buffer_delay(bh)) {
   2283			clear_buffer_delay(bh);
   2284			bh->b_blocknr = pblock++;
   2285		}
   2286		clear_buffer_unwritten(bh);
   2287		io_end_size += (1 << blkbits);
   2288	} while (lblk++, (bh = bh->b_this_page) != head);
   2289
   2290	io_end_vec->size += io_end_size;
   2291	*map_bh = false;
   2292out:
   2293	*m_lblk = lblk;
   2294	*m_pblk = pblock;
   2295	return err;
   2296}
   2297
   2298/*
   2299 * mpage_map_buffers - update buffers corresponding to changed extent and
   2300 *		       submit fully mapped pages for IO
   2301 *
   2302 * @mpd - description of extent to map, on return next extent to map
   2303 *
   2304 * Scan buffers corresponding to changed extent (we expect corresponding pages
   2305 * to be already locked) and update buffer state according to new extent state.
   2306 * We map delalloc buffers to their physical location, clear unwritten bits,
   2307 * and mark buffers as uninit when we perform writes to unwritten extents
   2308 * and do extent conversion after IO is finished. If the last page is not fully
   2309 * mapped, we update @map to the next extent in the last page that needs
   2310 * mapping. Otherwise we submit the page for IO.
   2311 */
   2312static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
   2313{
   2314	struct pagevec pvec;
   2315	int nr_pages, i;
   2316	struct inode *inode = mpd->inode;
   2317	int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
   2318	pgoff_t start, end;
   2319	ext4_lblk_t lblk;
   2320	ext4_fsblk_t pblock;
   2321	int err;
   2322	bool map_bh = false;
   2323
   2324	start = mpd->map.m_lblk >> bpp_bits;
   2325	end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
   2326	lblk = start << bpp_bits;
   2327	pblock = mpd->map.m_pblk;
   2328
   2329	pagevec_init(&pvec);
   2330	while (start <= end) {
   2331		nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
   2332						&start, end);
   2333		if (nr_pages == 0)
   2334			break;
   2335		for (i = 0; i < nr_pages; i++) {
   2336			struct page *page = pvec.pages[i];
   2337
   2338			err = mpage_process_page(mpd, page, &lblk, &pblock,
   2339						 &map_bh);
   2340			/*
    2341			 * If map_bh is true, it means the page may require
    2342			 * further bh mapping, or maybe the page was submitted
    2343			 * for IO. So we return to carry out further extent mapping.
   2344			 */
   2345			if (err < 0 || map_bh)
   2346				goto out;
   2347			/* Page fully mapped - let IO run! */
   2348			err = mpage_submit_page(mpd, page);
   2349			if (err < 0)
   2350				goto out;
   2351		}
   2352		pagevec_release(&pvec);
   2353	}
   2354	/* Extent fully mapped and matches with page boundary. We are done. */
   2355	mpd->map.m_len = 0;
   2356	mpd->map.m_flags = 0;
   2357	return 0;
   2358out:
   2359	pagevec_release(&pvec);
   2360	return err;
   2361}
   2362
   2363static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
   2364{
   2365	struct inode *inode = mpd->inode;
   2366	struct ext4_map_blocks *map = &mpd->map;
   2367	int get_blocks_flags;
   2368	int err, dioread_nolock;
   2369
   2370	trace_ext4_da_write_pages_extent(inode, map);
   2371	/*
   2372	 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
   2373	 * to convert an unwritten extent to be initialized (in the case
   2374	 * where we have written into one or more preallocated blocks).  It is
   2375	 * possible that we're going to need more metadata blocks than
   2376	 * previously reserved. However we must not fail because we're in
   2377	 * writeback and there is nothing we can do about it so it might result
   2378	 * in data loss.  So use reserved blocks to allocate metadata if
   2379	 * possible.
   2380	 *
   2381	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
   2382	 * the blocks in question are delalloc blocks.  This indicates
    2383	 * that the blocks and quotas have already been checked when
   2384	 * the data was copied into the page cache.
   2385	 */
   2386	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
   2387			   EXT4_GET_BLOCKS_METADATA_NOFAIL |
   2388			   EXT4_GET_BLOCKS_IO_SUBMIT;
   2389	dioread_nolock = ext4_should_dioread_nolock(inode);
   2390	if (dioread_nolock)
   2391		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
   2392	if (map->m_flags & BIT(BH_Delay))
   2393		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
   2394
   2395	err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
   2396	if (err < 0)
   2397		return err;
   2398	if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
   2399		if (!mpd->io_submit.io_end->handle &&
   2400		    ext4_handle_valid(handle)) {
   2401			mpd->io_submit.io_end->handle = handle->h_rsv_handle;
   2402			handle->h_rsv_handle = NULL;
   2403		}
   2404		ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
   2405	}
   2406
   2407	BUG_ON(map->m_len == 0);
   2408	return 0;
   2409}
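
/*
 * For reference, the flag combination built above for a delayed-allocation
 * block with dioread_nolock enabled ends up being:
 *
 *	EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL |
 *	EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_GET_BLOCKS_IO_CREATE_EXT |
 *	EXT4_GET_BLOCKS_DELALLOC_RESERVE
 */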
   2410
   2411/*
   2412 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
   2413 *				 mpd->len and submit pages underlying it for IO
   2414 *
   2415 * @handle - handle for journal operations
   2416 * @mpd - extent to map
   2417 * @give_up_on_write - we set this to true iff there is a fatal error and there
   2418 *                     is no hope of writing the data. The caller should discard
   2419 *                     dirty pages to avoid infinite loops.
   2420 *
   2421 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
   2422 * delayed, blocks are allocated, if it is unwritten, we may need to convert
   2423 * them to initialized or split the described range from larger unwritten
   2424 * extent. Note that we need not map all the described range since allocation
    2425 * can return fewer blocks or the range is covered by more unwritten extents. We
   2426 * cannot map more because we are limited by reserved transaction credits. On
   2427 * the other hand we always make sure that the last touched page is fully
   2428 * mapped so that it can be written out (and thus forward progress is
   2429 * guaranteed). After mapping we submit all mapped pages for IO.
   2430 */
   2431static int mpage_map_and_submit_extent(handle_t *handle,
   2432				       struct mpage_da_data *mpd,
   2433				       bool *give_up_on_write)
   2434{
   2435	struct inode *inode = mpd->inode;
   2436	struct ext4_map_blocks *map = &mpd->map;
   2437	int err;
   2438	loff_t disksize;
   2439	int progress = 0;
   2440	ext4_io_end_t *io_end = mpd->io_submit.io_end;
   2441	struct ext4_io_end_vec *io_end_vec;
   2442
   2443	io_end_vec = ext4_alloc_io_end_vec(io_end);
   2444	if (IS_ERR(io_end_vec))
   2445		return PTR_ERR(io_end_vec);
   2446	io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
   2447	do {
   2448		err = mpage_map_one_extent(handle, mpd);
   2449		if (err < 0) {
   2450			struct super_block *sb = inode->i_sb;
   2451
   2452			if (ext4_forced_shutdown(EXT4_SB(sb)) ||
   2453			    ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
   2454				goto invalidate_dirty_pages;
   2455			/*
    2456			 * Let the upper layers retry transient errors.
    2457			 * In the case of ENOSPC, if ext4_count_free_clusters()
   2458			 * is non-zero, a commit should free up blocks.
   2459			 */
   2460			if ((err == -ENOMEM) ||
   2461			    (err == -ENOSPC && ext4_count_free_clusters(sb))) {
   2462				if (progress)
   2463					goto update_disksize;
   2464				return err;
   2465			}
   2466			ext4_msg(sb, KERN_CRIT,
   2467				 "Delayed block allocation failed for "
   2468				 "inode %lu at logical offset %llu with"
   2469				 " max blocks %u with error %d",
   2470				 inode->i_ino,
   2471				 (unsigned long long)map->m_lblk,
   2472				 (unsigned)map->m_len, -err);
   2473			ext4_msg(sb, KERN_CRIT,
   2474				 "This should not happen!! Data will "
   2475				 "be lost\n");
   2476			if (err == -ENOSPC)
   2477				ext4_print_free_blocks(inode);
   2478		invalidate_dirty_pages:
   2479			*give_up_on_write = true;
   2480			return err;
   2481		}
   2482		progress = 1;
   2483		/*
   2484		 * Update buffer state, submit mapped pages, and get us new
   2485		 * extent to map
   2486		 */
   2487		err = mpage_map_and_submit_buffers(mpd);
   2488		if (err < 0)
   2489			goto update_disksize;
   2490	} while (map->m_len);
   2491
   2492update_disksize:
   2493	/*
   2494	 * Update on-disk size after IO is submitted.  Races with
   2495	 * truncate are avoided by checking i_size under i_data_sem.
   2496	 */
   2497	disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
   2498	if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
   2499		int err2;
   2500		loff_t i_size;
   2501
   2502		down_write(&EXT4_I(inode)->i_data_sem);
   2503		i_size = i_size_read(inode);
   2504		if (disksize > i_size)
   2505			disksize = i_size;
   2506		if (disksize > EXT4_I(inode)->i_disksize)
   2507			EXT4_I(inode)->i_disksize = disksize;
   2508		up_write(&EXT4_I(inode)->i_data_sem);
   2509		err2 = ext4_mark_inode_dirty(handle, inode);
   2510		if (err2) {
   2511			ext4_error_err(inode->i_sb, -err2,
   2512				       "Failed to mark inode %lu dirty",
   2513				       inode->i_ino);
   2514		}
   2515		if (!err)
   2516			err = err2;
   2517	}
   2518	return err;
   2519}
   2520
   2521/*
   2522 * Calculate the total number of credits to reserve for one writepages
   2523 * iteration. This is called from ext4_writepages(). We map an extent of
   2524 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
   2525 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
   2526 * bpp - 1 blocks in bpp different extents.
   2527 */
   2528static int ext4_da_writepages_trans_blocks(struct inode *inode)
   2529{
   2530	int bpp = ext4_journal_blocks_per_page(inode);
   2531
   2532	return ext4_meta_trans_blocks(inode,
   2533				MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
   2534}
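
/*
 * Worked example of the credit calculation above: with 4 KiB blocks on
 * 4 KiB pages, bpp is 1, so the transaction is sized for mapping up to
 * 2048 blocks in a single extent; with 1 KiB blocks, bpp is 4 and the
 * worst case becomes 2051 blocks spread over 4 extents.
 */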
   2535
   2536/*
   2537 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
   2538 * 				 and underlying extent to map
   2539 *
   2540 * @mpd - where to look for pages
   2541 *
   2542 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
   2543 * IO immediately. When we find a page which isn't mapped we start accumulating
   2544 * extent of buffers underlying these pages that needs mapping (formed by
   2545 * either delayed or unwritten buffers). We also lock the pages containing
   2546 * these buffers. The extent found is returned in @mpd structure (starting at
   2547 * mpd->lblk with length mpd->len blocks).
   2548 *
   2549 * Note that this function can attach bios to one io_end structure which are
   2550 * neither logically nor physically contiguous. Although it may seem as an
   2551 * unnecessary complication, it is actually inevitable in blocksize < pagesize
   2552 * case as we need to track IO to all buffers underlying a page in one io_end.
   2553 */
   2554static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
   2555{
   2556	struct address_space *mapping = mpd->inode->i_mapping;
   2557	struct pagevec pvec;
   2558	unsigned int nr_pages;
   2559	long left = mpd->wbc->nr_to_write;
   2560	pgoff_t index = mpd->first_page;
   2561	pgoff_t end = mpd->last_page;
   2562	xa_mark_t tag;
   2563	int i, err = 0;
   2564	int blkbits = mpd->inode->i_blkbits;
   2565	ext4_lblk_t lblk;
   2566	struct buffer_head *head;
   2567
   2568	if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
   2569		tag = PAGECACHE_TAG_TOWRITE;
   2570	else
   2571		tag = PAGECACHE_TAG_DIRTY;
   2572
   2573	pagevec_init(&pvec);
   2574	mpd->map.m_len = 0;
   2575	mpd->next_page = index;
   2576	while (index <= end) {
   2577		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
   2578				tag);
   2579		if (nr_pages == 0)
   2580			break;
   2581
   2582		for (i = 0; i < nr_pages; i++) {
   2583			struct page *page = pvec.pages[i];
   2584
   2585			/*
   2586			 * Accumulated enough dirty pages? This doesn't apply
   2587			 * to WB_SYNC_ALL mode. For integrity sync we have to
   2588			 * keep going because someone may be concurrently
   2589			 * dirtying pages, and we might have synced a lot of
   2590			 * newly appeared dirty pages, but have not synced all
   2591			 * of the old dirty pages.
   2592			 */
   2593			if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
   2594				goto out;
   2595
   2596			/* If we can't merge this page, we are done. */
   2597			if (mpd->map.m_len > 0 && mpd->next_page != page->index)
   2598				goto out;
   2599
   2600			lock_page(page);
   2601			/*
   2602			 * If the page is no longer dirty, or its mapping no
   2603			 * longer corresponds to inode we are writing (which
   2604			 * means it has been truncated or invalidated), or the
   2605			 * page is already under writeback and we are not doing
   2606			 * a data integrity writeback, skip the page
   2607			 */
   2608			if (!PageDirty(page) ||
   2609			    (PageWriteback(page) &&
   2610			     (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
   2611			    unlikely(page->mapping != mapping)) {
   2612				unlock_page(page);
   2613				continue;
   2614			}
   2615
   2616			wait_on_page_writeback(page);
   2617			BUG_ON(PageWriteback(page));
   2618
   2619			/*
   2620			 * Should never happen but for buggy code in
   2621			 * other subsystems that call
   2622			 * set_page_dirty() without properly warning
   2623			 * the file system first.  See [1] for more
   2624			 * information.
   2625			 *
   2626			 * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
   2627			 */
   2628			if (!page_has_buffers(page)) {
   2629				ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
   2630				ClearPageDirty(page);
   2631				unlock_page(page);
   2632				continue;
   2633			}
   2634
   2635			if (mpd->map.m_len == 0)
   2636				mpd->first_page = page->index;
   2637			mpd->next_page = page->index + 1;
   2638			/* Add all dirty buffers to mpd */
   2639			lblk = ((ext4_lblk_t)page->index) <<
   2640				(PAGE_SHIFT - blkbits);
   2641			head = page_buffers(page);
   2642			err = mpage_process_page_bufs(mpd, head, head, lblk);
   2643			if (err <= 0)
   2644				goto out;
   2645			err = 0;
   2646			left--;
   2647		}
   2648		pagevec_release(&pvec);
   2649		cond_resched();
   2650	}
   2651	mpd->scanned_until_end = 1;
   2652	return 0;
   2653out:
   2654	pagevec_release(&pvec);
   2655	return err;
   2656}
   2657
   2658static int ext4_writepages(struct address_space *mapping,
   2659			   struct writeback_control *wbc)
   2660{
   2661	pgoff_t	writeback_index = 0;
   2662	long nr_to_write = wbc->nr_to_write;
   2663	int range_whole = 0;
   2664	int cycled = 1;
   2665	handle_t *handle = NULL;
   2666	struct mpage_da_data mpd;
   2667	struct inode *inode = mapping->host;
   2668	int needed_blocks, rsv_blocks = 0, ret = 0;
   2669	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
   2670	struct blk_plug plug;
   2671	bool give_up_on_write = false;
   2672
   2673	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
   2674		return -EIO;
   2675
   2676	percpu_down_read(&sbi->s_writepages_rwsem);
   2677	trace_ext4_writepages(inode, wbc);
   2678
   2679	/*
   2680	 * No pages to write? This is mainly a kludge to avoid starting
    2681	 * a transaction for special inodes like the journal inode on last iput(),
    2682	 * because that could violate lock ordering on umount.
   2683	 */
   2684	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
   2685		goto out_writepages;
   2686
   2687	if (ext4_should_journal_data(inode)) {
   2688		ret = generic_writepages(mapping, wbc);
   2689		goto out_writepages;
   2690	}
   2691
   2692	/*
   2693	 * If the filesystem has aborted, it is read-only, so return
   2694	 * right away instead of dumping stack traces later on that
   2695	 * will obscure the real source of the problem.  We test
   2696	 * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because
   2697	 * the latter could be true if the filesystem is mounted
   2698	 * read-only, and in that case, ext4_writepages should
   2699	 * *never* be called, so if that ever happens, we would want
   2700	 * the stack trace.
   2701	 */
   2702	if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
   2703		     ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
   2704		ret = -EROFS;
   2705		goto out_writepages;
   2706	}
   2707
   2708	/*
   2709	 * If we have inline data and arrive here, it means that
   2710	 * we will soon create the block for the 1st page, so
   2711	 * we'd better clear the inline data here.
   2712	 */
   2713	if (ext4_has_inline_data(inode)) {
   2714		/* Just inode will be modified... */
   2715		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
   2716		if (IS_ERR(handle)) {
   2717			ret = PTR_ERR(handle);
   2718			goto out_writepages;
   2719		}
   2720		BUG_ON(ext4_test_inode_state(inode,
   2721				EXT4_STATE_MAY_INLINE_DATA));
   2722		ext4_destroy_inline_data(handle, inode);
   2723		ext4_journal_stop(handle);
   2724	}
   2725
   2726	if (ext4_should_dioread_nolock(inode)) {
   2727		/*
   2728		 * We may need to convert up to one extent per block in
   2729		 * the page and we may dirty the inode.
   2730		 */
   2731		rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
   2732						PAGE_SIZE >> inode->i_blkbits);
   2733	}
   2734
   2735	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
   2736		range_whole = 1;
   2737
   2738	if (wbc->range_cyclic) {
   2739		writeback_index = mapping->writeback_index;
   2740		if (writeback_index)
   2741			cycled = 0;
   2742		mpd.first_page = writeback_index;
   2743		mpd.last_page = -1;
   2744	} else {
   2745		mpd.first_page = wbc->range_start >> PAGE_SHIFT;
   2746		mpd.last_page = wbc->range_end >> PAGE_SHIFT;
   2747	}
   2748
   2749	mpd.inode = inode;
   2750	mpd.wbc = wbc;
   2751	ext4_io_submit_init(&mpd.io_submit, wbc);
   2752retry:
   2753	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
   2754		tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
   2755	blk_start_plug(&plug);
   2756
   2757	/*
   2758	 * First writeback pages that don't need mapping - we can avoid
   2759	 * starting a transaction unnecessarily and also avoid being blocked
    2760	 * in the block layer on device congestion while having a
    2761	 * transaction started.
   2762	 */
   2763	mpd.do_map = 0;
   2764	mpd.scanned_until_end = 0;
   2765	mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
   2766	if (!mpd.io_submit.io_end) {
   2767		ret = -ENOMEM;
   2768		goto unplug;
   2769	}
   2770	ret = mpage_prepare_extent_to_map(&mpd);
   2771	/* Unlock pages we didn't use */
   2772	mpage_release_unused_pages(&mpd, false);
   2773	/* Submit prepared bio */
   2774	ext4_io_submit(&mpd.io_submit);
   2775	ext4_put_io_end_defer(mpd.io_submit.io_end);
   2776	mpd.io_submit.io_end = NULL;
   2777	if (ret < 0)
   2778		goto unplug;
   2779
   2780	while (!mpd.scanned_until_end && wbc->nr_to_write > 0) {
   2781		/* For each extent of pages we use new io_end */
   2782		mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
   2783		if (!mpd.io_submit.io_end) {
   2784			ret = -ENOMEM;
   2785			break;
   2786		}
   2787
   2788		/*
   2789		 * We have two constraints: We find one extent to map and we
    2790		 * must always write out the whole page (makes a difference when
   2791		 * blocksize < pagesize) so that we don't block on IO when we
   2792		 * try to write out the rest of the page. Journalled mode is
   2793		 * not supported by delalloc.
   2794		 */
   2795		BUG_ON(ext4_should_journal_data(inode));
   2796		needed_blocks = ext4_da_writepages_trans_blocks(inode);
   2797
   2798		/* start a new transaction */
   2799		handle = ext4_journal_start_with_reserve(inode,
   2800				EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
   2801		if (IS_ERR(handle)) {
   2802			ret = PTR_ERR(handle);
   2803			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
   2804			       "%ld pages, ino %lu; err %d", __func__,
   2805				wbc->nr_to_write, inode->i_ino, ret);
   2806			/* Release allocated io_end */
   2807			ext4_put_io_end(mpd.io_submit.io_end);
   2808			mpd.io_submit.io_end = NULL;
   2809			break;
   2810		}
   2811		mpd.do_map = 1;
   2812
   2813		trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
   2814		ret = mpage_prepare_extent_to_map(&mpd);
   2815		if (!ret && mpd.map.m_len)
   2816			ret = mpage_map_and_submit_extent(handle, &mpd,
   2817					&give_up_on_write);
   2818		/*
   2819		 * Caution: If the handle is synchronous,
   2820		 * ext4_journal_stop() can wait for transaction commit
   2821		 * to finish which may depend on writeback of pages to
   2822		 * complete or on page lock to be released.  In that
   2823		 * case, we have to wait until after we have
   2824		 * submitted all the IO, released page locks we hold,
   2825		 * and dropped io_end reference (for extent conversion
   2826		 * to be able to complete) before stopping the handle.
   2827		 */
   2828		if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
   2829			ext4_journal_stop(handle);
   2830			handle = NULL;
   2831			mpd.do_map = 0;
   2832		}
   2833		/* Unlock pages we didn't use */
   2834		mpage_release_unused_pages(&mpd, give_up_on_write);
   2835		/* Submit prepared bio */
   2836		ext4_io_submit(&mpd.io_submit);
   2837
   2838		/*
   2839		 * Drop our io_end reference we got from init. We have
   2840		 * to be careful and use deferred io_end finishing if
   2841		 * we are still holding the transaction as we can
   2842		 * release the last reference to io_end which may end
   2843		 * up doing unwritten extent conversion.
   2844		 */
   2845		if (handle) {
   2846			ext4_put_io_end_defer(mpd.io_submit.io_end);
   2847			ext4_journal_stop(handle);
   2848		} else
   2849			ext4_put_io_end(mpd.io_submit.io_end);
   2850		mpd.io_submit.io_end = NULL;
   2851
   2852		if (ret == -ENOSPC && sbi->s_journal) {
   2853			/*
   2854			 * Commit the transaction which would
   2855			 * free blocks released in the transaction
   2856			 * and try again
   2857			 */
   2858			jbd2_journal_force_commit_nested(sbi->s_journal);
   2859			ret = 0;
   2860			continue;
   2861		}
   2862		/* Fatal error - ENOMEM, EIO... */
   2863		if (ret)
   2864			break;
   2865	}
   2866unplug:
   2867	blk_finish_plug(&plug);
   2868	if (!ret && !cycled && wbc->nr_to_write > 0) {
   2869		cycled = 1;
   2870		mpd.last_page = writeback_index - 1;
   2871		mpd.first_page = 0;
   2872		goto retry;
   2873	}
   2874
   2875	/* Update index */
   2876	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
   2877		/*
   2878		 * Set the writeback_index so that range_cyclic
   2879		 * mode will write it back later
   2880		 */
   2881		mapping->writeback_index = mpd.first_page;
   2882
   2883out_writepages:
   2884	trace_ext4_writepages_result(inode, wbc, ret,
   2885				     nr_to_write - wbc->nr_to_write);
   2886	percpu_up_read(&sbi->s_writepages_rwsem);
   2887	return ret;
   2888}
   2889
   2890static int ext4_dax_writepages(struct address_space *mapping,
   2891			       struct writeback_control *wbc)
   2892{
   2893	int ret;
   2894	long nr_to_write = wbc->nr_to_write;
   2895	struct inode *inode = mapping->host;
   2896	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
   2897
   2898	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
   2899		return -EIO;
   2900
   2901	percpu_down_read(&sbi->s_writepages_rwsem);
   2902	trace_ext4_writepages(inode, wbc);
   2903
   2904	ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
   2905	trace_ext4_writepages_result(inode, wbc, ret,
   2906				     nr_to_write - wbc->nr_to_write);
   2907	percpu_up_read(&sbi->s_writepages_rwsem);
   2908	return ret;
   2909}
   2910
   2911static int ext4_nonda_switch(struct super_block *sb)
   2912{
   2913	s64 free_clusters, dirty_clusters;
   2914	struct ext4_sb_info *sbi = EXT4_SB(sb);
   2915
   2916	/*
    2917	 * Switch to non-delalloc mode if we are running low
    2918	 * on free blocks. The free block accounting via percpu
    2919	 * counters can get slightly wrong with percpu_counter_batch getting
    2920	 * accumulated on each CPU without updating global counters.
    2921	 * Delalloc needs accurate free block accounting, so switch
    2922	 * to non-delalloc when we are near the error range.
   2923	 */
   2924	free_clusters =
   2925		percpu_counter_read_positive(&sbi->s_freeclusters_counter);
   2926	dirty_clusters =
   2927		percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
   2928	/*
   2929	 * Start pushing delalloc when 1/2 of free blocks are dirty.
   2930	 */
   2931	if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
   2932		try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
   2933
   2934	if (2 * free_clusters < 3 * dirty_clusters ||
   2935	    free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
   2936		/*
    2937		 * free block count is less than 150% of dirty blocks,
    2938		 * or free blocks are less than the watermark
   2939		 */
   2940		return 1;
   2941	}
   2942	return 0;
   2943}
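
/*
 * Worked example of the thresholds above (example values only): with
 * free_clusters = 3000 and dirty_clusters = 1600, the first test
 * (3000 < 2 * 1600) kicks background writeback; the switch itself happens
 * once 2 * free < 3 * dirty, e.g. free_clusters = 2900 against
 * dirty_clusters = 2000 (5800 < 6000), or when free clusters drop below
 * dirty_clusters + EXT4_FREECLUSTERS_WATERMARK.
 */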
   2944
   2945static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
   2946			       loff_t pos, unsigned len,
   2947			       struct page **pagep, void **fsdata)
   2948{
   2949	int ret, retries = 0;
   2950	struct page *page;
   2951	pgoff_t index;
   2952	struct inode *inode = mapping->host;
   2953
   2954	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
   2955		return -EIO;
   2956
   2957	index = pos >> PAGE_SHIFT;
   2958
   2959	if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
   2960		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
   2961		return ext4_write_begin(file, mapping, pos,
   2962					len, pagep, fsdata);
   2963	}
   2964	*fsdata = (void *)0;
   2965	trace_ext4_da_write_begin(inode, pos, len);
   2966
   2967	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
   2968		ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
   2969						      pagep, fsdata);
   2970		if (ret < 0)
   2971			return ret;
   2972		if (ret == 1)
   2973			return 0;
   2974	}
   2975
   2976retry:
   2977	page = grab_cache_page_write_begin(mapping, index);
   2978	if (!page)
   2979		return -ENOMEM;
   2980
   2981	/* In case writeback began while the page was unlocked */
   2982	wait_for_stable_page(page);
   2983
   2984#ifdef CONFIG_FS_ENCRYPTION
   2985	ret = ext4_block_write_begin(page, pos, len,
   2986				     ext4_da_get_block_prep);
   2987#else
   2988	ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
   2989#endif
   2990	if (ret < 0) {
   2991		unlock_page(page);
   2992		put_page(page);
   2993		/*
   2994		 * block_write_begin may have instantiated a few blocks
   2995		 * outside i_size.  Trim these off again. Don't need
   2996		 * i_size_read because we hold inode lock.
   2997		 */
   2998		if (pos + len > inode->i_size)
   2999			ext4_truncate_failed_write(inode);
   3000
   3001		if (ret == -ENOSPC &&
   3002		    ext4_should_retry_alloc(inode->i_sb, &retries))
   3003			goto retry;
   3004		return ret;
   3005	}
   3006
   3007	*pagep = page;
   3008	return ret;
   3009}
   3010
   3011/*
    3012 * Check if we should update i_disksize
    3013 * when writing to the end of the file without requiring block allocation
   3014 */
   3015static int ext4_da_should_update_i_disksize(struct page *page,
   3016					    unsigned long offset)
   3017{
   3018	struct buffer_head *bh;
   3019	struct inode *inode = page->mapping->host;
   3020	unsigned int idx;
   3021	int i;
   3022
   3023	bh = page_buffers(page);
   3024	idx = offset >> inode->i_blkbits;
   3025
   3026	for (i = 0; i < idx; i++)
   3027		bh = bh->b_this_page;
   3028
   3029	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
   3030		return 0;
   3031	return 1;
   3032}
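
/*
 * Illustration of the buffer lookup above: idx = offset >> inode->i_blkbits
 * selects the buffer_head covering byte "offset" within the page.  With a
 * 1 KiB block size, an offset of 2500 gives idx = 2, i.e. the third buffer
 * in the page, and i_disksize is only updated when that buffer is mapped
 * and neither delayed nor unwritten.
 */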
   3033
   3034static int ext4_da_write_end(struct file *file,
   3035			     struct address_space *mapping,
   3036			     loff_t pos, unsigned len, unsigned copied,
   3037			     struct page *page, void *fsdata)
   3038{
   3039	struct inode *inode = mapping->host;
   3040	loff_t new_i_size;
   3041	unsigned long start, end;
   3042	int write_mode = (int)(unsigned long)fsdata;
   3043
   3044	if (write_mode == FALL_BACK_TO_NONDELALLOC)
   3045		return ext4_write_end(file, mapping, pos,
   3046				      len, copied, page, fsdata);
   3047
   3048	trace_ext4_da_write_end(inode, pos, len, copied);
   3049
   3050	if (write_mode != CONVERT_INLINE_DATA &&
   3051	    ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
   3052	    ext4_has_inline_data(inode))
   3053		return ext4_write_inline_data_end(inode, pos, len, copied, page);
   3054
   3055	start = pos & (PAGE_SIZE - 1);
   3056	end = start + copied - 1;
   3057
   3058	/*
   3059	 * Since we are holding inode lock, we are sure i_disksize <=
   3060	 * i_size. We also know that if i_disksize < i_size, there are
   3061	 * delalloc writes pending in the range upto i_size. If the end of
   3062	 * the current write is <= i_size, there's no need to touch
   3063	 * i_disksize since writeback will push i_disksize upto i_size
   3064	 * eventually. If the end of the current write is > i_size and
   3065	 * inside an allocated block (ext4_da_should_update_i_disksize()
   3066	 * check), we need to update i_disksize here as neither
    3067 * ext4_writepage() nor the ext4_writepages() paths that do not
    3068 * allocate blocks update i_disksize.
   3069	 *
   3070	 * Note that we defer inode dirtying to generic_write_end() /
   3071	 * ext4_da_write_inline_data_end().
   3072	 */
   3073	new_i_size = pos + copied;
   3074	if (copied && new_i_size > inode->i_size &&
   3075	    ext4_da_should_update_i_disksize(page, end))
   3076		ext4_update_i_disksize(inode, new_i_size);
   3077
   3078	return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
   3079}
   3080
   3081/*
   3082 * Force all delayed allocation blocks to be allocated for a given inode.
   3083 */
   3084int ext4_alloc_da_blocks(struct inode *inode)
   3085{
   3086	trace_ext4_alloc_da_blocks(inode);
   3087
   3088	if (!EXT4_I(inode)->i_reserved_data_blocks)
   3089		return 0;
   3090
   3091	/*
   3092	 * We do something simple for now.  The filemap_flush() will
   3093	 * also start triggering a write of the data blocks, which is
   3094	 * not strictly speaking necessary (and for users of
   3095	 * laptop_mode, not even desirable).  However, to do otherwise
   3096	 * would require replicating code paths in:
   3097	 *
   3098	 * ext4_writepages() ->
   3099	 *    write_cache_pages() ---> (via passed in callback function)
   3100	 *        __mpage_da_writepage() -->
   3101	 *           mpage_add_bh_to_extent()
   3102	 *           mpage_da_map_blocks()
   3103	 *
   3104	 * The problem is that write_cache_pages(), located in
   3105	 * mm/page-writeback.c, marks pages clean in preparation for
   3106	 * doing I/O, which is not desirable if we're not planning on
   3107	 * doing I/O at all.
   3108	 *
   3109	 * We could call write_cache_pages(), and then redirty all of
   3110	 * the pages by calling redirty_page_for_writepage() but that
   3111	 * would be ugly in the extreme.  So instead we would need to
   3112	 * replicate parts of the code in the above functions,
   3113	 * simplifying them because we wouldn't actually intend to
   3114	 * write out the pages, but rather only collect contiguous
   3115	 * logical block extents, call the multi-block allocator, and
   3116	 * then update the buffer heads with the block allocations.
   3117	 *
   3118	 * For now, though, we'll cheat by calling filemap_flush(),
   3119	 * which will map the blocks, and start the I/O, but not
   3120	 * actually wait for the I/O to complete.
   3121	 */
   3122	return filemap_flush(inode->i_mapping);
   3123}
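/*
 * Editorial sketch (not upstream code): a caller that wants the delalloc
 * blocks mapped out, e.g. before replacing a file via rename, can do
 * roughly the following once the inode has been marked for
 * allocate-on-close (EXT4_STATE_DA_ALLOC_CLOSE):
 *
 *	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE))
 *		ext4_alloc_da_blocks(inode);
 */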
   3124
   3125/*
   3126 * bmap() is special.  It gets used by applications such as lilo and by
   3127 * the swapper to find the on-disk block of a specific piece of data.
   3128 *
   3129 * Naturally, this is dangerous if the block concerned is still in the
   3130 * journal.  If somebody makes a swapfile on an ext4 data-journaling
   3131 * filesystem and enables swap, then they may get a nasty shock when the
   3132 * data getting swapped to that swapfile suddenly gets overwritten by
    3133 * the original zeros written out previously to the journal and
   3134 * awaiting writeback in the kernel's buffer cache.
   3135 *
   3136 * So, if we see any bmap calls here on a modified, data-journaled file,
   3137 * take extra steps to flush any blocks which might be in the cache.
   3138 */
   3139static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
   3140{
   3141	struct inode *inode = mapping->host;
   3142	journal_t *journal;
   3143	int err;
   3144
   3145	/*
   3146	 * We can get here for an inline file via the FIBMAP ioctl
   3147	 */
   3148	if (ext4_has_inline_data(inode))
   3149		return 0;
   3150
   3151	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
   3152			test_opt(inode->i_sb, DELALLOC)) {
   3153		/*
   3154		 * With delalloc we want to sync the file
   3155		 * so that we can make sure we allocate
    3156		 * blocks for the file
   3157		 */
   3158		filemap_write_and_wait(mapping);
   3159	}
   3160
   3161	if (EXT4_JOURNAL(inode) &&
   3162	    ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
   3163		/*
   3164		 * This is a REALLY heavyweight approach, but the use of
   3165		 * bmap on dirty files is expected to be extremely rare:
   3166		 * only if we run lilo or swapon on a freshly made file
   3167		 * do we expect this to happen.
   3168		 *
   3169		 * (bmap requires CAP_SYS_RAWIO so this does not
   3170		 * represent an unprivileged user DOS attack --- we'd be
   3171		 * in trouble if mortal users could trigger this path at
   3172		 * will.)
   3173		 *
   3174		 * NB. EXT4_STATE_JDATA is not set on files other than
   3175		 * regular files.  If somebody wants to bmap a directory
   3176		 * or symlink and gets confused because the buffer
   3177		 * hasn't yet been flushed to disk, they deserve
   3178		 * everything they get.
   3179		 */
   3180
   3181		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
   3182		journal = EXT4_JOURNAL(inode);
   3183		jbd2_journal_lock_updates(journal);
   3184		err = jbd2_journal_flush(journal, 0);
   3185		jbd2_journal_unlock_updates(journal);
   3186
   3187		if (err)
   3188			return 0;
   3189	}
   3190
   3191	return iomap_bmap(mapping, block, &ext4_iomap_ops);
   3192}
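/*
 * Editorial sketch (userspace, not upstream code): ext4_bmap() above is
 * what ultimately services the FIBMAP ioctl. A privileged caller
 * (CAP_SYS_RAWIO) can ask for the physical block backing logical block 0
 * of an open file roughly like so:
 *
 *	int blk = 0;	/* logical block in, physical block out */
 *	if (ioctl(fd, FIBMAP, &blk) == 0)
 *		printf("logical 0 -> physical %d\n", blk);
 */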
   3193
   3194static int ext4_read_folio(struct file *file, struct folio *folio)
   3195{
   3196	struct page *page = &folio->page;
   3197	int ret = -EAGAIN;
   3198	struct inode *inode = page->mapping->host;
   3199
   3200	trace_ext4_readpage(page);
   3201
   3202	if (ext4_has_inline_data(inode))
   3203		ret = ext4_readpage_inline(inode, page);
   3204
   3205	if (ret == -EAGAIN)
   3206		return ext4_mpage_readpages(inode, NULL, page);
   3207
   3208	return ret;
   3209}
   3210
   3211static void ext4_readahead(struct readahead_control *rac)
   3212{
   3213	struct inode *inode = rac->mapping->host;
   3214
   3215	/* If the file has inline data, no need to do readahead. */
   3216	if (ext4_has_inline_data(inode))
   3217		return;
   3218
   3219	ext4_mpage_readpages(inode, rac, NULL);
   3220}
   3221
   3222static void ext4_invalidate_folio(struct folio *folio, size_t offset,
   3223				size_t length)
   3224{
   3225	trace_ext4_invalidate_folio(folio, offset, length);
   3226
   3227	/* No journalling happens on data buffers when this function is used */
   3228	WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio)));
   3229
   3230	block_invalidate_folio(folio, offset, length);
   3231}
   3232
   3233static int __ext4_journalled_invalidate_folio(struct folio *folio,
   3234					    size_t offset, size_t length)
   3235{
   3236	journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
   3237
   3238	trace_ext4_journalled_invalidate_folio(folio, offset, length);
   3239
   3240	/*
   3241	 * If it's a full truncate we just forget about the pending dirtying
   3242	 */
   3243	if (offset == 0 && length == folio_size(folio))
   3244		folio_clear_checked(folio);
   3245
   3246	return jbd2_journal_invalidate_folio(journal, folio, offset, length);
   3247}
   3248
   3249/* Wrapper for aops... */
   3250static void ext4_journalled_invalidate_folio(struct folio *folio,
   3251					   size_t offset,
   3252					   size_t length)
   3253{
   3254	WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0);
   3255}
   3256
   3257static bool ext4_release_folio(struct folio *folio, gfp_t wait)
   3258{
   3259	journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
   3260
   3261	trace_ext4_releasepage(&folio->page);
   3262
   3263	/* Page has dirty journalled data -> cannot release */
   3264	if (folio_test_checked(folio))
   3265		return false;
   3266	if (journal)
   3267		return jbd2_journal_try_to_free_buffers(journal, folio);
   3268	else
   3269		return try_to_free_buffers(folio);
   3270}
   3271
   3272static bool ext4_inode_datasync_dirty(struct inode *inode)
   3273{
   3274	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
   3275
   3276	if (journal) {
   3277		if (jbd2_transaction_committed(journal,
   3278			EXT4_I(inode)->i_datasync_tid))
   3279			return false;
   3280		if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
   3281			return !list_empty(&EXT4_I(inode)->i_fc_list);
   3282		return true;
   3283	}
   3284
   3285	/* Any metadata buffers to write? */
   3286	if (!list_empty(&inode->i_mapping->private_list))
   3287		return true;
   3288	return inode->i_state & I_DIRTY_DATASYNC;
   3289}
   3290
   3291static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
   3292			   struct ext4_map_blocks *map, loff_t offset,
   3293			   loff_t length, unsigned int flags)
   3294{
   3295	u8 blkbits = inode->i_blkbits;
   3296
   3297	/*
   3298	 * Writes that span EOF might trigger an I/O size update on completion,
   3299	 * so consider them to be dirty for the purpose of O_DSYNC, even if
    3300	 * there are no other metadata changes being made or pending.
   3301	 */
   3302	iomap->flags = 0;
   3303	if (ext4_inode_datasync_dirty(inode) ||
   3304	    offset + length > i_size_read(inode))
   3305		iomap->flags |= IOMAP_F_DIRTY;
   3306
   3307	if (map->m_flags & EXT4_MAP_NEW)
   3308		iomap->flags |= IOMAP_F_NEW;
   3309
   3310	if (flags & IOMAP_DAX)
   3311		iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
   3312	else
   3313		iomap->bdev = inode->i_sb->s_bdev;
   3314	iomap->offset = (u64) map->m_lblk << blkbits;
   3315	iomap->length = (u64) map->m_len << blkbits;
   3316
   3317	if ((map->m_flags & EXT4_MAP_MAPPED) &&
   3318	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   3319		iomap->flags |= IOMAP_F_MERGED;
   3320
   3321	/*
   3322	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
   3323	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
   3324	 * set. In order for any allocated unwritten extents to be converted
   3325	 * into written extents correctly within the ->end_io() handler, we
    3326	 * need to ensure that the iomap->type is set appropriately. Hence
    3327	 * we check whether the EXT4_MAP_UNWRITTEN bit has
    3328	 * been set first.
   3329	 */
   3330	if (map->m_flags & EXT4_MAP_UNWRITTEN) {
   3331		iomap->type = IOMAP_UNWRITTEN;
   3332		iomap->addr = (u64) map->m_pblk << blkbits;
   3333		if (flags & IOMAP_DAX)
   3334			iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
   3335	} else if (map->m_flags & EXT4_MAP_MAPPED) {
   3336		iomap->type = IOMAP_MAPPED;
   3337		iomap->addr = (u64) map->m_pblk << blkbits;
   3338		if (flags & IOMAP_DAX)
   3339			iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
   3340	} else {
   3341		iomap->type = IOMAP_HOLE;
   3342		iomap->addr = IOMAP_NULL_ADDR;
   3343	}
   3344}
   3345
   3346static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
   3347			    unsigned int flags)
   3348{
   3349	handle_t *handle;
   3350	u8 blkbits = inode->i_blkbits;
   3351	int ret, dio_credits, m_flags = 0, retries = 0;
   3352
   3353	/*
   3354	 * Trim the mapping request to the maximum value that we can map at
   3355	 * once for direct I/O.
   3356	 */
   3357	if (map->m_len > DIO_MAX_BLOCKS)
   3358		map->m_len = DIO_MAX_BLOCKS;
   3359	dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
   3360
   3361retry:
   3362	/*
   3363	 * Either we allocate blocks and then don't get an unwritten extent, so
   3364	 * in that case we have reserved enough credits. Or, the blocks are
   3365	 * already allocated and unwritten. In that case, the extent conversion
   3366	 * fits into the credits as well.
   3367	 */
   3368	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
   3369	if (IS_ERR(handle))
   3370		return PTR_ERR(handle);
   3371
   3372	/*
   3373	 * DAX and direct I/O are the only two operations that are currently
   3374	 * supported with IOMAP_WRITE.
   3375	 */
   3376	WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT)));
   3377	if (flags & IOMAP_DAX)
   3378		m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
   3379	/*
   3380	 * We use i_size instead of i_disksize here because delalloc writeback
   3381	 * can complete at any point during the I/O and subsequently push the
   3382	 * i_disksize out to i_size. This could be beyond where direct I/O is
   3383	 * happening and thus expose allocated blocks to direct I/O reads.
   3384	 */
   3385	else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode))
   3386		m_flags = EXT4_GET_BLOCKS_CREATE;
   3387	else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   3388		m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
   3389
   3390	ret = ext4_map_blocks(handle, inode, map, m_flags);
   3391
   3392	/*
   3393	 * We cannot fill holes in indirect tree based inodes as that could
   3394	 * expose stale data in the case of a crash. Use the magic error code
   3395	 * to fallback to buffered I/O.
    3396	 * to fall back to buffered I/O.
   3397	if (!m_flags && !ret)
   3398		ret = -ENOTBLK;
   3399
   3400	ext4_journal_stop(handle);
   3401	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
   3402		goto retry;
   3403
   3404	return ret;
   3405}
   3406
   3407
   3408static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
   3409		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
   3410{
   3411	int ret;
   3412	struct ext4_map_blocks map;
   3413	u8 blkbits = inode->i_blkbits;
   3414
   3415	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
   3416		return -EINVAL;
   3417
   3418	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
   3419		return -ERANGE;
   3420
   3421	/*
   3422	 * Calculate the first and last logical blocks respectively.
   3423	 */
   3424	map.m_lblk = offset >> blkbits;
   3425	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
   3426			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
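	/*
	 * Editorial worked example (not upstream text), assuming 4KiB
	 * blocks (blkbits == 12): offset = 5000, length = 10000 gives
	 * m_lblk = 5000 >> 12 = 1 and m_len = (14999 >> 12) - 1 + 1 = 3,
	 * i.e. the request covers logical blocks 1..3 (bytes 4096..16383).
	 */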
   3427
   3428	if (flags & IOMAP_WRITE) {
   3429		/*
   3430		 * We check here if the blocks are already allocated, then we
   3431		 * don't need to start a journal txn and we can directly return
   3432		 * the mapping information. This could boost performance
   3433		 * especially in multi-threaded overwrite requests.
   3434		 */
   3435		if (offset + length <= i_size_read(inode)) {
   3436			ret = ext4_map_blocks(NULL, inode, &map, 0);
   3437			if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
   3438				goto out;
   3439		}
   3440		ret = ext4_iomap_alloc(inode, &map, flags);
   3441	} else {
   3442		ret = ext4_map_blocks(NULL, inode, &map, 0);
   3443	}
   3444
   3445	if (ret < 0)
   3446		return ret;
   3447out:
   3448	/*
   3449	 * When inline encryption is enabled, sometimes I/O to an encrypted file
   3450	 * has to be broken up to guarantee DUN contiguity.  Handle this by
   3451	 * limiting the length of the mapping returned.
   3452	 */
   3453	map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
   3454
   3455	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
   3456
   3457	return 0;
   3458}
   3459
   3460static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
   3461		loff_t length, unsigned flags, struct iomap *iomap,
   3462		struct iomap *srcmap)
   3463{
   3464	int ret;
   3465
   3466	/*
   3467	 * Even for writes we don't need to allocate blocks, so just pretend
	 3468	 * we are reading to save the overhead of starting a transaction.
   3469	 */
   3470	flags &= ~IOMAP_WRITE;
   3471	ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap);
   3472	WARN_ON_ONCE(iomap->type != IOMAP_MAPPED);
   3473	return ret;
   3474}
   3475
   3476static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
   3477			  ssize_t written, unsigned flags, struct iomap *iomap)
   3478{
   3479	/*
   3480	 * Check to see whether an error occurred while writing out the data to
   3481	 * the allocated blocks. If so, return the magic error code so that we
    3482	 * fall back to buffered I/O and attempt to complete the remainder of
   3483	 * the I/O. Any blocks that may have been allocated in preparation for
   3484	 * the direct I/O will be reused during buffered I/O.
   3485	 */
   3486	if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
   3487		return -ENOTBLK;
   3488
   3489	return 0;
   3490}
   3491
   3492const struct iomap_ops ext4_iomap_ops = {
   3493	.iomap_begin		= ext4_iomap_begin,
   3494	.iomap_end		= ext4_iomap_end,
   3495};
   3496
   3497const struct iomap_ops ext4_iomap_overwrite_ops = {
   3498	.iomap_begin		= ext4_iomap_overwrite_begin,
   3499	.iomap_end		= ext4_iomap_end,
   3500};
   3501
   3502static bool ext4_iomap_is_delalloc(struct inode *inode,
   3503				   struct ext4_map_blocks *map)
   3504{
   3505	struct extent_status es;
   3506	ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
   3507
   3508	ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
   3509				  map->m_lblk, end, &es);
   3510
   3511	if (!es.es_len || es.es_lblk > end)
   3512		return false;
   3513
   3514	if (es.es_lblk > map->m_lblk) {
   3515		map->m_len = es.es_lblk - map->m_lblk;
   3516		return false;
   3517	}
   3518
   3519	offset = map->m_lblk - es.es_lblk;
   3520	map->m_len = es.es_len - offset;
   3521
   3522	return true;
   3523}
   3524
   3525static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
   3526				   loff_t length, unsigned int flags,
   3527				   struct iomap *iomap, struct iomap *srcmap)
   3528{
   3529	int ret;
   3530	bool delalloc = false;
   3531	struct ext4_map_blocks map;
   3532	u8 blkbits = inode->i_blkbits;
   3533
   3534	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
   3535		return -EINVAL;
   3536
   3537	if (ext4_has_inline_data(inode)) {
   3538		ret = ext4_inline_data_iomap(inode, iomap);
   3539		if (ret != -EAGAIN) {
   3540			if (ret == 0 && offset >= iomap->length)
   3541				ret = -ENOENT;
   3542			return ret;
   3543		}
   3544	}
   3545
   3546	/*
   3547	 * Calculate the first and last logical block respectively.
   3548	 */
   3549	map.m_lblk = offset >> blkbits;
   3550	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
   3551			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
   3552
   3553	/*
    3554	 * Fiemap callers may call for an offset beyond s_bitmap_maxbytes.
    3555	 * Handle it here instead of querying ext4_map_blocks(), since
    3556	 * ext4_map_blocks() would warn about it and return an -EIO
    3557	 * error.
   3558	 */
   3559	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
   3560		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   3561
   3562		if (offset >= sbi->s_bitmap_maxbytes) {
   3563			map.m_flags = 0;
   3564			goto set_iomap;
   3565		}
   3566	}
   3567
   3568	ret = ext4_map_blocks(NULL, inode, &map, 0);
   3569	if (ret < 0)
   3570		return ret;
   3571	if (ret == 0)
   3572		delalloc = ext4_iomap_is_delalloc(inode, &map);
   3573
   3574set_iomap:
   3575	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
   3576	if (delalloc && iomap->type == IOMAP_HOLE)
   3577		iomap->type = IOMAP_DELALLOC;
   3578
   3579	return 0;
   3580}
   3581
   3582const struct iomap_ops ext4_iomap_report_ops = {
   3583	.iomap_begin = ext4_iomap_begin_report,
   3584};
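/*
 * Editorial note (not upstream text): the report ops above carry no
 * iomap_end hook because they never allocate anything; they serve
 * read-only consumers. A hedged sketch of such a consumer, modelled on
 * how fiemap uses them:
 *
 *	ret = iomap_fiemap(inode, fieinfo, start, len,
 *			   &ext4_iomap_report_ops);
 */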
   3585
   3586/*
   3587 * Whenever the folio is being dirtied, corresponding buffers should already
   3588 * be attached to the transaction (we take care of this in ext4_page_mkwrite()
   3589 * and ext4_write_begin()). However we cannot move buffers to dirty transaction
   3590 * lists here because ->dirty_folio is called under VFS locks and the folio
   3591 * is not necessarily locked.
   3592 *
   3593 * We cannot just dirty the folio and leave attached buffers clean, because the
   3594 * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
   3595 * or jbddirty because all the journalling code will explode.
   3596 *
   3597 * So what we do is to mark the folio "pending dirty" and next time writepage
   3598 * is called, propagate that into the buffers appropriately.
   3599 */
   3600static bool ext4_journalled_dirty_folio(struct address_space *mapping,
   3601		struct folio *folio)
   3602{
   3603	WARN_ON_ONCE(!folio_buffers(folio));
   3604	folio_set_checked(folio);
   3605	return filemap_dirty_folio(mapping, folio);
   3606}
   3607
   3608static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
   3609{
   3610	WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio));
   3611	WARN_ON_ONCE(!folio_buffers(folio));
   3612	return block_dirty_folio(mapping, folio);
   3613}
   3614
   3615static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
   3616				    struct file *file, sector_t *span)
   3617{
   3618	return iomap_swapfile_activate(sis, file, span,
   3619				       &ext4_iomap_report_ops);
   3620}
   3621
   3622static const struct address_space_operations ext4_aops = {
   3623	.read_folio		= ext4_read_folio,
   3624	.readahead		= ext4_readahead,
   3625	.writepage		= ext4_writepage,
   3626	.writepages		= ext4_writepages,
   3627	.write_begin		= ext4_write_begin,
   3628	.write_end		= ext4_write_end,
   3629	.dirty_folio		= ext4_dirty_folio,
   3630	.bmap			= ext4_bmap,
   3631	.invalidate_folio	= ext4_invalidate_folio,
   3632	.release_folio		= ext4_release_folio,
   3633	.direct_IO		= noop_direct_IO,
   3634	.migratepage		= buffer_migrate_page,
   3635	.is_partially_uptodate  = block_is_partially_uptodate,
   3636	.error_remove_page	= generic_error_remove_page,
   3637	.swap_activate		= ext4_iomap_swap_activate,
   3638};
   3639
   3640static const struct address_space_operations ext4_journalled_aops = {
   3641	.read_folio		= ext4_read_folio,
   3642	.readahead		= ext4_readahead,
   3643	.writepage		= ext4_writepage,
   3644	.writepages		= ext4_writepages,
   3645	.write_begin		= ext4_write_begin,
   3646	.write_end		= ext4_journalled_write_end,
   3647	.dirty_folio		= ext4_journalled_dirty_folio,
   3648	.bmap			= ext4_bmap,
   3649	.invalidate_folio	= ext4_journalled_invalidate_folio,
   3650	.release_folio		= ext4_release_folio,
   3651	.direct_IO		= noop_direct_IO,
   3652	.is_partially_uptodate  = block_is_partially_uptodate,
   3653	.error_remove_page	= generic_error_remove_page,
   3654	.swap_activate		= ext4_iomap_swap_activate,
   3655};
   3656
   3657static const struct address_space_operations ext4_da_aops = {
   3658	.read_folio		= ext4_read_folio,
   3659	.readahead		= ext4_readahead,
   3660	.writepage		= ext4_writepage,
   3661	.writepages		= ext4_writepages,
   3662	.write_begin		= ext4_da_write_begin,
   3663	.write_end		= ext4_da_write_end,
   3664	.dirty_folio		= ext4_dirty_folio,
   3665	.bmap			= ext4_bmap,
   3666	.invalidate_folio	= ext4_invalidate_folio,
   3667	.release_folio		= ext4_release_folio,
   3668	.direct_IO		= noop_direct_IO,
   3669	.migratepage		= buffer_migrate_page,
   3670	.is_partially_uptodate  = block_is_partially_uptodate,
   3671	.error_remove_page	= generic_error_remove_page,
   3672	.swap_activate		= ext4_iomap_swap_activate,
   3673};
   3674
   3675static const struct address_space_operations ext4_dax_aops = {
   3676	.writepages		= ext4_dax_writepages,
   3677	.direct_IO		= noop_direct_IO,
   3678	.dirty_folio		= noop_dirty_folio,
   3679	.bmap			= ext4_bmap,
   3680	.swap_activate		= ext4_iomap_swap_activate,
   3681};
   3682
   3683void ext4_set_aops(struct inode *inode)
   3684{
   3685	switch (ext4_inode_journal_mode(inode)) {
   3686	case EXT4_INODE_ORDERED_DATA_MODE:
   3687	case EXT4_INODE_WRITEBACK_DATA_MODE:
   3688		break;
   3689	case EXT4_INODE_JOURNAL_DATA_MODE:
   3690		inode->i_mapping->a_ops = &ext4_journalled_aops;
   3691		return;
   3692	default:
   3693		BUG();
   3694	}
   3695	if (IS_DAX(inode))
   3696		inode->i_mapping->a_ops = &ext4_dax_aops;
   3697	else if (test_opt(inode->i_sb, DELALLOC))
   3698		inode->i_mapping->a_ops = &ext4_da_aops;
   3699	else
   3700		inode->i_mapping->a_ops = &ext4_aops;
   3701}
   3702
   3703static int __ext4_block_zero_page_range(handle_t *handle,
   3704		struct address_space *mapping, loff_t from, loff_t length)
   3705{
   3706	ext4_fsblk_t index = from >> PAGE_SHIFT;
   3707	unsigned offset = from & (PAGE_SIZE-1);
   3708	unsigned blocksize, pos;
   3709	ext4_lblk_t iblock;
   3710	struct inode *inode = mapping->host;
   3711	struct buffer_head *bh;
   3712	struct page *page;
   3713	int err = 0;
   3714
   3715	page = find_or_create_page(mapping, from >> PAGE_SHIFT,
   3716				   mapping_gfp_constraint(mapping, ~__GFP_FS));
   3717	if (!page)
   3718		return -ENOMEM;
   3719
   3720	blocksize = inode->i_sb->s_blocksize;
   3721
   3722	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
   3723
   3724	if (!page_has_buffers(page))
   3725		create_empty_buffers(page, blocksize, 0);
   3726
   3727	/* Find the buffer that contains "offset" */
   3728	bh = page_buffers(page);
   3729	pos = blocksize;
   3730	while (offset >= pos) {
   3731		bh = bh->b_this_page;
   3732		iblock++;
   3733		pos += blocksize;
   3734	}
   3735	if (buffer_freed(bh)) {
   3736		BUFFER_TRACE(bh, "freed: skip");
   3737		goto unlock;
   3738	}
   3739	if (!buffer_mapped(bh)) {
   3740		BUFFER_TRACE(bh, "unmapped");
   3741		ext4_get_block(inode, iblock, bh, 0);
   3742		/* unmapped? It's a hole - nothing to do */
   3743		if (!buffer_mapped(bh)) {
   3744			BUFFER_TRACE(bh, "still unmapped");
   3745			goto unlock;
   3746		}
   3747	}
   3748
   3749	/* Ok, it's mapped. Make sure it's up-to-date */
   3750	if (PageUptodate(page))
   3751		set_buffer_uptodate(bh);
   3752
   3753	if (!buffer_uptodate(bh)) {
   3754		err = ext4_read_bh_lock(bh, 0, true);
   3755		if (err)
   3756			goto unlock;
   3757		if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
   3758			/* We expect the key to be set. */
   3759			BUG_ON(!fscrypt_has_encryption_key(inode));
   3760			err = fscrypt_decrypt_pagecache_blocks(page, blocksize,
   3761							       bh_offset(bh));
   3762			if (err) {
   3763				clear_buffer_uptodate(bh);
   3764				goto unlock;
   3765			}
   3766		}
   3767	}
   3768	if (ext4_should_journal_data(inode)) {
   3769		BUFFER_TRACE(bh, "get write access");
   3770		err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
   3771						    EXT4_JTR_NONE);
   3772		if (err)
   3773			goto unlock;
   3774	}
   3775	zero_user(page, offset, length);
   3776	BUFFER_TRACE(bh, "zeroed end of block");
   3777
   3778	if (ext4_should_journal_data(inode)) {
   3779		err = ext4_handle_dirty_metadata(handle, inode, bh);
   3780	} else {
   3781		err = 0;
   3782		mark_buffer_dirty(bh);
   3783		if (ext4_should_order_data(inode))
   3784			err = ext4_jbd2_inode_add_write(handle, inode, from,
   3785					length);
   3786	}
   3787
   3788unlock:
   3789	unlock_page(page);
   3790	put_page(page);
   3791	return err;
   3792}
   3793
   3794/*
   3795 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
    3796 * starting from file offset 'from'.  The range to be zeroed must
    3797 * be contained within one block.  If the specified range exceeds
    3798 * the end of the block it will be shortened to the end of the block
    3799 * that corresponds to 'from'.
   3800 */
   3801static int ext4_block_zero_page_range(handle_t *handle,
   3802		struct address_space *mapping, loff_t from, loff_t length)
   3803{
   3804	struct inode *inode = mapping->host;
   3805	unsigned offset = from & (PAGE_SIZE-1);
   3806	unsigned blocksize = inode->i_sb->s_blocksize;
   3807	unsigned max = blocksize - (offset & (blocksize - 1));
   3808
   3809	/*
   3810	 * correct length if it does not fall between
   3811	 * 'from' and the end of the block
   3812	 */
   3813	if (length > max || length < 0)
   3814		length = max;
   3815
   3816	if (IS_DAX(inode)) {
   3817		return dax_zero_range(inode, from, length, NULL,
   3818				      &ext4_iomap_ops);
   3819	}
   3820	return __ext4_block_zero_page_range(handle, mapping, from, length);
   3821}
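/*
 * Editorial worked example (not upstream text) for the trimming above,
 * assuming 4KiB blocks and pages: from = 6000 gives offset = 1904 and
 * max = 4096 - 1904 = 2192, so any requested length larger than 2192
 * (or negative) is shortened to 2192 and zeroing stops exactly at the
 * end of the block containing byte 6000 (byte 8191).
 */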
   3822
   3823/*
   3824 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
   3825 * up to the end of the block which corresponds to `from'.
    3826 * This is required during truncate. We need to physically zero the tail end
   3827 * of that block so it doesn't yield old data if the file is later grown.
   3828 */
   3829static int ext4_block_truncate_page(handle_t *handle,
   3830		struct address_space *mapping, loff_t from)
   3831{
   3832	unsigned offset = from & (PAGE_SIZE-1);
   3833	unsigned length;
   3834	unsigned blocksize;
   3835	struct inode *inode = mapping->host;
   3836
   3837	/* If we are processing an encrypted inode during orphan list handling */
   3838	if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
   3839		return 0;
   3840
   3841	blocksize = inode->i_sb->s_blocksize;
   3842	length = blocksize - (offset & (blocksize - 1));
   3843
   3844	return ext4_block_zero_page_range(handle, mapping, from, length);
   3845}
   3846
   3847int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
   3848			     loff_t lstart, loff_t length)
   3849{
   3850	struct super_block *sb = inode->i_sb;
   3851	struct address_space *mapping = inode->i_mapping;
   3852	unsigned partial_start, partial_end;
   3853	ext4_fsblk_t start, end;
   3854	loff_t byte_end = (lstart + length - 1);
   3855	int err = 0;
   3856
   3857	partial_start = lstart & (sb->s_blocksize - 1);
   3858	partial_end = byte_end & (sb->s_blocksize - 1);
   3859
   3860	start = lstart >> sb->s_blocksize_bits;
   3861	end = byte_end >> sb->s_blocksize_bits;
   3862
   3863	/* Handle partial zero within the single block */
   3864	if (start == end &&
   3865	    (partial_start || (partial_end != sb->s_blocksize - 1))) {
   3866		err = ext4_block_zero_page_range(handle, mapping,
   3867						 lstart, length);
   3868		return err;
   3869	}
   3870	/* Handle partial zero out on the start of the range */
   3871	if (partial_start) {
   3872		err = ext4_block_zero_page_range(handle, mapping,
   3873						 lstart, sb->s_blocksize);
   3874		if (err)
   3875			return err;
   3876	}
   3877	/* Handle partial zero out on the end of the range */
   3878	if (partial_end != sb->s_blocksize - 1)
   3879		err = ext4_block_zero_page_range(handle, mapping,
   3880						 byte_end - partial_end,
   3881						 partial_end + 1);
   3882	return err;
   3883}
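/*
 * Editorial worked example (not upstream text), assuming 4KiB blocks:
 * zeroing lstart = 1000, length = 8000 gives byte_end = 8999, so
 * partial_start = 1000, partial_end = 807, start block 0, end block 2.
 * The head call zeroes bytes 1000..4095, the tail call zeroes bytes
 * 8192..8999, and the fully covered block 1 is left to the code that
 * actually removes blocks.
 */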
   3884
   3885int ext4_can_truncate(struct inode *inode)
   3886{
   3887	if (S_ISREG(inode->i_mode))
   3888		return 1;
   3889	if (S_ISDIR(inode->i_mode))
   3890		return 1;
   3891	if (S_ISLNK(inode->i_mode))
   3892		return !ext4_inode_is_fast_symlink(inode);
   3893	return 0;
   3894}
   3895
   3896/*
   3897 * We have to make sure i_disksize gets properly updated before we truncate
   3898 * page cache due to hole punching or zero range. Otherwise i_disksize update
   3899 * can get lost as it may have been postponed to submission of writeback but
   3900 * that will never happen after we truncate page cache.
   3901 */
   3902int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
   3903				      loff_t len)
   3904{
   3905	handle_t *handle;
   3906	int ret;
   3907
   3908	loff_t size = i_size_read(inode);
   3909
   3910	WARN_ON(!inode_is_locked(inode));
   3911	if (offset > size || offset + len < size)
   3912		return 0;
   3913
   3914	if (EXT4_I(inode)->i_disksize >= size)
   3915		return 0;
   3916
   3917	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
   3918	if (IS_ERR(handle))
   3919		return PTR_ERR(handle);
   3920	ext4_update_i_disksize(inode, size);
   3921	ret = ext4_mark_inode_dirty(handle, inode);
   3922	ext4_journal_stop(handle);
   3923
   3924	return ret;
   3925}
   3926
   3927static void ext4_wait_dax_page(struct inode *inode)
   3928{
   3929	filemap_invalidate_unlock(inode->i_mapping);
   3930	schedule();
   3931	filemap_invalidate_lock(inode->i_mapping);
   3932}
   3933
   3934int ext4_break_layouts(struct inode *inode)
   3935{
   3936	struct page *page;
   3937	int error;
   3938
   3939	if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
   3940		return -EINVAL;
   3941
   3942	do {
   3943		page = dax_layout_busy_page(inode->i_mapping);
   3944		if (!page)
   3945			return 0;
   3946
   3947		error = ___wait_var_event(&page->_refcount,
   3948				atomic_read(&page->_refcount) == 1,
   3949				TASK_INTERRUPTIBLE, 0, 0,
   3950				ext4_wait_dax_page(inode));
   3951	} while (error == 0);
   3952
   3953	return error;
   3954}
   3955
   3956/*
   3957 * ext4_punch_hole: punches a hole in a file by releasing the blocks
   3958 * associated with the given offset and length
   3959 *
    3960 * @file:   The file to punch the hole in
    3961 * @offset: The offset where the hole will begin
    3962 * @length: The length of the hole
   3963 *
   3964 * Returns: 0 on success or negative on failure
   3965 */
   3966
   3967int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
   3968{
   3969	struct inode *inode = file_inode(file);
   3970	struct super_block *sb = inode->i_sb;
   3971	ext4_lblk_t first_block, stop_block;
   3972	struct address_space *mapping = inode->i_mapping;
   3973	loff_t first_block_offset, last_block_offset, max_length;
   3974	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   3975	handle_t *handle;
   3976	unsigned int credits;
   3977	int ret = 0, ret2 = 0;
   3978
   3979	trace_ext4_punch_hole(inode, offset, length, 0);
   3980
   3981	/*
   3982	 * Write out all dirty pages to avoid race conditions
   3983	 * Then release them.
   3984	 */
   3985	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
   3986		ret = filemap_write_and_wait_range(mapping, offset,
   3987						   offset + length - 1);
   3988		if (ret)
   3989			return ret;
   3990	}
   3991
   3992	inode_lock(inode);
   3993
   3994	/* No need to punch hole beyond i_size */
   3995	if (offset >= inode->i_size)
   3996		goto out_mutex;
   3997
   3998	/*
   3999	 * If the hole extends beyond i_size, set the hole
   4000	 * to end after the page that contains i_size
   4001	 */
   4002	if (offset + length > inode->i_size) {
   4003		length = inode->i_size +
   4004		   PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
   4005		   offset;
   4006	}
   4007
   4008	/*
    4009	 * For punch hole, offset + length needs to end at least one block
    4010	 * before s_bitmap_maxbytes. Adjust the length if it goes beyond that limit.
   4011	 */
   4012	max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
   4013	if (offset + length > max_length)
   4014		length = max_length - offset;
   4015
   4016	if (offset & (sb->s_blocksize - 1) ||
   4017	    (offset + length) & (sb->s_blocksize - 1)) {
   4018		/*
   4019		 * Attach jinode to inode for jbd2 if we do any zeroing of
   4020		 * partial block
   4021		 */
   4022		ret = ext4_inode_attach_jinode(inode);
   4023		if (ret < 0)
   4024			goto out_mutex;
   4025
   4026	}
   4027
    4028	/* Wait for all existing dio workers; newcomers will block on i_rwsem */
   4029	inode_dio_wait(inode);
   4030
   4031	ret = file_modified(file);
   4032	if (ret)
   4033		goto out_mutex;
   4034
   4035	/*
   4036	 * Prevent page faults from reinstantiating pages we have released from
   4037	 * page cache.
   4038	 */
   4039	filemap_invalidate_lock(mapping);
   4040
   4041	ret = ext4_break_layouts(inode);
   4042	if (ret)
   4043		goto out_dio;
   4044
   4045	first_block_offset = round_up(offset, sb->s_blocksize);
   4046	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
   4047
    4048	/* Now release the pages and zero the block-aligned part of the pages */
   4049	if (last_block_offset > first_block_offset) {
   4050		ret = ext4_update_disksize_before_punch(inode, offset, length);
   4051		if (ret)
   4052			goto out_dio;
   4053		truncate_pagecache_range(inode, first_block_offset,
   4054					 last_block_offset);
   4055	}
   4056
   4057	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   4058		credits = ext4_writepage_trans_blocks(inode);
   4059	else
   4060		credits = ext4_blocks_for_truncate(inode);
   4061	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
   4062	if (IS_ERR(handle)) {
   4063		ret = PTR_ERR(handle);
   4064		ext4_std_error(sb, ret);
   4065		goto out_dio;
   4066	}
   4067
   4068	ret = ext4_zero_partial_blocks(handle, inode, offset,
   4069				       length);
   4070	if (ret)
   4071		goto out_stop;
   4072
   4073	first_block = (offset + sb->s_blocksize - 1) >>
   4074		EXT4_BLOCK_SIZE_BITS(sb);
   4075	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
   4076
   4077	/* If there are blocks to remove, do it */
   4078	if (stop_block > first_block) {
   4079
   4080		down_write(&EXT4_I(inode)->i_data_sem);
   4081		ext4_discard_preallocations(inode, 0);
   4082
   4083		ret = ext4_es_remove_extent(inode, first_block,
   4084					    stop_block - first_block);
   4085		if (ret) {
   4086			up_write(&EXT4_I(inode)->i_data_sem);
   4087			goto out_stop;
   4088		}
   4089
   4090		if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   4091			ret = ext4_ext_remove_space(inode, first_block,
   4092						    stop_block - 1);
   4093		else
   4094			ret = ext4_ind_remove_space(handle, inode, first_block,
   4095						    stop_block);
   4096
   4097		up_write(&EXT4_I(inode)->i_data_sem);
   4098	}
   4099	ext4_fc_track_range(handle, inode, first_block, stop_block);
   4100	if (IS_SYNC(inode))
   4101		ext4_handle_sync(handle);
   4102
   4103	inode->i_mtime = inode->i_ctime = current_time(inode);
   4104	ret2 = ext4_mark_inode_dirty(handle, inode);
   4105	if (unlikely(ret2))
   4106		ret = ret2;
   4107	if (ret >= 0)
   4108		ext4_update_inode_fsync_trans(handle, inode, 1);
   4109out_stop:
   4110	ext4_journal_stop(handle);
   4111out_dio:
   4112	filemap_invalidate_unlock(mapping);
   4113out_mutex:
   4114	inode_unlock(inode);
   4115	return ret;
   4116}
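/*
 * Editorial sketch (userspace, not upstream code): ext4_punch_hole() is
 * reached via the fallocate(2) path; a caller punches a hole without
 * changing the file size with:
 *
 *	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *		  offset, length);
 */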
   4117
   4118int ext4_inode_attach_jinode(struct inode *inode)
   4119{
   4120	struct ext4_inode_info *ei = EXT4_I(inode);
   4121	struct jbd2_inode *jinode;
   4122
   4123	if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
   4124		return 0;
   4125
   4126	jinode = jbd2_alloc_inode(GFP_KERNEL);
   4127	spin_lock(&inode->i_lock);
   4128	if (!ei->jinode) {
   4129		if (!jinode) {
   4130			spin_unlock(&inode->i_lock);
   4131			return -ENOMEM;
   4132		}
   4133		ei->jinode = jinode;
   4134		jbd2_journal_init_jbd_inode(ei->jinode, inode);
   4135		jinode = NULL;
   4136	}
   4137	spin_unlock(&inode->i_lock);
   4138	if (unlikely(jinode != NULL))
   4139		jbd2_free_inode(jinode);
   4140	return 0;
   4141}
   4142
   4143/*
   4144 * ext4_truncate()
   4145 *
   4146 * We block out ext4_get_block() block instantiations across the entire
   4147 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
   4148 * simultaneously on behalf of the same inode.
   4149 *
   4150 * As we work through the truncate and commit bits of it to the journal there
   4151 * is one core, guiding principle: the file's tree must always be consistent on
   4152 * disk.  We must be able to restart the truncate after a crash.
   4153 *
   4154 * The file's tree may be transiently inconsistent in memory (although it
   4155 * probably isn't), but whenever we close off and commit a journal transaction,
   4156 * the contents of (the filesystem + the journal) must be consistent and
   4157 * restartable.  It's pretty simple, really: bottom up, right to left (although
   4158 * left-to-right works OK too).
   4159 *
   4160 * Note that at recovery time, journal replay occurs *before* the restart of
   4161 * truncate against the orphan inode list.
   4162 *
   4163 * The committed inode has the new, desired i_size (which is the same as
   4164 * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
   4165 * that this inode's truncate did not complete and it will again call
   4166 * ext4_truncate() to have another go.  So there will be instantiated blocks
   4167 * to the right of the truncation point in a crashed ext4 filesystem.  But
   4168 * that's fine - as long as they are linked from the inode, the post-crash
   4169 * ext4_truncate() run will find them and release them.
   4170 */
   4171int ext4_truncate(struct inode *inode)
   4172{
   4173	struct ext4_inode_info *ei = EXT4_I(inode);
   4174	unsigned int credits;
   4175	int err = 0, err2;
   4176	handle_t *handle;
   4177	struct address_space *mapping = inode->i_mapping;
   4178
   4179	/*
   4180	 * There is a possibility that we're either freeing the inode
   4181	 * or it's a completely new inode. In those cases we might not
   4182	 * have i_rwsem locked because it's not necessary.
   4183	 */
   4184	if (!(inode->i_state & (I_NEW|I_FREEING)))
   4185		WARN_ON(!inode_is_locked(inode));
   4186	trace_ext4_truncate_enter(inode);
   4187
   4188	if (!ext4_can_truncate(inode))
   4189		goto out_trace;
   4190
   4191	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
   4192		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
   4193
   4194	if (ext4_has_inline_data(inode)) {
   4195		int has_inline = 1;
   4196
   4197		err = ext4_inline_data_truncate(inode, &has_inline);
   4198		if (err || has_inline)
   4199			goto out_trace;
   4200	}
   4201
    4202	/* If we zero out the tail of the page, we have to create a jinode for jbd2 */
   4203	if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
   4204		if (ext4_inode_attach_jinode(inode) < 0)
   4205			goto out_trace;
   4206	}
   4207
   4208	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   4209		credits = ext4_writepage_trans_blocks(inode);
   4210	else
   4211		credits = ext4_blocks_for_truncate(inode);
   4212
   4213	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
   4214	if (IS_ERR(handle)) {
   4215		err = PTR_ERR(handle);
   4216		goto out_trace;
   4217	}
   4218
   4219	if (inode->i_size & (inode->i_sb->s_blocksize - 1))
   4220		ext4_block_truncate_page(handle, mapping, inode->i_size);
   4221
   4222	/*
   4223	 * We add the inode to the orphan list, so that if this
   4224	 * truncate spans multiple transactions, and we crash, we will
   4225	 * resume the truncate when the filesystem recovers.  It also
   4226	 * marks the inode dirty, to catch the new size.
   4227	 *
   4228	 * Implication: the file must always be in a sane, consistent
   4229	 * truncatable state while each transaction commits.
   4230	 */
   4231	err = ext4_orphan_add(handle, inode);
   4232	if (err)
   4233		goto out_stop;
   4234
   4235	down_write(&EXT4_I(inode)->i_data_sem);
   4236
   4237	ext4_discard_preallocations(inode, 0);
   4238
   4239	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   4240		err = ext4_ext_truncate(handle, inode);
   4241	else
   4242		ext4_ind_truncate(handle, inode);
   4243
   4244	up_write(&ei->i_data_sem);
   4245	if (err)
   4246		goto out_stop;
   4247
   4248	if (IS_SYNC(inode))
   4249		ext4_handle_sync(handle);
   4250
   4251out_stop:
   4252	/*
   4253	 * If this was a simple ftruncate() and the file will remain alive,
   4254	 * then we need to clear up the orphan record which we created above.
   4255	 * However, if this was a real unlink then we were called by
   4256	 * ext4_evict_inode(), and we allow that function to clean up the
   4257	 * orphan info for us.
   4258	 */
   4259	if (inode->i_nlink)
   4260		ext4_orphan_del(handle, inode);
   4261
   4262	inode->i_mtime = inode->i_ctime = current_time(inode);
   4263	err2 = ext4_mark_inode_dirty(handle, inode);
   4264	if (unlikely(err2 && !err))
   4265		err = err2;
   4266	ext4_journal_stop(handle);
   4267
   4268out_trace:
   4269	trace_ext4_truncate_exit(inode);
   4270	return err;
   4271}
   4272
   4273static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
   4274{
   4275	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
   4276		return inode_peek_iversion_raw(inode);
   4277	else
   4278		return inode_peek_iversion(inode);
   4279}
   4280
   4281static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
   4282				 struct ext4_inode_info *ei)
   4283{
   4284	struct inode *inode = &(ei->vfs_inode);
   4285	u64 i_blocks = READ_ONCE(inode->i_blocks);
   4286	struct super_block *sb = inode->i_sb;
   4287
   4288	if (i_blocks <= ~0U) {
   4289		/*
   4290		 * i_blocks can be represented in a 32 bit variable
   4291		 * as multiple of 512 bytes
   4292		 */
   4293		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
   4294		raw_inode->i_blocks_high = 0;
   4295		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
   4296		return 0;
   4297	}
   4298
   4299	/*
   4300	 * This should never happen since sb->s_maxbytes should not have
   4301	 * allowed this, sb->s_maxbytes was set according to the huge_file
   4302	 * feature in ext4_fill_super().
   4303	 */
   4304	if (!ext4_has_feature_huge_file(sb))
   4305		return -EFSCORRUPTED;
   4306
   4307	if (i_blocks <= 0xffffffffffffULL) {
   4308		/*
   4309		 * i_blocks can be represented in a 48 bit variable
   4310		 * as multiple of 512 bytes
   4311		 */
   4312		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
   4313		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
   4314		ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
   4315	} else {
   4316		ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
    4317		/* i_blocks is stored in units of the filesystem block size */
   4318		i_blocks = i_blocks >> (inode->i_blkbits - 9);
   4319		raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
   4320		raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
   4321	}
   4322	return 0;
   4323}
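/*
 * Editorial worked example (not upstream text): for i_blocks =
 * 0x123456789A (fits in 48 bits but not 32), the split above stores
 * i_blocks_lo = 0x3456789A and i_blocks_high = 0x12 with HUGE_FILE
 * cleared. Only counts that overflow 48 bits are re-expressed in
 * filesystem-block units and get the HUGE_FILE flag set.
 */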
   4324
   4325static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
   4326{
   4327	struct ext4_inode_info *ei = EXT4_I(inode);
   4328	uid_t i_uid;
   4329	gid_t i_gid;
   4330	projid_t i_projid;
   4331	int block;
   4332	int err;
   4333
   4334	err = ext4_inode_blocks_set(raw_inode, ei);
   4335
   4336	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
   4337	i_uid = i_uid_read(inode);
   4338	i_gid = i_gid_read(inode);
   4339	i_projid = from_kprojid(&init_user_ns, ei->i_projid);
   4340	if (!(test_opt(inode->i_sb, NO_UID32))) {
   4341		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
   4342		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
   4343		/*
   4344		 * Fix up interoperability with old kernels. Otherwise,
   4345		 * old inodes get re-used with the upper 16 bits of the
   4346		 * uid/gid intact.
   4347		 */
   4348		if (ei->i_dtime && list_empty(&ei->i_orphan)) {
   4349			raw_inode->i_uid_high = 0;
   4350			raw_inode->i_gid_high = 0;
   4351		} else {
   4352			raw_inode->i_uid_high =
   4353				cpu_to_le16(high_16_bits(i_uid));
   4354			raw_inode->i_gid_high =
   4355				cpu_to_le16(high_16_bits(i_gid));
   4356		}
   4357	} else {
   4358		raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
   4359		raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
   4360		raw_inode->i_uid_high = 0;
   4361		raw_inode->i_gid_high = 0;
   4362	}
   4363	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
   4364
   4365	EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
   4366	EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
   4367	EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
   4368	EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
   4369
   4370	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
   4371	raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
   4372	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
   4373		raw_inode->i_file_acl_high =
   4374			cpu_to_le16(ei->i_file_acl >> 32);
   4375	raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
   4376	ext4_isize_set(raw_inode, ei->i_disksize);
   4377
   4378	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
   4379	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
   4380		if (old_valid_dev(inode->i_rdev)) {
   4381			raw_inode->i_block[0] =
   4382				cpu_to_le32(old_encode_dev(inode->i_rdev));
   4383			raw_inode->i_block[1] = 0;
   4384		} else {
   4385			raw_inode->i_block[0] = 0;
   4386			raw_inode->i_block[1] =
   4387				cpu_to_le32(new_encode_dev(inode->i_rdev));
   4388			raw_inode->i_block[2] = 0;
   4389		}
   4390	} else if (!ext4_has_inline_data(inode)) {
   4391		for (block = 0; block < EXT4_N_BLOCKS; block++)
   4392			raw_inode->i_block[block] = ei->i_data[block];
   4393	}
   4394
   4395	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
   4396		u64 ivers = ext4_inode_peek_iversion(inode);
   4397
   4398		raw_inode->i_disk_version = cpu_to_le32(ivers);
   4399		if (ei->i_extra_isize) {
   4400			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
   4401				raw_inode->i_version_hi =
   4402					cpu_to_le32(ivers >> 32);
   4403			raw_inode->i_extra_isize =
   4404				cpu_to_le16(ei->i_extra_isize);
   4405		}
   4406	}
   4407
   4408	if (i_projid != EXT4_DEF_PROJID &&
   4409	    !ext4_has_feature_project(inode->i_sb))
   4410		err = err ?: -EFSCORRUPTED;
   4411
   4412	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
   4413	    EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
   4414		raw_inode->i_projid = cpu_to_le32(i_projid);
   4415
   4416	ext4_inode_csum_set(inode, raw_inode, ei);
   4417	return err;
   4418}
   4419
   4420/*
   4421 * ext4_get_inode_loc returns with an extra refcount against the inode's
   4422 * underlying buffer_head on success. If we pass 'inode' and it does not
   4423 * have in-inode xattr, we have all inode data in memory that is needed
   4424 * to recreate the on-disk version of this inode.
   4425 */
   4426static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
   4427				struct inode *inode, struct ext4_iloc *iloc,
   4428				ext4_fsblk_t *ret_block)
   4429{
   4430	struct ext4_group_desc	*gdp;
   4431	struct buffer_head	*bh;
   4432	ext4_fsblk_t		block;
   4433	struct blk_plug		plug;
   4434	int			inodes_per_block, inode_offset;
   4435
   4436	iloc->bh = NULL;
   4437	if (ino < EXT4_ROOT_INO ||
   4438	    ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
   4439		return -EFSCORRUPTED;
   4440
   4441	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
   4442	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
   4443	if (!gdp)
   4444		return -EIO;
   4445
   4446	/*
   4447	 * Figure out the offset within the block group inode table
   4448	 */
   4449	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
   4450	inode_offset = ((ino - 1) %
   4451			EXT4_INODES_PER_GROUP(sb));
   4452	block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
   4453	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
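	/*
	 * Editorial worked example (not upstream text), assuming 8192
	 * inodes per group, 256-byte inodes and 4KiB blocks (16 inodes
	 * per block): ino = 8200 lands in block_group 1 with
	 * inode_offset = 7, so block = itable + 7 / 16 = itable and
	 * iloc->offset = (7 % 16) * 256 = 1792.
	 */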
   4454
   4455	bh = sb_getblk(sb, block);
   4456	if (unlikely(!bh))
   4457		return -ENOMEM;
   4458	if (ext4_buffer_uptodate(bh))
   4459		goto has_buffer;
   4460
   4461	lock_buffer(bh);
   4462	if (ext4_buffer_uptodate(bh)) {
   4463		/* Someone brought it uptodate while we waited */
   4464		unlock_buffer(bh);
   4465		goto has_buffer;
   4466	}
   4467
   4468	/*
   4469	 * If we have all information of the inode in memory and this
   4470	 * is the only valid inode in the block, we need not read the
   4471	 * block.
   4472	 */
   4473	if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
   4474		struct buffer_head *bitmap_bh;
   4475		int i, start;
   4476
   4477		start = inode_offset & ~(inodes_per_block - 1);
   4478
   4479		/* Is the inode bitmap in cache? */
   4480		bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
   4481		if (unlikely(!bitmap_bh))
   4482			goto make_io;
   4483
   4484		/*
   4485		 * If the inode bitmap isn't in cache then the
   4486		 * optimisation may end up performing two reads instead
   4487		 * of one, so skip it.
   4488		 */
   4489		if (!buffer_uptodate(bitmap_bh)) {
   4490			brelse(bitmap_bh);
   4491			goto make_io;
   4492		}
   4493		for (i = start; i < start + inodes_per_block; i++) {
   4494			if (i == inode_offset)
   4495				continue;
   4496			if (ext4_test_bit(i, bitmap_bh->b_data))
   4497				break;
   4498		}
   4499		brelse(bitmap_bh);
   4500		if (i == start + inodes_per_block) {
   4501			struct ext4_inode *raw_inode =
   4502				(struct ext4_inode *) (bh->b_data + iloc->offset);
   4503
   4504			/* all other inodes are free, so skip I/O */
   4505			memset(bh->b_data, 0, bh->b_size);
   4506			if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
   4507				ext4_fill_raw_inode(inode, raw_inode);
   4508			set_buffer_uptodate(bh);
   4509			unlock_buffer(bh);
   4510			goto has_buffer;
   4511		}
   4512	}
   4513
   4514make_io:
   4515	/*
   4516	 * If we need to do any I/O, try to pre-readahead extra
   4517	 * blocks from the inode table.
   4518	 */
   4519	blk_start_plug(&plug);
   4520	if (EXT4_SB(sb)->s_inode_readahead_blks) {
   4521		ext4_fsblk_t b, end, table;
   4522		unsigned num;
   4523		__u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;
   4524
   4525		table = ext4_inode_table(sb, gdp);
   4526		/* s_inode_readahead_blks is always a power of 2 */
   4527		b = block & ~((ext4_fsblk_t) ra_blks - 1);
   4528		if (table > b)
   4529			b = table;
   4530		end = b + ra_blks;
   4531		num = EXT4_INODES_PER_GROUP(sb);
   4532		if (ext4_has_group_desc_csum(sb))
   4533			num -= ext4_itable_unused_count(sb, gdp);
   4534		table += num / inodes_per_block;
   4535		if (end > table)
   4536			end = table;
   4537		while (b <= end)
   4538			ext4_sb_breadahead_unmovable(sb, b++);
   4539	}
   4540
   4541	/*
   4542	 * There are other valid inodes in the buffer, this inode
   4543	 * has in-inode xattrs, or we don't have this inode in memory.
   4544	 * Read the block from disk.
   4545	 */
   4546	trace_ext4_load_inode(sb, ino);
   4547	ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
   4548	blk_finish_plug(&plug);
   4549	wait_on_buffer(bh);
   4550	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO);
   4551	if (!buffer_uptodate(bh)) {
   4552		if (ret_block)
   4553			*ret_block = block;
   4554		brelse(bh);
   4555		return -EIO;
   4556	}
   4557has_buffer:
   4558	iloc->bh = bh;
   4559	return 0;
   4560}
   4561
   4562static int __ext4_get_inode_loc_noinmem(struct inode *inode,
   4563					struct ext4_iloc *iloc)
   4564{
   4565	ext4_fsblk_t err_blk = 0;
   4566	int ret;
   4567
   4568	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
   4569					&err_blk);
   4570
   4571	if (ret == -EIO)
   4572		ext4_error_inode_block(inode, err_blk, EIO,
   4573					"unable to read itable block");
   4574
   4575	return ret;
   4576}
   4577
   4578int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
   4579{
   4580	ext4_fsblk_t err_blk = 0;
   4581	int ret;
   4582
   4583	ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
   4584					&err_blk);
   4585
   4586	if (ret == -EIO)
   4587		ext4_error_inode_block(inode, err_blk, EIO,
   4588					"unable to read itable block");
   4589
   4590	return ret;
   4591}
   4592
   4593
   4594int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
   4595			  struct ext4_iloc *iloc)
   4596{
   4597	return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
   4598}
   4599
   4600static bool ext4_should_enable_dax(struct inode *inode)
   4601{
   4602	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   4603
   4604	if (test_opt2(inode->i_sb, DAX_NEVER))
   4605		return false;
   4606	if (!S_ISREG(inode->i_mode))
   4607		return false;
   4608	if (ext4_should_journal_data(inode))
   4609		return false;
   4610	if (ext4_has_inline_data(inode))
   4611		return false;
   4612	if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT))
   4613		return false;
   4614	if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY))
   4615		return false;
   4616	if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
   4617		return false;
   4618	if (test_opt(inode->i_sb, DAX_ALWAYS))
   4619		return true;
   4620
   4621	return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
   4622}
   4623
   4624void ext4_set_inode_flags(struct inode *inode, bool init)
   4625{
   4626	unsigned int flags = EXT4_I(inode)->i_flags;
   4627	unsigned int new_fl = 0;
   4628
   4629	WARN_ON_ONCE(IS_DAX(inode) && init);
   4630
   4631	if (flags & EXT4_SYNC_FL)
   4632		new_fl |= S_SYNC;
   4633	if (flags & EXT4_APPEND_FL)
   4634		new_fl |= S_APPEND;
   4635	if (flags & EXT4_IMMUTABLE_FL)
   4636		new_fl |= S_IMMUTABLE;
   4637	if (flags & EXT4_NOATIME_FL)
   4638		new_fl |= S_NOATIME;
   4639	if (flags & EXT4_DIRSYNC_FL)
   4640		new_fl |= S_DIRSYNC;
   4641
   4642	/* Because of the way inode_set_flags() works we must preserve S_DAX
   4643	 * here if already set. */
   4644	new_fl |= (inode->i_flags & S_DAX);
   4645	if (init && ext4_should_enable_dax(inode))
   4646		new_fl |= S_DAX;
   4647
   4648	if (flags & EXT4_ENCRYPT_FL)
   4649		new_fl |= S_ENCRYPTED;
   4650	if (flags & EXT4_CASEFOLD_FL)
   4651		new_fl |= S_CASEFOLD;
   4652	if (flags & EXT4_VERITY_FL)
   4653		new_fl |= S_VERITY;
   4654	inode_set_flags(inode, new_fl,
   4655			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
   4656			S_ENCRYPTED|S_CASEFOLD|S_VERITY);
   4657}
   4658
   4659static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
   4660				  struct ext4_inode_info *ei)
   4661{
    4662	blkcnt_t i_blocks;
   4663	struct inode *inode = &(ei->vfs_inode);
   4664	struct super_block *sb = inode->i_sb;
   4665
   4666	if (ext4_has_feature_huge_file(sb)) {
   4667		/* we are using combined 48 bit field */
   4668		i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
   4669					le32_to_cpu(raw_inode->i_blocks_lo);
   4670		if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
   4671			/* i_blocks represent file system block size */
   4672			return i_blocks  << (inode->i_blkbits - 9);
   4673		} else {
   4674			return i_blocks;
   4675		}
   4676	} else {
   4677		return le32_to_cpu(raw_inode->i_blocks_lo);
   4678	}
   4679}
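
/*
 * A worked sketch of the 48-bit block count combination above, using purely
 * hypothetical raw values: with i_blocks_high == 0x0001 and
 * i_blocks_lo == 0x00000002 the combined count is (1ULL << 32) | 2 ==
 * 4294967298.  If EXT4_INODE_HUGE_FILE is set, that value counts filesystem
 * blocks, so on a 4K-block filesystem (i_blkbits == 12) it is converted to
 * 512-byte units by shifting left by (12 - 9) == 3, giving 34359738384.
 */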
   4680
   4681static inline int ext4_iget_extra_inode(struct inode *inode,
   4682					 struct ext4_inode *raw_inode,
   4683					 struct ext4_inode_info *ei)
   4684{
   4685	__le32 *magic = (void *)raw_inode +
   4686			EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
   4687
   4688	if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
   4689	    EXT4_INODE_SIZE(inode->i_sb) &&
   4690	    *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
   4691		ext4_set_inode_state(inode, EXT4_STATE_XATTR);
   4692		return ext4_find_inline_data_nolock(inode);
   4693	} else
   4694		EXT4_I(inode)->i_inline_off = 0;
   4695	return 0;
   4696}
   4697
   4698int ext4_get_projid(struct inode *inode, kprojid_t *projid)
   4699{
   4700	if (!ext4_has_feature_project(inode->i_sb))
   4701		return -EOPNOTSUPP;
   4702	*projid = EXT4_I(inode)->i_projid;
   4703	return 0;
   4704}
   4705
   4706/*
    4707 * ext4 manages i_version itself for EA inodes: it stores the lower 32 bits
    4708 * of the refcount in i_version, so use raw values if the inode has the
    4709 * EXT4_EA_INODE_FL flag set.
   4710 */
   4711static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
   4712{
   4713	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
   4714		inode_set_iversion_raw(inode, val);
   4715	else
   4716		inode_set_iversion_queried(inode, val);
   4717}
   4718
   4719struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
   4720			  ext4_iget_flags flags, const char *function,
   4721			  unsigned int line)
   4722{
   4723	struct ext4_iloc iloc;
   4724	struct ext4_inode *raw_inode;
   4725	struct ext4_inode_info *ei;
   4726	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
   4727	struct inode *inode;
   4728	journal_t *journal = EXT4_SB(sb)->s_journal;
   4729	long ret;
   4730	loff_t size;
   4731	int block;
   4732	uid_t i_uid;
   4733	gid_t i_gid;
   4734	projid_t i_projid;
   4735
   4736	if ((!(flags & EXT4_IGET_SPECIAL) &&
   4737	     ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
   4738	      ino == le32_to_cpu(es->s_usr_quota_inum) ||
   4739	      ino == le32_to_cpu(es->s_grp_quota_inum) ||
   4740	      ino == le32_to_cpu(es->s_prj_quota_inum) ||
   4741	      ino == le32_to_cpu(es->s_orphan_file_inum))) ||
   4742	    (ino < EXT4_ROOT_INO) ||
   4743	    (ino > le32_to_cpu(es->s_inodes_count))) {
   4744		if (flags & EXT4_IGET_HANDLE)
   4745			return ERR_PTR(-ESTALE);
   4746		__ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
   4747			     "inode #%lu: comm %s: iget: illegal inode #",
   4748			     ino, current->comm);
   4749		return ERR_PTR(-EFSCORRUPTED);
   4750	}
   4751
   4752	inode = iget_locked(sb, ino);
   4753	if (!inode)
   4754		return ERR_PTR(-ENOMEM);
   4755	if (!(inode->i_state & I_NEW))
   4756		return inode;
   4757
   4758	ei = EXT4_I(inode);
   4759	iloc.bh = NULL;
   4760
   4761	ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
   4762	if (ret < 0)
   4763		goto bad_inode;
   4764	raw_inode = ext4_raw_inode(&iloc);
   4765
   4766	if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
   4767		ext4_error_inode(inode, function, line, 0,
   4768				 "iget: root inode unallocated");
   4769		ret = -EFSCORRUPTED;
   4770		goto bad_inode;
   4771	}
   4772
   4773	if ((flags & EXT4_IGET_HANDLE) &&
   4774	    (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
   4775		ret = -ESTALE;
   4776		goto bad_inode;
   4777	}
   4778
   4779	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
   4780		ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
   4781		if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
   4782			EXT4_INODE_SIZE(inode->i_sb) ||
   4783		    (ei->i_extra_isize & 3)) {
   4784			ext4_error_inode(inode, function, line, 0,
   4785					 "iget: bad extra_isize %u "
   4786					 "(inode size %u)",
   4787					 ei->i_extra_isize,
   4788					 EXT4_INODE_SIZE(inode->i_sb));
   4789			ret = -EFSCORRUPTED;
   4790			goto bad_inode;
   4791		}
   4792	} else
   4793		ei->i_extra_isize = 0;
   4794
   4795	/* Precompute checksum seed for inode metadata */
   4796	if (ext4_has_metadata_csum(sb)) {
   4797		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   4798		__u32 csum;
   4799		__le32 inum = cpu_to_le32(inode->i_ino);
   4800		__le32 gen = raw_inode->i_generation;
   4801		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
   4802				   sizeof(inum));
   4803		ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
   4804					      sizeof(gen));
   4805	}
   4806
   4807	if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
   4808	    ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
   4809	     (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
   4810		ext4_error_inode_err(inode, function, line, 0,
   4811				EFSBADCRC, "iget: checksum invalid");
   4812		ret = -EFSBADCRC;
   4813		goto bad_inode;
   4814	}
   4815
   4816	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
   4817	i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
   4818	i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
   4819	if (ext4_has_feature_project(sb) &&
   4820	    EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
   4821	    EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
   4822		i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
   4823	else
   4824		i_projid = EXT4_DEF_PROJID;
   4825
   4826	if (!(test_opt(inode->i_sb, NO_UID32))) {
   4827		i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
   4828		i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
   4829	}
   4830	i_uid_write(inode, i_uid);
   4831	i_gid_write(inode, i_gid);
   4832	ei->i_projid = make_kprojid(&init_user_ns, i_projid);
   4833	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
   4834
   4835	ext4_clear_state_flags(ei);	/* Only relevant on 32-bit archs */
   4836	ei->i_inline_off = 0;
   4837	ei->i_dir_start_lookup = 0;
   4838	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
   4839	/* We now have enough fields to check if the inode was active or not.
    4840	 * This is needed because nfsd might try to access dead inodes;
    4841	 * the test is the same one that e2fsck uses.
   4842	 * NeilBrown 1999oct15
   4843	 */
   4844	if (inode->i_nlink == 0) {
   4845		if ((inode->i_mode == 0 ||
   4846		     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
   4847		    ino != EXT4_BOOT_LOADER_INO) {
   4848			/* this inode is deleted */
   4849			ret = -ESTALE;
   4850			goto bad_inode;
   4851		}
   4852		/* The only unlinked inodes we let through here have
   4853		 * valid i_mode and are being read by the orphan
   4854		 * recovery code: that's fine, we're about to complete
   4855		 * the process of deleting those.
   4856		 * OR it is the EXT4_BOOT_LOADER_INO which is
   4857		 * not initialized on a new filesystem. */
   4858	}
   4859	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
   4860	ext4_set_inode_flags(inode, true);
   4861	inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
   4862	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
   4863	if (ext4_has_feature_64bit(sb))
   4864		ei->i_file_acl |=
   4865			((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
   4866	inode->i_size = ext4_isize(sb, raw_inode);
   4867	if ((size = i_size_read(inode)) < 0) {
   4868		ext4_error_inode(inode, function, line, 0,
   4869				 "iget: bad i_size value: %lld", size);
   4870		ret = -EFSCORRUPTED;
   4871		goto bad_inode;
   4872	}
   4873	/*
    4874	 * If dir_index is not enabled but there's a dir with the INDEX flag set,
    4875	 * we'd normally treat htree data as empty space. But with metadata
    4876	 * checksumming that corrupts checksums, so forbid that.
   4877	 */
   4878	if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
   4879	    ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
   4880		ext4_error_inode(inode, function, line, 0,
   4881			 "iget: Dir with htree data on filesystem without dir_index feature.");
   4882		ret = -EFSCORRUPTED;
   4883		goto bad_inode;
   4884	}
   4885	ei->i_disksize = inode->i_size;
   4886#ifdef CONFIG_QUOTA
   4887	ei->i_reserved_quota = 0;
   4888#endif
   4889	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
   4890	ei->i_block_group = iloc.block_group;
   4891	ei->i_last_alloc_group = ~0;
   4892	/*
   4893	 * NOTE! The in-memory inode i_data array is in little-endian order
   4894	 * even on big-endian machines: we do NOT byteswap the block numbers!
   4895	 */
   4896	for (block = 0; block < EXT4_N_BLOCKS; block++)
   4897		ei->i_data[block] = raw_inode->i_block[block];
   4898	INIT_LIST_HEAD(&ei->i_orphan);
   4899	ext4_fc_init_inode(&ei->vfs_inode);
   4900
   4901	/*
    4902	 * Set transaction IDs of transactions that have to be committed
    4903	 * to finish f[data]sync. We set them to the currently running transaction
   4904	 * as we cannot be sure that the inode or some of its metadata isn't
   4905	 * part of the transaction - the inode could have been reclaimed and
   4906	 * now it is reread from disk.
   4907	 */
   4908	if (journal) {
   4909		transaction_t *transaction;
   4910		tid_t tid;
   4911
   4912		read_lock(&journal->j_state_lock);
   4913		if (journal->j_running_transaction)
   4914			transaction = journal->j_running_transaction;
   4915		else
   4916			transaction = journal->j_committing_transaction;
   4917		if (transaction)
   4918			tid = transaction->t_tid;
   4919		else
   4920			tid = journal->j_commit_sequence;
   4921		read_unlock(&journal->j_state_lock);
   4922		ei->i_sync_tid = tid;
   4923		ei->i_datasync_tid = tid;
   4924	}
   4925
   4926	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
   4927		if (ei->i_extra_isize == 0) {
   4928			/* The extra space is currently unused. Use it. */
   4929			BUILD_BUG_ON(sizeof(struct ext4_inode) & 3);
   4930			ei->i_extra_isize = sizeof(struct ext4_inode) -
   4931					    EXT4_GOOD_OLD_INODE_SIZE;
   4932		} else {
   4933			ret = ext4_iget_extra_inode(inode, raw_inode, ei);
   4934			if (ret)
   4935				goto bad_inode;
   4936		}
   4937	}
   4938
   4939	EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
   4940	EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
   4941	EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
   4942	EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
   4943
   4944	if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
   4945		u64 ivers = le32_to_cpu(raw_inode->i_disk_version);
   4946
   4947		if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
   4948			if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
   4949				ivers |=
   4950		    (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
   4951		}
   4952		ext4_inode_set_iversion_queried(inode, ivers);
   4953	}
   4954
   4955	ret = 0;
   4956	if (ei->i_file_acl &&
   4957	    !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
   4958		ext4_error_inode(inode, function, line, 0,
   4959				 "iget: bad extended attribute block %llu",
   4960				 ei->i_file_acl);
   4961		ret = -EFSCORRUPTED;
   4962		goto bad_inode;
   4963	} else if (!ext4_has_inline_data(inode)) {
   4964		/* validate the block references in the inode */
   4965		if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
   4966			(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
   4967			(S_ISLNK(inode->i_mode) &&
   4968			!ext4_inode_is_fast_symlink(inode)))) {
   4969			if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
   4970				ret = ext4_ext_check_inode(inode);
   4971			else
   4972				ret = ext4_ind_check_inode(inode);
   4973		}
   4974	}
   4975	if (ret)
   4976		goto bad_inode;
   4977
   4978	if (S_ISREG(inode->i_mode)) {
   4979		inode->i_op = &ext4_file_inode_operations;
   4980		inode->i_fop = &ext4_file_operations;
   4981		ext4_set_aops(inode);
   4982	} else if (S_ISDIR(inode->i_mode)) {
   4983		inode->i_op = &ext4_dir_inode_operations;
   4984		inode->i_fop = &ext4_dir_operations;
   4985	} else if (S_ISLNK(inode->i_mode)) {
   4986		/* VFS does not allow setting these so must be corruption */
   4987		if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
   4988			ext4_error_inode(inode, function, line, 0,
   4989					 "iget: immutable or append flags "
   4990					 "not allowed on symlinks");
   4991			ret = -EFSCORRUPTED;
   4992			goto bad_inode;
   4993		}
   4994		if (IS_ENCRYPTED(inode)) {
   4995			inode->i_op = &ext4_encrypted_symlink_inode_operations;
   4996		} else if (ext4_inode_is_fast_symlink(inode)) {
   4997			inode->i_link = (char *)ei->i_data;
   4998			inode->i_op = &ext4_fast_symlink_inode_operations;
   4999			nd_terminate_link(ei->i_data, inode->i_size,
   5000				sizeof(ei->i_data) - 1);
   5001		} else {
   5002			inode->i_op = &ext4_symlink_inode_operations;
   5003		}
   5004	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
   5005	      S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
   5006		inode->i_op = &ext4_special_inode_operations;
   5007		if (raw_inode->i_block[0])
   5008			init_special_inode(inode, inode->i_mode,
   5009			   old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
   5010		else
   5011			init_special_inode(inode, inode->i_mode,
   5012			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
   5013	} else if (ino == EXT4_BOOT_LOADER_INO) {
   5014		make_bad_inode(inode);
   5015	} else {
   5016		ret = -EFSCORRUPTED;
   5017		ext4_error_inode(inode, function, line, 0,
   5018				 "iget: bogus i_mode (%o)", inode->i_mode);
   5019		goto bad_inode;
   5020	}
   5021	if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
   5022		ext4_error_inode(inode, function, line, 0,
   5023				 "casefold flag without casefold feature");
   5024	brelse(iloc.bh);
   5025
   5026	unlock_new_inode(inode);
   5027	return inode;
   5028
   5029bad_inode:
   5030	brelse(iloc.bh);
   5031	iget_failed(inode);
   5032	return ERR_PTR(ret);
   5033}
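
/*
 * A minimal caller-side sketch, assuming the ext4_iget() wrapper macro and
 * the EXT4_IGET_NORMAL flag declared in ext4.h; this is illustration only,
 * not a call site in this file.
 */
#if 0
	struct inode *inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* e.g. -ESTALE or -EFSCORRUPTED */
	/* ... use the fully initialised inode ... */
	iput(inode);
#endif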
   5034
   5035static void __ext4_update_other_inode_time(struct super_block *sb,
   5036					   unsigned long orig_ino,
   5037					   unsigned long ino,
   5038					   struct ext4_inode *raw_inode)
   5039{
   5040	struct inode *inode;
   5041
   5042	inode = find_inode_by_ino_rcu(sb, ino);
   5043	if (!inode)
   5044		return;
   5045
   5046	if (!inode_is_dirtytime_only(inode))
   5047		return;
   5048
   5049	spin_lock(&inode->i_lock);
   5050	if (inode_is_dirtytime_only(inode)) {
   5051		struct ext4_inode_info	*ei = EXT4_I(inode);
   5052
   5053		inode->i_state &= ~I_DIRTY_TIME;
   5054		spin_unlock(&inode->i_lock);
   5055
   5056		spin_lock(&ei->i_raw_lock);
   5057		EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
   5058		EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
   5059		EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
   5060		ext4_inode_csum_set(inode, raw_inode, ei);
   5061		spin_unlock(&ei->i_raw_lock);
   5062		trace_ext4_other_inode_update_time(inode, orig_ino);
   5063		return;
   5064	}
   5065	spin_unlock(&inode->i_lock);
   5066}
   5067
   5068/*
   5069 * Opportunistically update the other time fields for other inodes in
   5070 * the same inode table block.
   5071 */
   5072static void ext4_update_other_inodes_time(struct super_block *sb,
   5073					  unsigned long orig_ino, char *buf)
   5074{
   5075	unsigned long ino;
   5076	int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
   5077	int inode_size = EXT4_INODE_SIZE(sb);
   5078
   5079	/*
   5080	 * Calculate the first inode in the inode table block.  Inode
   5081	 * numbers are one-based.  That is, the first inode in a block
   5082	 * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
   5083	 */
   5084	ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
   5085	rcu_read_lock();
   5086	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
   5087		if (ino == orig_ino)
   5088			continue;
   5089		__ext4_update_other_inode_time(sb, orig_ino, ino,
   5090					       (struct ext4_inode *)buf);
   5091	}
   5092	rcu_read_unlock();
   5093}
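
/*
 * A worked example of the first-inode calculation above, assuming 4k blocks
 * and 256-byte inodes (inodes_per_block == 16, a power of two): for
 * orig_ino == 35, ((35 - 1) & ~15) + 1 == 33, so the loop walks inodes
 * 33..48 of that inode table block and skips 35 itself.
 */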
   5094
   5095/*
   5096 * Post the struct inode info into an on-disk inode location in the
   5097 * buffer-cache.  This gobbles the caller's reference to the
   5098 * buffer_head in the inode location struct.
   5099 *
   5100 * The caller must have write access to iloc->bh.
   5101 */
   5102static int ext4_do_update_inode(handle_t *handle,
   5103				struct inode *inode,
   5104				struct ext4_iloc *iloc)
   5105{
   5106	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
   5107	struct ext4_inode_info *ei = EXT4_I(inode);
   5108	struct buffer_head *bh = iloc->bh;
   5109	struct super_block *sb = inode->i_sb;
   5110	int err;
   5111	int need_datasync = 0, set_large_file = 0;
   5112
   5113	spin_lock(&ei->i_raw_lock);
   5114
   5115	/*
   5116	 * For fields not tracked in the in-memory inode, initialise them
   5117	 * to zero for new inodes.
   5118	 */
   5119	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
   5120		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
   5121
   5122	if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode))
   5123		need_datasync = 1;
   5124	if (ei->i_disksize > 0x7fffffffULL) {
   5125		if (!ext4_has_feature_large_file(sb) ||
   5126		    EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV))
   5127			set_large_file = 1;
   5128	}
   5129
   5130	err = ext4_fill_raw_inode(inode, raw_inode);
   5131	spin_unlock(&ei->i_raw_lock);
   5132	if (err) {
   5133		EXT4_ERROR_INODE(inode, "corrupted inode contents");
   5134		goto out_brelse;
   5135	}
   5136
   5137	if (inode->i_sb->s_flags & SB_LAZYTIME)
   5138		ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
   5139					      bh->b_data);
   5140
   5141	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
   5142	err = ext4_handle_dirty_metadata(handle, NULL, bh);
   5143	if (err)
   5144		goto out_error;
   5145	ext4_clear_inode_state(inode, EXT4_STATE_NEW);
   5146	if (set_large_file) {
   5147		BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
   5148		err = ext4_journal_get_write_access(handle, sb,
   5149						    EXT4_SB(sb)->s_sbh,
   5150						    EXT4_JTR_NONE);
   5151		if (err)
   5152			goto out_error;
   5153		lock_buffer(EXT4_SB(sb)->s_sbh);
   5154		ext4_set_feature_large_file(sb);
   5155		ext4_superblock_csum_set(sb);
   5156		unlock_buffer(EXT4_SB(sb)->s_sbh);
   5157		ext4_handle_sync(handle);
   5158		err = ext4_handle_dirty_metadata(handle, NULL,
   5159						 EXT4_SB(sb)->s_sbh);
   5160	}
   5161	ext4_update_inode_fsync_trans(handle, inode, need_datasync);
   5162out_error:
   5163	ext4_std_error(inode->i_sb, err);
   5164out_brelse:
   5165	brelse(bh);
   5166	return err;
   5167}
   5168
   5169/*
   5170 * ext4_write_inode()
   5171 *
   5172 * We are called from a few places:
   5173 *
   5174 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
   5175 *   Here, there will be no transaction running. We wait for any running
   5176 *   transaction to commit.
   5177 *
   5178 * - Within flush work (sys_sync(), kupdate and such).
   5179 *   We wait on commit, if told to.
   5180 *
   5181 * - Within iput_final() -> write_inode_now()
   5182 *   We wait on commit, if told to.
   5183 *
   5184 * In all cases it is actually safe for us to return without doing anything,
   5185 * because the inode has been copied into a raw inode buffer in
   5186 * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
   5187 * writeback.
   5188 *
   5189 * Note that we are absolutely dependent upon all inode dirtiers doing the
   5190 * right thing: they *must* call mark_inode_dirty() after dirtying info in
   5191 * which we are interested.
   5192 *
   5193 * It would be a bug for them to not do this.  The code:
   5194 *
   5195 *	mark_inode_dirty(inode)
   5196 *	stuff();
   5197 *	inode->i_size = expr;
   5198 *
   5199 * is in error because write_inode() could occur while `stuff()' is running,
   5200 * and the new i_size will be lost.  Plus the inode will no longer be on the
   5201 * superblock's dirty inode list.
   5202 */
   5203int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
   5204{
   5205	int err;
   5206
   5207	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
   5208	    sb_rdonly(inode->i_sb))
   5209		return 0;
   5210
   5211	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
   5212		return -EIO;
   5213
   5214	if (EXT4_SB(inode->i_sb)->s_journal) {
   5215		if (ext4_journal_current_handle()) {
   5216			jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
   5217			dump_stack();
   5218			return -EIO;
   5219		}
   5220
   5221		/*
   5222		 * No need to force transaction in WB_SYNC_NONE mode. Also
   5223		 * ext4_sync_fs() will force the commit after everything is
   5224		 * written.
   5225		 */
   5226		if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
   5227			return 0;
   5228
   5229		err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
   5230						EXT4_I(inode)->i_sync_tid);
   5231	} else {
   5232		struct ext4_iloc iloc;
   5233
   5234		err = __ext4_get_inode_loc_noinmem(inode, &iloc);
   5235		if (err)
   5236			return err;
   5237		/*
   5238		 * sync(2) will flush the whole buffer cache. No need to do
   5239		 * it here separately for each inode.
   5240		 */
   5241		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
   5242			sync_dirty_buffer(iloc.bh);
   5243		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
   5244			ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
   5245					       "IO error syncing inode");
   5246			err = -EIO;
   5247		}
   5248		brelse(iloc.bh);
   5249	}
   5250	return err;
   5251}
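
/*
 * For contrast with the buggy ordering quoted in the comment above
 * ext4_write_inode(): a minimal sketch of the required pattern, where the
 * inode fields are updated first and mark_inode_dirty() is called last, so
 * a racing write_inode() cannot snapshot (and then lose) the new i_size.
 */
#if 0
	stuff();
	inode->i_size = expr;
	mark_inode_dirty(inode);
#endif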
   5252
   5253/*
   5254 * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
   5255 * buffers that are attached to a folio straddling i_size and are undergoing
   5256 * commit. In that case we have to wait for commit to finish and try again.
   5257 */
   5258static void ext4_wait_for_tail_page_commit(struct inode *inode)
   5259{
   5260	unsigned offset;
   5261	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
   5262	tid_t commit_tid = 0;
   5263	int ret;
   5264
   5265	offset = inode->i_size & (PAGE_SIZE - 1);
   5266	/*
   5267	 * If the folio is fully truncated, we don't need to wait for any commit
   5268	 * (and we even should not as __ext4_journalled_invalidate_folio() may
   5269	 * strip all buffers from the folio but keep the folio dirty which can then
   5270	 * confuse e.g. concurrent ext4_writepage() seeing dirty folio without
   5271	 * buffers). Also we don't need to wait for any commit if all buffers in
   5272	 * the folio remain valid. This is most beneficial for the common case of
   5273	 * blocksize == PAGESIZE.
   5274	 */
   5275	if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
   5276		return;
   5277	while (1) {
   5278		struct folio *folio = filemap_lock_folio(inode->i_mapping,
   5279				      inode->i_size >> PAGE_SHIFT);
   5280		if (!folio)
   5281			return;
   5282		ret = __ext4_journalled_invalidate_folio(folio, offset,
   5283						folio_size(folio) - offset);
   5284		folio_unlock(folio);
   5285		folio_put(folio);
   5286		if (ret != -EBUSY)
   5287			return;
   5288		commit_tid = 0;
   5289		read_lock(&journal->j_state_lock);
   5290		if (journal->j_committing_transaction)
   5291			commit_tid = journal->j_committing_transaction->t_tid;
   5292		read_unlock(&journal->j_state_lock);
   5293		if (commit_tid)
   5294			jbd2_log_wait_commit(journal, commit_tid);
   5295	}
   5296}
   5297
   5298/*
   5299 * ext4_setattr()
   5300 *
   5301 * Called from notify_change.
   5302 *
   5303 * We want to trap VFS attempts to truncate the file as soon as
   5304 * possible.  In particular, we want to make sure that when the VFS
   5305 * shrinks i_size, we put the inode on the orphan list and modify
   5306 * i_disksize immediately, so that during the subsequent flushing of
   5307 * dirty pages and freeing of disk blocks, we can guarantee that any
   5308 * commit will leave the blocks being flushed in an unused state on
   5309 * disk.  (On recovery, the inode will get truncated and the blocks will
   5310 * be freed, so we have a strong guarantee that no future commit will
   5311 * leave these blocks visible to the user.)
   5312 *
    5313 * Another thing we have to ensure is that if we are in ordered mode
    5314 * and the inode is still attached to the committing transaction, we must
    5315 * start writeout of all the dirty pages which are being truncated.
   5316 * This way we are sure that all the data written in the previous
   5317 * transaction are already on disk (truncate waits for pages under
   5318 * writeback).
   5319 *
   5320 * Called with inode->i_rwsem down.
   5321 */
   5322int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
   5323		 struct iattr *attr)
   5324{
   5325	struct inode *inode = d_inode(dentry);
   5326	int error, rc = 0;
   5327	int orphan = 0;
   5328	const unsigned int ia_valid = attr->ia_valid;
   5329
   5330	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
   5331		return -EIO;
   5332
   5333	if (unlikely(IS_IMMUTABLE(inode)))
   5334		return -EPERM;
   5335
   5336	if (unlikely(IS_APPEND(inode) &&
   5337		     (ia_valid & (ATTR_MODE | ATTR_UID |
   5338				  ATTR_GID | ATTR_TIMES_SET))))
   5339		return -EPERM;
   5340
   5341	error = setattr_prepare(mnt_userns, dentry, attr);
   5342	if (error)
   5343		return error;
   5344
   5345	error = fscrypt_prepare_setattr(dentry, attr);
   5346	if (error)
   5347		return error;
   5348
   5349	error = fsverity_prepare_setattr(dentry, attr);
   5350	if (error)
   5351		return error;
   5352
   5353	if (is_quota_modification(inode, attr)) {
   5354		error = dquot_initialize(inode);
   5355		if (error)
   5356			return error;
   5357	}
   5358
   5359	if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
   5360	    (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
   5361		handle_t *handle;
   5362
   5363		/* (user+group)*(old+new) structure, inode write (sb,
   5364		 * inode block, ? - but truncate inode update has it) */
   5365		handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
   5366			(EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
   5367			 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
   5368		if (IS_ERR(handle)) {
   5369			error = PTR_ERR(handle);
   5370			goto err_out;
   5371		}
   5372
   5373		/* dquot_transfer() calls back ext4_get_inode_usage() which
   5374		 * counts xattr inode references.
   5375		 */
   5376		down_read(&EXT4_I(inode)->xattr_sem);
   5377		error = dquot_transfer(inode, attr);
   5378		up_read(&EXT4_I(inode)->xattr_sem);
   5379
   5380		if (error) {
   5381			ext4_journal_stop(handle);
   5382			return error;
   5383		}
   5384		/* Update corresponding info in inode so that everything is in
   5385		 * one transaction */
   5386		if (attr->ia_valid & ATTR_UID)
   5387			inode->i_uid = attr->ia_uid;
   5388		if (attr->ia_valid & ATTR_GID)
   5389			inode->i_gid = attr->ia_gid;
   5390		error = ext4_mark_inode_dirty(handle, inode);
   5391		ext4_journal_stop(handle);
   5392		if (unlikely(error)) {
   5393			return error;
   5394		}
   5395	}
   5396
   5397	if (attr->ia_valid & ATTR_SIZE) {
   5398		handle_t *handle;
   5399		loff_t oldsize = inode->i_size;
   5400		loff_t old_disksize;
   5401		int shrink = (attr->ia_size < inode->i_size);
   5402
   5403		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
   5404			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   5405
   5406			if (attr->ia_size > sbi->s_bitmap_maxbytes) {
   5407				return -EFBIG;
   5408			}
   5409		}
   5410		if (!S_ISREG(inode->i_mode)) {
   5411			return -EINVAL;
   5412		}
   5413
   5414		if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
   5415			inode_inc_iversion(inode);
   5416
   5417		if (shrink) {
   5418			if (ext4_should_order_data(inode)) {
   5419				error = ext4_begin_ordered_truncate(inode,
   5420							    attr->ia_size);
   5421				if (error)
   5422					goto err_out;
   5423			}
   5424			/*
   5425			 * Blocks are going to be removed from the inode. Wait
   5426			 * for dio in flight.
   5427			 */
   5428			inode_dio_wait(inode);
   5429		}
   5430
   5431		filemap_invalidate_lock(inode->i_mapping);
   5432
   5433		rc = ext4_break_layouts(inode);
   5434		if (rc) {
   5435			filemap_invalidate_unlock(inode->i_mapping);
   5436			goto err_out;
   5437		}
   5438
   5439		if (attr->ia_size != inode->i_size) {
   5440			handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
   5441			if (IS_ERR(handle)) {
   5442				error = PTR_ERR(handle);
   5443				goto out_mmap_sem;
   5444			}
   5445			if (ext4_handle_valid(handle) && shrink) {
   5446				error = ext4_orphan_add(handle, inode);
   5447				orphan = 1;
   5448			}
   5449			/*
   5450			 * Update c/mtime on truncate up, ext4_truncate() will
   5451			 * update c/mtime in shrink case below
   5452			 */
   5453			if (!shrink) {
   5454				inode->i_mtime = current_time(inode);
   5455				inode->i_ctime = inode->i_mtime;
   5456			}
   5457
   5458			if (shrink)
   5459				ext4_fc_track_range(handle, inode,
   5460					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
   5461					inode->i_sb->s_blocksize_bits,
   5462					EXT_MAX_BLOCKS - 1);
   5463			else
   5464				ext4_fc_track_range(
   5465					handle, inode,
   5466					(oldsize > 0 ? oldsize - 1 : oldsize) >>
   5467					inode->i_sb->s_blocksize_bits,
   5468					(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
   5469					inode->i_sb->s_blocksize_bits);
   5470
   5471			down_write(&EXT4_I(inode)->i_data_sem);
   5472			old_disksize = EXT4_I(inode)->i_disksize;
   5473			EXT4_I(inode)->i_disksize = attr->ia_size;
   5474			rc = ext4_mark_inode_dirty(handle, inode);
   5475			if (!error)
   5476				error = rc;
   5477			/*
   5478			 * We have to update i_size under i_data_sem together
   5479			 * with i_disksize to avoid races with writeback code
   5480			 * running ext4_wb_update_i_disksize().
   5481			 */
   5482			if (!error)
   5483				i_size_write(inode, attr->ia_size);
   5484			else
   5485				EXT4_I(inode)->i_disksize = old_disksize;
   5486			up_write(&EXT4_I(inode)->i_data_sem);
   5487			ext4_journal_stop(handle);
   5488			if (error)
   5489				goto out_mmap_sem;
   5490			if (!shrink) {
   5491				pagecache_isize_extended(inode, oldsize,
   5492							 inode->i_size);
   5493			} else if (ext4_should_journal_data(inode)) {
   5494				ext4_wait_for_tail_page_commit(inode);
   5495			}
   5496		}
   5497
   5498		/*
   5499		 * Truncate pagecache after we've waited for commit
   5500		 * in data=journal mode to make pages freeable.
   5501		 */
   5502		truncate_pagecache(inode, inode->i_size);
   5503		/*
   5504		 * Call ext4_truncate() even if i_size didn't change to
   5505		 * truncate possible preallocated blocks.
   5506		 */
   5507		if (attr->ia_size <= oldsize) {
   5508			rc = ext4_truncate(inode);
   5509			if (rc)
   5510				error = rc;
   5511		}
   5512out_mmap_sem:
   5513		filemap_invalidate_unlock(inode->i_mapping);
   5514	}
   5515
   5516	if (!error) {
   5517		setattr_copy(mnt_userns, inode, attr);
   5518		mark_inode_dirty(inode);
   5519	}
   5520
   5521	/*
   5522	 * If the call to ext4_truncate failed to get a transaction handle at
   5523	 * all, we need to clean up the in-core orphan list manually.
   5524	 */
   5525	if (orphan && inode->i_nlink)
   5526		ext4_orphan_del(NULL, inode);
   5527
   5528	if (!error && (ia_valid & ATTR_MODE))
   5529		rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
   5530
   5531err_out:
    5532	if (error)
   5533		ext4_std_error(inode->i_sb, error);
   5534	if (!error)
   5535		error = rc;
   5536	return error;
   5537}
   5538
   5539int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
   5540		 struct kstat *stat, u32 request_mask, unsigned int query_flags)
   5541{
   5542	struct inode *inode = d_inode(path->dentry);
   5543	struct ext4_inode *raw_inode;
   5544	struct ext4_inode_info *ei = EXT4_I(inode);
   5545	unsigned int flags;
   5546
   5547	if ((request_mask & STATX_BTIME) &&
   5548	    EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
   5549		stat->result_mask |= STATX_BTIME;
   5550		stat->btime.tv_sec = ei->i_crtime.tv_sec;
   5551		stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
   5552	}
   5553
   5554	flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
   5555	if (flags & EXT4_APPEND_FL)
   5556		stat->attributes |= STATX_ATTR_APPEND;
   5557	if (flags & EXT4_COMPR_FL)
   5558		stat->attributes |= STATX_ATTR_COMPRESSED;
   5559	if (flags & EXT4_ENCRYPT_FL)
   5560		stat->attributes |= STATX_ATTR_ENCRYPTED;
   5561	if (flags & EXT4_IMMUTABLE_FL)
   5562		stat->attributes |= STATX_ATTR_IMMUTABLE;
   5563	if (flags & EXT4_NODUMP_FL)
   5564		stat->attributes |= STATX_ATTR_NODUMP;
   5565	if (flags & EXT4_VERITY_FL)
   5566		stat->attributes |= STATX_ATTR_VERITY;
   5567
   5568	stat->attributes_mask |= (STATX_ATTR_APPEND |
   5569				  STATX_ATTR_COMPRESSED |
   5570				  STATX_ATTR_ENCRYPTED |
   5571				  STATX_ATTR_IMMUTABLE |
   5572				  STATX_ATTR_NODUMP |
   5573				  STATX_ATTR_VERITY);
   5574
   5575	generic_fillattr(mnt_userns, inode, stat);
   5576	return 0;
   5577}
   5578
   5579int ext4_file_getattr(struct user_namespace *mnt_userns,
   5580		      const struct path *path, struct kstat *stat,
   5581		      u32 request_mask, unsigned int query_flags)
   5582{
   5583	struct inode *inode = d_inode(path->dentry);
   5584	u64 delalloc_blocks;
   5585
   5586	ext4_getattr(mnt_userns, path, stat, request_mask, query_flags);
   5587
   5588	/*
   5589	 * If there is inline data in the inode, the inode will normally not
   5590	 * have data blocks allocated (it may have an external xattr block).
    5591 * Report at least one sector for such files, so tools like tar, rsync and
    5592 * others don't incorrectly think the file is completely sparse.
   5593	 */
   5594	if (unlikely(ext4_has_inline_data(inode)))
   5595		stat->blocks += (stat->size + 511) >> 9;
   5596
   5597	/*
    5598	 * We can't update i_blocks if the block allocation is delayed;
    5599	 * otherwise, in the case of a system crash before the real block
    5600	 * allocation is done, we would have i_blocks inconsistent with the
    5601	 * on-disk file blocks.
    5602	 * We always keep i_blocks updated together with the real
    5603	 * allocation. But to avoid confusing userspace, stat
    5604	 * will return a block count that includes the delayed allocation
   5605	 * blocks for this file.
   5606	 */
   5607	delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
   5608				   EXT4_I(inode)->i_reserved_data_blocks);
   5609	stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
   5610	return 0;
   5611}
   5612
   5613static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
   5614				   int pextents)
   5615{
   5616	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
   5617		return ext4_ind_trans_blocks(inode, lblocks);
   5618	return ext4_ext_index_trans_blocks(inode, pextents);
   5619}
   5620
   5621/*
    5622 * Account for index blocks, block group bitmaps and block group
    5623 * descriptor blocks when modifying data blocks and index blocks.
    5624 * In the worst case, the index blocks spread over different block groups.
    5625 *
    5626 * If data blocks are discontiguous, they may spread over different
    5627 * block groups too. Even if they are contiguous, with flexbg they
    5628 * could still cross a block group boundary.
    5629 *
    5630 * Also account for superblock, inode, quota and xattr blocks.
   5631 */
   5632static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
   5633				  int pextents)
   5634{
   5635	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
   5636	int gdpblocks;
   5637	int idxblocks;
   5638	int ret = 0;
   5639
   5640	/*
    5641	 * How many index blocks do we need to touch to map @lblocks logical blocks
   5642	 * to @pextents physical extents?
   5643	 */
   5644	idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
   5645
   5646	ret = idxblocks;
   5647
   5648	/*
    5649	 * Now let's see how many group bitmaps and group descriptor blocks
    5650	 * need to be accounted for
   5651	 */
   5652	groups = idxblocks + pextents;
   5653	gdpblocks = groups;
   5654	if (groups > ngroups)
   5655		groups = ngroups;
   5656	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
   5657		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
   5658
   5659	/* bitmaps and block group descriptor blocks */
   5660	ret += groups + gdpblocks;
   5661
   5662	/* Blocks for super block, inode, quota and xattr blocks */
   5663	ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
   5664
   5665	return ret;
   5666}
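
/*
 * A worked example of the accounting above, with purely illustrative
 * numbers: if idxblocks == 3 and pextents == 2 on a filesystem with at
 * least 5 block groups and at least 5 group descriptor blocks, then
 * groups == gdpblocks == 5 and the result is
 * 3 + 5 + 5 + EXT4_META_TRANS_BLOCKS(sb) credits.
 */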
   5667
   5668/*
   5669 * Calculate the total number of credits to reserve to fit
    5670 * the modification of a single page into a single transaction,
   5671 * which may include multiple chunks of block allocations.
   5672 *
   5673 * This could be called via ext4_write_begin()
   5674 *
    5675 * We need to consider the worst case, when
    5676 * one new block is allocated per extent.
   5677 */
   5678int ext4_writepage_trans_blocks(struct inode *inode)
   5679{
   5680	int bpp = ext4_journal_blocks_per_page(inode);
   5681	int ret;
   5682
   5683	ret = ext4_meta_trans_blocks(inode, bpp, bpp);
   5684
   5685	/* Account for data blocks for journalled mode */
   5686	if (ext4_should_journal_data(inode))
   5687		ret += bpp;
   5688	return ret;
   5689}
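
/*
 * A worked example, assuming ext4_journal_blocks_per_page() returns 4
 * (e.g. 4K pages with a 1K block size): the reservation is
 * ext4_meta_trans_blocks(inode, 4, 4), plus 4 more data-block credits
 * when the inode uses data journalling.
 */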
   5690
   5691/*
   5692 * Calculate the journal credits for a chunk of data modification.
   5693 *
    5694 * This is called from DIO, fallocate or whoever else calls
    5695 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
    5696 *
    5697 * Journal buffers for data blocks are not included here, as DIO
    5698 * and fallocate do not need to journal data buffers.
   5699 */
   5700int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
   5701{
   5702	return ext4_meta_trans_blocks(inode, nrblocks, 1);
   5703}
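
/*
 * A minimal sketch of how a mapping path might size its handle with
 * ext4_chunk_trans_blocks(); the EXT4_HT_MAP_BLOCKS handle type is used
 * here purely for illustration.
 */
#if 0
	handle_t *handle;
	int credits = ext4_chunk_trans_blocks(inode, len);

	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* ... ext4_map_blocks() with EXT4_GET_BLOCKS_CREATE ... */
	ext4_journal_stop(handle);
#endif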
   5704
   5705/*
   5706 * The caller must have previously called ext4_reserve_inode_write().
    5707 * Given this, we know that the caller already has write access to iloc->bh.
   5708 */
   5709int ext4_mark_iloc_dirty(handle_t *handle,
   5710			 struct inode *inode, struct ext4_iloc *iloc)
   5711{
   5712	int err = 0;
   5713
   5714	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
   5715		put_bh(iloc->bh);
   5716		return -EIO;
   5717	}
   5718	ext4_fc_track_inode(handle, inode);
   5719
   5720	if (IS_I_VERSION(inode))
   5721		inode_inc_iversion(inode);
   5722
   5723	/* the do_update_inode consumes one bh->b_count */
   5724	get_bh(iloc->bh);
   5725
   5726	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
   5727	err = ext4_do_update_inode(handle, inode, iloc);
   5728	put_bh(iloc->bh);
   5729	return err;
   5730}
   5731
   5732/*
    5733 * On success, we end up with an outstanding reference count against
   5734 * iloc->bh.  This _must_ be cleaned up later.
   5735 */
   5736
   5737int
   5738ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
   5739			 struct ext4_iloc *iloc)
   5740{
   5741	int err;
   5742
   5743	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
   5744		return -EIO;
   5745
   5746	err = ext4_get_inode_loc(inode, iloc);
   5747	if (!err) {
   5748		BUFFER_TRACE(iloc->bh, "get_write_access");
   5749		err = ext4_journal_get_write_access(handle, inode->i_sb,
   5750						    iloc->bh, EXT4_JTR_NONE);
   5751		if (err) {
   5752			brelse(iloc->bh);
   5753			iloc->bh = NULL;
   5754		}
   5755	}
   5756	ext4_std_error(inode->i_sb, err);
   5757	return err;
   5758}
   5759
   5760static int __ext4_expand_extra_isize(struct inode *inode,
   5761				     unsigned int new_extra_isize,
   5762				     struct ext4_iloc *iloc,
   5763				     handle_t *handle, int *no_expand)
   5764{
   5765	struct ext4_inode *raw_inode;
   5766	struct ext4_xattr_ibody_header *header;
   5767	unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
   5768	struct ext4_inode_info *ei = EXT4_I(inode);
   5769	int error;
   5770
   5771	/* this was checked at iget time, but double check for good measure */
   5772	if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
   5773	    (ei->i_extra_isize & 3)) {
   5774		EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
   5775				 ei->i_extra_isize,
   5776				 EXT4_INODE_SIZE(inode->i_sb));
   5777		return -EFSCORRUPTED;
   5778	}
   5779	if ((new_extra_isize < ei->i_extra_isize) ||
   5780	    (new_extra_isize < 4) ||
   5781	    (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
   5782		return -EINVAL;	/* Should never happen */
   5783
   5784	raw_inode = ext4_raw_inode(iloc);
   5785
   5786	header = IHDR(inode, raw_inode);
   5787
   5788	/* No extended attributes present */
   5789	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
   5790	    header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
   5791		memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
   5792		       EXT4_I(inode)->i_extra_isize, 0,
   5793		       new_extra_isize - EXT4_I(inode)->i_extra_isize);
   5794		EXT4_I(inode)->i_extra_isize = new_extra_isize;
   5795		return 0;
   5796	}
   5797
   5798	/* try to expand with EAs present */
   5799	error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
   5800					   raw_inode, handle);
   5801	if (error) {
   5802		/*
   5803		 * Inode size expansion failed; don't try again
   5804		 */
   5805		*no_expand = 1;
   5806	}
   5807
   5808	return error;
   5809}
   5810
   5811/*
   5812 * Expand an inode by new_extra_isize bytes.
   5813 * Returns 0 on success or negative error number on failure.
   5814 */
   5815static int ext4_try_to_expand_extra_isize(struct inode *inode,
   5816					  unsigned int new_extra_isize,
   5817					  struct ext4_iloc iloc,
   5818					  handle_t *handle)
   5819{
   5820	int no_expand;
   5821	int error;
   5822
   5823	if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND))
   5824		return -EOVERFLOW;
   5825
   5826	/*
   5827	 * In nojournal mode, we can immediately attempt to expand
   5828	 * the inode.  When journaled, we first need to obtain extra
   5829	 * buffer credits since we may write into the EA block
   5830	 * with this same handle. If journal_extend fails, then it will
   5831	 * only result in a minor loss of functionality for that inode.
   5832	 * If this is felt to be critical, then e2fsck should be run to
   5833	 * force a large enough s_min_extra_isize.
   5834	 */
   5835	if (ext4_journal_extend(handle,
   5836				EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
   5837		return -ENOSPC;
   5838
   5839	if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
   5840		return -EBUSY;
   5841
   5842	error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc,
   5843					  handle, &no_expand);
   5844	ext4_write_unlock_xattr(inode, &no_expand);
   5845
   5846	return error;
   5847}
   5848
   5849int ext4_expand_extra_isize(struct inode *inode,
   5850			    unsigned int new_extra_isize,
   5851			    struct ext4_iloc *iloc)
   5852{
   5853	handle_t *handle;
   5854	int no_expand;
   5855	int error, rc;
   5856
   5857	if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
   5858		brelse(iloc->bh);
   5859		return -EOVERFLOW;
   5860	}
   5861
   5862	handle = ext4_journal_start(inode, EXT4_HT_INODE,
   5863				    EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
   5864	if (IS_ERR(handle)) {
   5865		error = PTR_ERR(handle);
   5866		brelse(iloc->bh);
   5867		return error;
   5868	}
   5869
   5870	ext4_write_lock_xattr(inode, &no_expand);
   5871
   5872	BUFFER_TRACE(iloc->bh, "get_write_access");
   5873	error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh,
   5874					      EXT4_JTR_NONE);
   5875	if (error) {
   5876		brelse(iloc->bh);
   5877		goto out_unlock;
   5878	}
   5879
   5880	error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc,
   5881					  handle, &no_expand);
   5882
   5883	rc = ext4_mark_iloc_dirty(handle, inode, iloc);
   5884	if (!error)
   5885		error = rc;
   5886
   5887out_unlock:
   5888	ext4_write_unlock_xattr(inode, &no_expand);
   5889	ext4_journal_stop(handle);
   5890	return error;
   5891}
   5892
   5893/*
   5894 * What we do here is to mark the in-core inode as clean with respect to inode
   5895 * dirtiness (it may still be data-dirty).
   5896 * This means that the in-core inode may be reaped by prune_icache
   5897 * without having to perform any I/O.  This is a very good thing,
   5898 * because *any* task may call prune_icache - even ones which
   5899 * have a transaction open against a different journal.
   5900 *
   5901 * Is this cheating?  Not really.  Sure, we haven't written the
   5902 * inode out, but prune_icache isn't a user-visible syncing function.
   5903 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
   5904 * we start and wait on commits.
   5905 */
   5906int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
   5907				const char *func, unsigned int line)
   5908{
   5909	struct ext4_iloc iloc;
   5910	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   5911	int err;
   5912
   5913	might_sleep();
   5914	trace_ext4_mark_inode_dirty(inode, _RET_IP_);
   5915	err = ext4_reserve_inode_write(handle, inode, &iloc);
   5916	if (err)
   5917		goto out;
   5918
   5919	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
   5920		ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize,
   5921					       iloc, handle);
   5922
   5923	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
   5924out:
   5925	if (unlikely(err))
   5926		ext4_error_inode_err(inode, func, line, 0, err,
   5927					"mark_inode_dirty error");
   5928	return err;
   5929}
   5930
   5931/*
   5932 * ext4_dirty_inode() is called from __mark_inode_dirty()
   5933 *
   5934 * We're really interested in the case where a file is being extended.
   5935 * i_size has been changed by generic_commit_write() and we thus need
   5936 * to include the updated inode in the current transaction.
   5937 *
   5938 * Also, dquot_alloc_block() will always dirty the inode when blocks
   5939 * are allocated to the file.
   5940 *
   5941 * If the inode is marked synchronous, we don't honour that here - doing
   5942 * so would cause a commit on atime updates, which we don't bother doing.
   5943 * We handle synchronous inodes at the highest possible level.
   5944 */
   5945void ext4_dirty_inode(struct inode *inode, int flags)
   5946{
   5947	handle_t *handle;
   5948
   5949	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
   5950	if (IS_ERR(handle))
   5951		return;
   5952	ext4_mark_inode_dirty(handle, inode);
   5953	ext4_journal_stop(handle);
   5954}
   5955
   5956int ext4_change_inode_journal_flag(struct inode *inode, int val)
   5957{
   5958	journal_t *journal;
   5959	handle_t *handle;
   5960	int err;
   5961	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   5962
   5963	/*
   5964	 * We have to be very careful here: changing a data block's
   5965	 * journaling status dynamically is dangerous.  If we write a
   5966	 * data block to the journal, change the status and then delete
   5967	 * that block, we risk forgetting to revoke the old log record
   5968	 * from the journal and so a subsequent replay can corrupt data.
   5969	 * So, first we make sure that the journal is empty and that
   5970	 * nobody is changing anything.
   5971	 */
   5972
   5973	journal = EXT4_JOURNAL(inode);
   5974	if (!journal)
   5975		return 0;
   5976	if (is_journal_aborted(journal))
   5977		return -EROFS;
   5978
   5979	/* Wait for all existing dio workers */
   5980	inode_dio_wait(inode);
   5981
   5982	/*
   5983	 * Before flushing the journal and switching inode's aops, we have
   5984	 * to flush all dirty data the inode has. There can be outstanding
   5985	 * delayed allocations, there can be unwritten extents created by
   5986	 * fallocate or buffered writes in dioread_nolock mode covered by
   5987	 * dirty data which can be converted only after flushing the dirty
   5988	 * data (and journalled aops don't know how to handle these cases).
   5989	 */
   5990	if (val) {
   5991		filemap_invalidate_lock(inode->i_mapping);
   5992		err = filemap_write_and_wait(inode->i_mapping);
   5993		if (err < 0) {
   5994			filemap_invalidate_unlock(inode->i_mapping);
   5995			return err;
   5996		}
   5997	}
   5998
   5999	percpu_down_write(&sbi->s_writepages_rwsem);
   6000	jbd2_journal_lock_updates(journal);
   6001
   6002	/*
   6003	 * OK, there are no updates running now, and all cached data is
   6004	 * synced to disk.  We are now in a completely consistent state
   6005	 * which doesn't have anything in the journal, and we know that
   6006	 * no filesystem updates are running, so it is safe to modify
   6007	 * the inode's in-core data-journaling state flag now.
   6008	 */
   6009
   6010	if (val)
   6011		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
   6012	else {
   6013		err = jbd2_journal_flush(journal, 0);
   6014		if (err < 0) {
   6015			jbd2_journal_unlock_updates(journal);
   6016			percpu_up_write(&sbi->s_writepages_rwsem);
   6017			return err;
   6018		}
   6019		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
   6020	}
   6021	ext4_set_aops(inode);
   6022
   6023	jbd2_journal_unlock_updates(journal);
   6024	percpu_up_write(&sbi->s_writepages_rwsem);
   6025
   6026	if (val)
   6027		filemap_invalidate_unlock(inode->i_mapping);
   6028
   6029	/* Finally we can mark the inode as dirty. */
   6030
   6031	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
   6032	if (IS_ERR(handle))
   6033		return PTR_ERR(handle);
   6034
   6035	ext4_fc_mark_ineligible(inode->i_sb,
   6036		EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
   6037	err = ext4_mark_inode_dirty(handle, inode);
   6038	ext4_handle_sync(handle);
   6039	ext4_journal_stop(handle);
   6040	ext4_std_error(inode->i_sb, err);
   6041
   6042	return err;
   6043}
   6044
   6045static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
   6046			    struct buffer_head *bh)
   6047{
   6048	return !buffer_mapped(bh);
   6049}
   6050
   6051vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
   6052{
   6053	struct vm_area_struct *vma = vmf->vma;
   6054	struct page *page = vmf->page;
   6055	loff_t size;
   6056	unsigned long len;
   6057	int err;
   6058	vm_fault_t ret;
   6059	struct file *file = vma->vm_file;
   6060	struct inode *inode = file_inode(file);
   6061	struct address_space *mapping = inode->i_mapping;
   6062	handle_t *handle;
   6063	get_block_t *get_block;
   6064	int retries = 0;
   6065
   6066	if (unlikely(IS_IMMUTABLE(inode)))
   6067		return VM_FAULT_SIGBUS;
   6068
   6069	sb_start_pagefault(inode->i_sb);
   6070	file_update_time(vma->vm_file);
   6071
   6072	filemap_invalidate_lock_shared(mapping);
   6073
   6074	err = ext4_convert_inline_data(inode);
   6075	if (err)
   6076		goto out_ret;
   6077
   6078	/*
   6079	 * On data journalling we skip straight to the transaction handle:
    6080	 * there's no delalloc; page truncation will be checked later; the
   6081	 * early return w/ all buffers mapped (calculates size/len) can't
   6082	 * be used; and there's no dioread_nolock, so only ext4_get_block.
   6083	 */
   6084	if (ext4_should_journal_data(inode))
   6085		goto retry_alloc;
   6086
   6087	/* Delalloc case is easy... */
   6088	if (test_opt(inode->i_sb, DELALLOC) &&
   6089	    !ext4_nonda_switch(inode->i_sb)) {
   6090		do {
   6091			err = block_page_mkwrite(vma, vmf,
   6092						   ext4_da_get_block_prep);
   6093		} while (err == -ENOSPC &&
   6094		       ext4_should_retry_alloc(inode->i_sb, &retries));
   6095		goto out_ret;
   6096	}
   6097
   6098	lock_page(page);
   6099	size = i_size_read(inode);
   6100	/* Page got truncated from under us? */
   6101	if (page->mapping != mapping || page_offset(page) > size) {
   6102		unlock_page(page);
   6103		ret = VM_FAULT_NOPAGE;
   6104		goto out;
   6105	}
   6106
   6107	if (page->index == size >> PAGE_SHIFT)
   6108		len = size & ~PAGE_MASK;
   6109	else
   6110		len = PAGE_SIZE;
   6111	/*
   6112	 * Return if we have all the buffers mapped. This avoids the need to do
   6113	 * journal_start/journal_stop which can block and take a long time
   6114	 *
   6115	 * This cannot be done for data journalling, as we have to add the
   6116	 * inode to the transaction's list to writeprotect pages on commit.
   6117	 */
   6118	if (page_has_buffers(page)) {
   6119		if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page),
   6120					    0, len, NULL,
   6121					    ext4_bh_unmapped)) {
   6122			/* Wait so that we don't change page under IO */
   6123			wait_for_stable_page(page);
   6124			ret = VM_FAULT_LOCKED;
   6125			goto out;
   6126		}
   6127	}
   6128	unlock_page(page);
   6129	/* OK, we need to fill the hole... */
   6130	if (ext4_should_dioread_nolock(inode))
   6131		get_block = ext4_get_block_unwritten;
   6132	else
   6133		get_block = ext4_get_block;
   6134retry_alloc:
   6135	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
   6136				    ext4_writepage_trans_blocks(inode));
   6137	if (IS_ERR(handle)) {
   6138		ret = VM_FAULT_SIGBUS;
   6139		goto out;
   6140	}
   6141	/*
   6142	 * Data journalling can't use block_page_mkwrite() because it
   6143	 * will set_buffer_dirty() before do_journal_get_write_access()
   6144	 * thus might hit warning messages for dirty metadata buffers.
   6145	 */
   6146	if (!ext4_should_journal_data(inode)) {
   6147		err = block_page_mkwrite(vma, vmf, get_block);
   6148	} else {
   6149		lock_page(page);
   6150		size = i_size_read(inode);
   6151		/* Page got truncated from under us? */
   6152		if (page->mapping != mapping || page_offset(page) > size) {
   6153			ret = VM_FAULT_NOPAGE;
   6154			goto out_error;
   6155		}
   6156
   6157		if (page->index == size >> PAGE_SHIFT)
   6158			len = size & ~PAGE_MASK;
   6159		else
   6160			len = PAGE_SIZE;
   6161
   6162		err = __block_write_begin(page, 0, len, ext4_get_block);
   6163		if (!err) {
   6164			ret = VM_FAULT_SIGBUS;
   6165			if (ext4_walk_page_buffers(handle, inode,
   6166					page_buffers(page), 0, len, NULL,
   6167					do_journal_get_write_access))
   6168				goto out_error;
   6169			if (ext4_walk_page_buffers(handle, inode,
   6170					page_buffers(page), 0, len, NULL,
   6171					write_end_fn))
   6172				goto out_error;
   6173			if (ext4_jbd2_inode_add_write(handle, inode,
   6174						      page_offset(page), len))
   6175				goto out_error;
   6176			ext4_set_inode_state(inode, EXT4_STATE_JDATA);
   6177		} else {
   6178			unlock_page(page);
   6179		}
   6180	}
   6181	ext4_journal_stop(handle);
   6182	if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
   6183		goto retry_alloc;
   6184out_ret:
   6185	ret = block_page_mkwrite_return(err);
   6186out:
   6187	filemap_invalidate_unlock_shared(mapping);
   6188	sb_end_pagefault(inode->i_sb);
   6189	return ret;
   6190out_error:
   6191	unlock_page(page);
   6192	ext4_journal_stop(handle);
   6193	goto out;
   6194}