cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

ialloc.c (45802B)


// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/ialloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  BSD ufs-inspired inode and directory allocation by
 *  Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/cred.h>

#include <asm/byteorder.h>

#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

#include <trace/events/ext4.h>

/*
 * ialloc.c contains the inode allocation and deallocation routines
 */

/*
 * The free inodes are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free block count of the group.
 */
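
/*
 * Illustrative example (numbers assumed, not taken from any particular
 * filesystem): with 8192 inodes per group, inode number 10000 lives in
 * block group (10000 - 1) / 8192 = 1, at bit (10000 - 1) % 8192 = 1807
 * of that group's inode bitmap -- the same (ino - 1) arithmetic used
 * throughout this file.
 */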

/*
 * To avoid calling the atomic setbit hundreds or thousands of times, we only
 * need to use it within a single byte (to ensure we get endianness right).
 * We can use memset for the rest of the bitmap as there are no other users.
 */
void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
{
	int i;

	if (start_bit >= end_bit)
		return;

	ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
	for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
		ext4_set_bit(i, bitmap);
	if (i < end_bit)
		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}
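
/*
 * Example of the effect (sizes assumed): for a group with 1002 inodes
 * and a 1 KiB bitmap block (8192 bits), ext4_mark_bitmap_end(1002, 8192,
 * bitmap) sets bits 1002..1007 one at a time with ext4_set_bit() to
 * reach a byte boundary, then memset()s the remaining 898 bytes, so
 * bits 1002..8191 are marked in use and never handed out.
 */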

void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
		set_bitmap_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

static int ext4_validate_inode_bitmap(struct super_block *sb,
				      struct ext4_group_desc *desc,
				      ext4_group_t block_group,
				      struct buffer_head *bh)
{
	ext4_fsblk_t	blk;
	struct ext4_group_info *grp;

	if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
		return 0;

	grp = ext4_get_group_info(sb, block_group);

	if (buffer_verified(bh))
		return 0;
	if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
		return -EFSCORRUPTED;

	ext4_lock_group(sb, block_group);
	if (buffer_verified(bh))
		goto verified;
	blk = ext4_inode_bitmap(sb, desc);
	if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
					   EXT4_INODES_PER_GROUP(sb) / 8) ||
	    ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) {
		ext4_unlock_group(sb, block_group);
		ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
			   "inode_bitmap = %llu", block_group, blk);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		return -EFSBADCRC;
	}
	set_buffer_verified(bh);
verified:
	ext4_unlock_group(sb, block_group);
	return 0;
}
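
/*
 * Note the double check of buffer_verified() above: once without and
 * once under ext4_lock_group(). The cheap unlocked test skips the group
 * lock in the common case, while the locked re-test ensures that only
 * one task actually verifies the checksum when several readers race on
 * the same bitmap.
 */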

/*
 * Read the inode allocation bitmap for a given block_group, reading
 * into the specified slot in the superblock's bitmap cache.
 *
 * Return buffer_head of bitmap on success, or an ERR_PTR on error.
 */
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
	struct ext4_group_desc *desc;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh = NULL;
	ext4_fsblk_t bitmap_blk;
	int err;

	desc = ext4_get_group_desc(sb, block_group, NULL);
	if (!desc)
		return ERR_PTR(-EFSCORRUPTED);

	bitmap_blk = ext4_inode_bitmap(sb, desc);
	if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
	    (bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
		ext4_error(sb, "Invalid inode bitmap blk %llu in "
			   "block_group %u", bitmap_blk, block_group);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		return ERR_PTR(-EFSCORRUPTED);
	}
	bh = sb_getblk(sb, bitmap_blk);
	if (unlikely(!bh)) {
		ext4_warning(sb, "Cannot read inode bitmap - "
			     "block_group = %u, inode_bitmap = %llu",
			     block_group, bitmap_blk);
		return ERR_PTR(-ENOMEM);
	}
	if (bitmap_uptodate(bh))
		goto verify;

	lock_buffer(bh);
	if (bitmap_uptodate(bh)) {
		unlock_buffer(bh);
		goto verify;
	}

	ext4_lock_group(sb, block_group);
	if (ext4_has_group_desc_csum(sb) &&
	    (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
		if (block_group == 0) {
			ext4_unlock_group(sb, block_group);
			unlock_buffer(bh);
			ext4_error(sb, "Inode bitmap for bg 0 marked "
				   "uninitialized");
			err = -EFSCORRUPTED;
			goto out;
		}
		memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
		ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
				     sb->s_blocksize * 8, bh->b_data);
		set_bitmap_uptodate(bh);
		set_buffer_uptodate(bh);
		set_buffer_verified(bh);
		ext4_unlock_group(sb, block_group);
		unlock_buffer(bh);
		return bh;
	}
	ext4_unlock_group(sb, block_group);

	if (buffer_uptodate(bh)) {
		/*
		 * The group is not marked uninit, so if bh is uptodate
		 * the bitmap contents are uptodate as well.
		 */
		set_bitmap_uptodate(bh);
		unlock_buffer(bh);
		goto verify;
	}
	/*
	 * submit the buffer_head for reading
	 */
	trace_ext4_load_inode_bitmap(sb, block_group);
	ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read);
	ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
	if (!buffer_uptodate(bh)) {
		put_bh(bh);
		ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
			       "block_group = %u, inode_bitmap = %llu",
			       block_group, bitmap_blk);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
				EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		return ERR_PTR(-EIO);
	}

verify:
	err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
	if (err)
		goto out;
	return bh;
out:
	put_bh(bh);
	return ERR_PTR(err);
}
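
/*
 * Two buffer flags gate the fast paths above: bitmap_uptodate() records
 * that the bitmap contents in the buffer are usable (read from disk, or
 * constructed in memory for an INODE_UNINIT group), while
 * buffer_verified() records that the bitmap checksum has been validated
 * by ext4_validate_inode_bitmap().
 */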

/*
 * NOTE! When we get the inode, we're the only people
 * that have access to it, and as such there are no
 * race conditions we have to worry about. The inode
 * is not on the hash-lists, and it cannot be reached
 * through the filesystem because the directory entry
 * has been deleted earlier.
 *
 * HOWEVER: we must make sure that we get no aliases,
 * which means that we have to call "clear_inode()"
 * _before_ we mark the inode not in use in the inode
 * bitmaps. Otherwise a newly created file might use
 * the same inode number (not actually the same pointer
 * though), and then we'd have two inodes sharing the
 * same inode number and space on the hard disk.
 */
void ext4_free_inode(handle_t *handle, struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	int is_directory;
	unsigned long ino;
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *bh2;
	ext4_group_t block_group;
	unsigned long bit;
	struct ext4_group_desc *gdp;
	struct ext4_super_block *es;
	struct ext4_sb_info *sbi;
	int fatal = 0, err, count, cleared;
	struct ext4_group_info *grp;

	if (!sb) {
		printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
		       "nonexistent device\n", __func__, __LINE__);
		return;
	}
	if (atomic_read(&inode->i_count) > 1) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
			 __func__, __LINE__, inode->i_ino,
			 atomic_read(&inode->i_count));
		return;
	}
	if (inode->i_nlink) {
		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
			 __func__, __LINE__, inode->i_ino, inode->i_nlink);
		return;
	}
	sbi = EXT4_SB(sb);

	ino = inode->i_ino;
	ext4_debug("freeing inode %lu\n", ino);
	trace_ext4_free_inode(inode);

	dquot_initialize(inode);
	dquot_free_inode(inode);

	is_directory = S_ISDIR(inode->i_mode);

	/* Do this BEFORE marking the inode not in use or returning an error */
	ext4_clear_inode(inode);

	es = sbi->s_es;
	if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
		ext4_error(sb, "reserved or nonexistent inode %lu", ino);
		goto error_return;
	}
	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	/* Don't bother if the inode bitmap is corrupt. */
	if (IS_ERR(bitmap_bh)) {
		fatal = PTR_ERR(bitmap_bh);
		bitmap_bh = NULL;
		goto error_return;
	}
	if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
		grp = ext4_get_group_info(sb, block_group);
		if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
			fatal = -EFSCORRUPTED;
			goto error_return;
		}
	}

	BUFFER_TRACE(bitmap_bh, "get_write_access");
	fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh,
					      EXT4_JTR_NONE);
	if (fatal)
		goto error_return;

	fatal = -ESRCH;
	gdp = ext4_get_group_desc(sb, block_group, &bh2);
	if (gdp) {
		BUFFER_TRACE(bh2, "get_write_access");
		fatal = ext4_journal_get_write_access(handle, sb, bh2,
						      EXT4_JTR_NONE);
	}
	ext4_lock_group(sb, block_group);
	cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
	if (fatal || !cleared) {
		ext4_unlock_group(sb, block_group);
		goto out;
	}

	count = ext4_free_inodes_count(sb, gdp) + 1;
	ext4_free_inodes_set(sb, gdp, count);
	if (is_directory) {
		count = ext4_used_dirs_count(sb, gdp) - 1;
		ext4_used_dirs_set(sb, gdp, count);
		if (percpu_counter_initialized(&sbi->s_dirs_counter))
			percpu_counter_dec(&sbi->s_dirs_counter);
	}
	ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
				   EXT4_INODES_PER_GROUP(sb) / 8);
	ext4_group_desc_csum_set(sb, block_group, gdp);
	ext4_unlock_group(sb, block_group);

	if (percpu_counter_initialized(&sbi->s_freeinodes_counter))
		percpu_counter_inc(&sbi->s_freeinodes_counter);
	if (sbi->s_log_groups_per_flex) {
		struct flex_groups *fg;

		fg = sbi_array_rcu_deref(sbi, s_flex_groups,
					 ext4_flex_group(sbi, block_group));
		atomic_inc(&fg->free_inodes);
		if (is_directory)
			atomic_dec(&fg->used_dirs);
	}
	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
	if (cleared) {
		BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
		if (!fatal)
			fatal = err;
	} else {
		ext4_error(sb, "bit already cleared for inode %lu", ino);
		ext4_mark_group_bitmap_corrupted(sb, block_group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
	}

error_return:
	brelse(bitmap_bh);
	ext4_std_error(sb, fatal);
}

struct orlov_stats {
	__u64 free_clusters;
	__u32 free_inodes;
	__u32 used_dirs;
};

/*
 * Helper function for Orlov's allocator; returns critical information
 * for a particular block group or flex_bg.  If flex_size is 1, then g
 * is a block group number; otherwise it is flex_bg number.
 */
static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
			    int flex_size, struct orlov_stats *stats)
{
	struct ext4_group_desc *desc;

	if (flex_size > 1) {
		struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb),
							     s_flex_groups, g);
		stats->free_inodes = atomic_read(&fg->free_inodes);
		stats->free_clusters = atomic64_read(&fg->free_clusters);
		stats->used_dirs = atomic_read(&fg->used_dirs);
		return;
	}

	desc = ext4_get_group_desc(sb, g, NULL);
	if (desc) {
		stats->free_inodes = ext4_free_inodes_count(sb, desc);
		stats->free_clusters = ext4_free_group_clusters(sb, desc);
		stats->used_dirs = ext4_used_dirs_count(sb, desc);
	} else {
		stats->free_inodes = 0;
		stats->free_clusters = 0;
		stats->used_dirs = 0;
	}
}

/*
 * Orlov's allocator for directories.
 *
 * We always try to spread first-level directories.
 *
 * If there are block groups with both free inode and free cluster counts
 * not worse than average, we return the one with the smallest directory
 * count.  Otherwise we simply return a random group.
 *
 * The remaining rules are as follows:
 *
 * It's OK to put a directory into a group unless
 * it has too many directories already (max_dirs) or
 * it has too few free inodes left (min_inodes) or
 * it has too few free clusters left (min_clusters).
 * The parent's group is preferred; if it doesn't satisfy these
 * conditions we search cyclically through the rest. If none
 * of the groups look good we just look for a group with more
 * free inodes than average (starting at parent's group).
 */
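
/*
 * Worked example for the thresholds computed below (all numbers
 * assumed): with 8192 inodes per group, flex_size 1, 1000 directories
 * spread over 100 groups and avefreei 4000, a group is acceptable when
 * it holds fewer than max_dirs = 1000/100 + 8192/16 = 522 directories
 * and keeps at least min_inodes = 4000 - 8192/4 = 1952 free inodes
 * (plus the analogous free-cluster floor).
 */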

static int find_group_orlov(struct super_block *sb, struct inode *parent,
			    ext4_group_t *group, umode_t mode,
			    const struct qstr *qstr)
{
	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
	unsigned int freei, avefreei, grp_free;
	ext4_fsblk_t freec, avefreec;
	unsigned int ndirs;
	int max_dirs, min_inodes;
	ext4_grpblk_t min_clusters;
	ext4_group_t i, grp, g, ngroups;
	struct ext4_group_desc *desc;
	struct orlov_stats stats;
	int flex_size = ext4_flex_bg_size(sbi);
	struct dx_hash_info hinfo;

	ngroups = real_ngroups;
	if (flex_size > 1) {
		ngroups = (real_ngroups + flex_size - 1) >>
			sbi->s_log_groups_per_flex;
		parent_group >>= sbi->s_log_groups_per_flex;
	}

	freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
	avefreei = freei / ngroups;
	freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
	avefreec = freec;
	do_div(avefreec, ngroups);
	ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

	if (S_ISDIR(mode) &&
	    ((parent == d_inode(sb->s_root)) ||
	     (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
		int best_ndir = inodes_per_group;
		int ret = -1;

		if (qstr) {
			hinfo.hash_version = DX_HASH_HALF_MD4;
			hinfo.seed = sbi->s_hash_seed;
			ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
			grp = hinfo.hash;
		} else
			grp = prandom_u32();
		parent_group = (unsigned)grp % ngroups;
		for (i = 0; i < ngroups; i++) {
			g = (parent_group + i) % ngroups;
			get_orlov_stats(sb, g, flex_size, &stats);
			if (!stats.free_inodes)
				continue;
			if (stats.used_dirs >= best_ndir)
				continue;
			if (stats.free_inodes < avefreei)
				continue;
			if (stats.free_clusters < avefreec)
				continue;
			grp = g;
			ret = 0;
			best_ndir = stats.used_dirs;
		}
		if (ret)
			goto fallback;
	found_flex_bg:
		if (flex_size == 1) {
			*group = grp;
			return 0;
		}

		/*
		 * We pack inodes at the beginning of the flexgroup's
		 * inode tables.  Block allocation decisions will do
		 * something similar, although regular files will
		 * start at 2nd block group of the flexgroup.  See
		 * ext4_ext_find_goal() and ext4_find_near().
		 */
		grp *= flex_size;
		for (i = 0; i < flex_size; i++) {
			if (grp+i >= real_ngroups)
				break;
			desc = ext4_get_group_desc(sb, grp+i, NULL);
			if (desc && ext4_free_inodes_count(sb, desc)) {
				*group = grp+i;
				return 0;
			}
		}
		goto fallback;
	}

	max_dirs = ndirs / ngroups + inodes_per_group / 16;
	min_inodes = avefreei - inodes_per_group*flex_size / 4;
	if (min_inodes < 1)
		min_inodes = 1;
	min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;

	/*
	 * Start looking in the flex group where we last allocated an
	 * inode for this parent directory
	 */
	if (EXT4_I(parent)->i_last_alloc_group != ~0) {
		parent_group = EXT4_I(parent)->i_last_alloc_group;
		if (flex_size > 1)
			parent_group >>= sbi->s_log_groups_per_flex;
	}

	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		get_orlov_stats(sb, grp, flex_size, &stats);
		if (stats.used_dirs >= max_dirs)
			continue;
		if (stats.free_inodes < min_inodes)
			continue;
		if (stats.free_clusters < min_clusters)
			continue;
		goto found_flex_bg;
	}

fallback:
	ngroups = real_ngroups;
	avefreei = freei / ngroups;
fallback_retry:
	parent_group = EXT4_I(parent)->i_block_group;
	for (i = 0; i < ngroups; i++) {
		grp = (parent_group + i) % ngroups;
		desc = ext4_get_group_desc(sb, grp, NULL);
		if (desc) {
			grp_free = ext4_free_inodes_count(sb, desc);
			if (grp_free && grp_free >= avefreei) {
				*group = grp;
				return 0;
			}
		}
	}

	if (avefreei) {
		/*
		 * The free-inodes counter is approximate, and for really small
		 * filesystems the above test can fail to find any blockgroups
		 */
		avefreei = 0;
		goto fallback_retry;
	}

	return -1;
}

static int find_group_other(struct super_block *sb, struct inode *parent,
			    ext4_group_t *group, umode_t mode)
{
	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
	struct ext4_group_desc *desc;
	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));

	/*
	 * Try to place the inode in the same flex group as its
	 * parent.  If we can't find space, use the Orlov algorithm to
	 * find another flex group, and store that information in the
	 * parent directory's inode information so that future
	 * allocations use that flex group.
	 */
	if (flex_size > 1) {
		int retry = 0;

	try_again:
		parent_group &= ~(flex_size-1);
		last = parent_group + flex_size;
		if (last > ngroups)
			last = ngroups;
		for (i = parent_group; i < last; i++) {
			desc = ext4_get_group_desc(sb, i, NULL);
			if (desc && ext4_free_inodes_count(sb, desc)) {
				*group = i;
				return 0;
			}
		}
		if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
			retry = 1;
			parent_group = EXT4_I(parent)->i_last_alloc_group;
			goto try_again;
		}
		/*
		 * If this didn't work, use the Orlov search algorithm
		 * to find a new flex group; we pass in the mode to
		 * avoid the topdir algorithms.
		 */
		*group = parent_group + flex_size;
		if (*group > ngroups)
			*group = 0;
		return find_group_orlov(sb, parent, group, mode, NULL);
	}

	/*
	 * Try to place the inode in its parent directory
	 */
	*group = parent_group;
	desc = ext4_get_group_desc(sb, *group, NULL);
	if (desc && ext4_free_inodes_count(sb, desc) &&
	    ext4_free_group_clusters(sb, desc))
		return 0;

	/*
	 * We're going to place this inode in a different blockgroup from its
	 * parent.  We want to cause files in a common directory to all land in
	 * the same blockgroup.  But we want files which are in a different
	 * directory which shares a blockgroup with our parent to land in a
	 * different blockgroup.
	 *
	 * So add our directory's i_ino into the starting point for the hash.
	 */
	*group = (*group + parent->i_ino) % ngroups;

	/*
	 * Use a quadratic hash to find a group with a free inode and some free
	 * blocks.
	 */
	for (i = 1; i < ngroups; i <<= 1) {
		*group += i;
		if (*group >= ngroups)
			*group -= ngroups;
		desc = ext4_get_group_desc(sb, *group, NULL);
		if (desc && ext4_free_inodes_count(sb, desc) &&
		    ext4_free_group_clusters(sb, desc))
			return 0;
	}
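
	/*
	 * The loop above probes at offsets 1, 3, 7, 15, ... (2^k - 1)
	 * from the hashed starting group, wrapping modulo ngroups, so a
	 * handful of iterations samples groups spread across the whole
	 * filesystem before we fall back to the linear scan below.
	 */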

	/*
	 * That failed: try linear search for a free inode, even if that group
	 * has no free blocks.
	 */
	*group = parent_group;
	for (i = 0; i < ngroups; i++) {
		if (++*group >= ngroups)
			*group = 0;
		desc = ext4_get_group_desc(sb, *group, NULL);
		if (desc && ext4_free_inodes_count(sb, desc))
			return 0;
	}

	return -1;
}

/*
 * In no-journal mode, if an inode has recently been deleted, we want
 * to avoid reusing it until we're reasonably sure the inode table
 * block has been written back to disk.  (Yes, these values are
 * somewhat arbitrary...)
 */
#define RECENTCY_MIN	60
#define RECENTCY_DIRTY	300
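
/*
 * Both values are in seconds: i_dtime is compared against
 * ktime_get_real_seconds() in recently_deleted() below, and the window
 * is widened by RECENTCY_DIRTY while the inode table block is still
 * dirty in memory.
 */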

static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
{
	struct ext4_group_desc	*gdp;
	struct ext4_inode	*raw_inode;
	struct buffer_head	*bh;
	int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
	int offset, ret = 0;
	int recentcy = RECENTCY_MIN;
	u32 dtime, now;

	gdp = ext4_get_group_desc(sb, group, NULL);
	if (unlikely(!gdp))
		return 0;

	bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) +
		       (ino / inodes_per_block));
	if (!bh || !buffer_uptodate(bh))
		/*
		 * If the block is not in the buffer cache, then it
		 * must have been written out.
		 */
		goto out;

	offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb);
	raw_inode = (struct ext4_inode *) (bh->b_data + offset);

	/* i_dtime is only 32 bits on disk, but we only care about relative
	 * times in the range of a few minutes (i.e. long enough to sync a
	 * recently-deleted inode to disk), so using the low 32 bits of the
	 * clock (a 68 year range) is enough, see time_before32() */
	dtime = le32_to_cpu(raw_inode->i_dtime);
	now = ktime_get_real_seconds();
	if (buffer_dirty(bh))
		recentcy += RECENTCY_DIRTY;

	if (dtime && time_before32(dtime, now) &&
	    time_before32(now, dtime + recentcy))
		ret = 1;
out:
	brelse(bh);
	return ret;
}

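/*
 * Scan the bitmap for a free slot starting at *ino. Returns 1 with *ino
 * set to the chosen bit on success, 0 if the group has nothing usable.
 * In no-journal mode a recently deleted inode is remembered and only
 * handed back when no fresher inode exists in the group.
 */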
static int find_inode_bit(struct super_block *sb, ext4_group_t group,
			  struct buffer_head *bitmap, unsigned long *ino)
{
	bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL;
	unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb);

next:
	*ino = ext4_find_next_zero_bit((unsigned long *)
				       bitmap->b_data,
				       EXT4_INODES_PER_GROUP(sb), *ino);
	if (*ino >= EXT4_INODES_PER_GROUP(sb))
		goto not_found;

	if (check_recently_deleted && recently_deleted(sb, group, *ino)) {
		recently_deleted_ino = *ino;
		*ino = *ino + 1;
		if (*ino < EXT4_INODES_PER_GROUP(sb))
			goto next;
		goto not_found;
	}
	return 1;
not_found:
	if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb))
		return 0;
	/*
	 * Not reusing recently deleted inodes is mostly a preference. We don't
	 * want to report ENOSPC or skew allocation patterns because of that.
	 * So return even a recently deleted inode if we could not find a
	 * better one in the given range.
	 */
	*ino = recently_deleted_ino;
	return 1;
}

int ext4_mark_inode_used(struct super_block *sb, int ino)
{
	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
	struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
	struct ext4_group_desc *gdp;
	ext4_group_t group;
	int bit;
	int err = -EFSCORRUPTED;

	if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
		goto out;

	group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
	if (IS_ERR(inode_bitmap_bh))
		return PTR_ERR(inode_bitmap_bh);

	if (ext4_test_bit(bit, inode_bitmap_bh->b_data)) {
		err = 0;
		goto out;
	}

	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
	if (!gdp || !group_desc_bh) {
		err = -EINVAL;
		goto out;
	}

	ext4_set_bit(bit, inode_bitmap_bh->b_data);

	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}
	err = sync_dirty_buffer(inode_bitmap_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	/* We may have to initialize the block bitmap if it isn't already */
	if (ext4_has_group_desc_csum(sb) &&
	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
		struct buffer_head *block_bitmap_bh;

		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
		if (IS_ERR(block_bitmap_bh)) {
			err = PTR_ERR(block_bitmap_bh);
			goto out;
		}

		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
		err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
		sync_dirty_buffer(block_bitmap_bh);

		/* recheck and clear flag under lock if we still need to */
		ext4_lock_group(sb, group);
		if (ext4_has_group_desc_csum(sb) &&
		    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
			ext4_free_group_clusters_set(sb, gdp,
				ext4_free_clusters_after_init(sb, group, gdp));
			ext4_block_bitmap_csum_set(sb, group, gdp,
						   block_bitmap_bh);
			ext4_group_desc_csum_set(sb, group, gdp);
		}
		ext4_unlock_group(sb, group);
		brelse(block_bitmap_bh);

		if (err) {
			ext4_std_error(sb, err);
			goto out;
		}
	}

	/* Update the relevant bg descriptor fields */
	if (ext4_has_group_desc_csum(sb)) {
		int free;

		ext4_lock_group(sb, group); /* while we modify the bg desc */
		free = EXT4_INODES_PER_GROUP(sb) -
			ext4_itable_unused_count(sb, gdp);
		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
			free = 0;
		}

		/*
		 * Check the relative inode number against the last used
		 * relative inode number in this group. If it is greater,
		 * we need to update the bg_itable_unused count.
		 */
		if (bit >= free)
			ext4_itable_unused_set(sb, gdp,
					(EXT4_INODES_PER_GROUP(sb) - bit - 1));
	} else {
		ext4_lock_group(sb, group);
	}

	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
	if (ext4_has_group_desc_csum(sb)) {
		ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
					   EXT4_INODES_PER_GROUP(sb) / 8);
		ext4_group_desc_csum_set(sb, group, gdp);
	}

	ext4_unlock_group(sb, group);
	err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
	sync_dirty_buffer(group_desc_bh);
out:
	return err;
}

static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
					    bool encrypt)
{
	struct super_block *sb = dir->i_sb;
	int nblocks = 0;
#ifdef CONFIG_EXT4_FS_POSIX_ACL
	struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);

	if (IS_ERR(p))
		return PTR_ERR(p);
	if (p) {
		int acl_size = p->a_count * sizeof(ext4_acl_entry);

		nblocks += (S_ISDIR(mode) ? 2 : 1) *
			__ext4_xattr_set_credits(sb, NULL /* inode */,
						 NULL /* block_bh */, acl_size,
						 true /* is_create */);
		posix_acl_release(p);
	}
#endif

#ifdef CONFIG_SECURITY
	{
		int num_security_xattrs = 1;

#ifdef CONFIG_INTEGRITY
		num_security_xattrs++;
#endif
		/*
		 * We assume that security xattrs are never more than 1k.
		 * In practice they are under 128 bytes.
		 */
		nblocks += num_security_xattrs *
			__ext4_xattr_set_credits(sb, NULL /* inode */,
						 NULL /* block_bh */, 1024,
						 true /* is_create */);
	}
#endif
	if (encrypt)
		nblocks += __ext4_xattr_set_credits(sb,
						    NULL /* inode */,
						    NULL /* block_bh */,
						    FSCRYPT_SET_CONTEXT_MAX_SIZE,
						    true /* is_create */);
	return nblocks;
}

/*
 * There are two policies for allocating an inode.  If the new inode is
 * a directory, then a forward search is made for a block group with both
 * free space and a low directory-to-inode ratio; if that fails, then of
 * the groups with above-average free space, the one with the fewest
 * directories is chosen.
 *
 * For other inodes, search forward from the parent directory's block
 * group to find a free inode.
 */
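/*
 * In this file the two policies map onto find_group_orlov() (used for
 * directories) and find_group_other() (used for everything else); see
 * the S_ISDIR() dispatch in the allocation loop below.
 */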
struct inode *__ext4_new_inode(struct user_namespace *mnt_userns,
			       handle_t *handle, struct inode *dir,
			       umode_t mode, const struct qstr *qstr,
			       __u32 goal, uid_t *owner, __u32 i_flags,
			       int handle_type, unsigned int line_no,
			       int nblocks)
{
	struct super_block *sb;
	struct buffer_head *inode_bitmap_bh = NULL;
	struct buffer_head *group_desc_bh;
	ext4_group_t ngroups, group = 0;
	unsigned long ino = 0;
	struct inode *inode;
	struct ext4_group_desc *gdp = NULL;
	struct ext4_inode_info *ei;
	struct ext4_sb_info *sbi;
	int ret2, err;
	struct inode *ret;
	ext4_group_t i;
	ext4_group_t flex_group;
	struct ext4_group_info *grp = NULL;
	bool encrypt = false;

	/* Cannot create files in a deleted directory */
	if (!dir || !dir->i_nlink)
		return ERR_PTR(-EPERM);

	sb = dir->i_sb;
	sbi = EXT4_SB(sb);

	if (unlikely(ext4_forced_shutdown(sbi)))
		return ERR_PTR(-EIO);

	ngroups = ext4_get_groups_count(sb);
	trace_ext4_request_inode(dir, mode);
	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	ei = EXT4_I(inode);

	/*
	 * Initialize owners and quota early so that we don't have to account
	 * for the quota initialization worst case in the standard
	 * inode-creation transaction.
	 */
	if (owner) {
		inode->i_mode = mode;
		i_uid_write(inode, owner[0]);
		i_gid_write(inode, owner[1]);
	} else if (test_opt(sb, GRPID)) {
		inode->i_mode = mode;
		inode_fsuid_set(inode, mnt_userns);
		inode->i_gid = dir->i_gid;
	} else
		inode_init_owner(mnt_userns, inode, dir, mode);

	if (ext4_has_feature_project(sb) &&
	    ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
		ei->i_projid = EXT4_I(dir)->i_projid;
	else
		ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);

	if (!(i_flags & EXT4_EA_INODE_FL)) {
		err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
		if (err)
			goto out;
	}

	err = dquot_initialize(inode);
	if (err)
		goto out;

	if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
		ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
		if (ret2 < 0) {
			err = ret2;
			goto out;
		}
		nblocks += ret2;
	}

	if (!goal)
		goal = sbi->s_inode_goal;

	if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
		group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
		ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
		ret2 = 0;
		goto got_group;
	}

	if (S_ISDIR(mode))
		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
	else
		ret2 = find_group_other(sb, dir, &group, mode);

got_group:
	EXT4_I(dir)->i_last_alloc_group = group;
	err = -ENOSPC;
	if (ret2 == -1)
		goto out;

	/*
	 * Normally we will only go through one pass of this loop,
	 * unless we get unlucky and it turns out the group we selected
	 * had its last inode grabbed by someone else.
	 */
	for (i = 0; i < ngroups; i++, ino = 0) {
		err = -EIO;

		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
		if (!gdp)
			goto out;

		/*
		 * Check free inodes count before loading bitmap.
		 */
		if (ext4_free_inodes_count(sb, gdp) == 0)
			goto next_group;

		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
			grp = ext4_get_group_info(sb, group);
			/*
			 * Skip groups with already-known suspicious inode
			 * tables
			 */
			if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
				goto next_group;
		}

		brelse(inode_bitmap_bh);
		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
		/* Skip groups with suspicious inode tables */
		if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
		     && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
		    IS_ERR(inode_bitmap_bh)) {
			inode_bitmap_bh = NULL;
			goto next_group;
		}

repeat_in_this_group:
		ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
		if (!ret2)
			goto next_group;

		if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
			ext4_error(sb, "reserved inode found cleared - "
				   "inode=%lu", ino + 1);
			ext4_mark_group_bitmap_corrupted(sb, group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
			goto next_group;
		}

		if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
			BUG_ON(nblocks <= 0);
			handle = __ext4_journal_start_sb(dir->i_sb, line_no,
				 handle_type, nblocks, 0,
				 ext4_trans_default_revoke_credits(sb));
			if (IS_ERR(handle)) {
				err = PTR_ERR(handle);
				ext4_std_error(sb, err);
				goto out;
			}
		}
		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh,
						    EXT4_JTR_NONE);
		if (err) {
			ext4_std_error(sb, err);
			goto out;
		}
		ext4_lock_group(sb, group);
		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
		if (ret2) {
			/* Someone already took the bit. Repeat the search
			 * with lock held.
			 */
			ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
			if (ret2) {
				ext4_set_bit(ino, inode_bitmap_bh->b_data);
				ret2 = 0;
			} else {
				ret2 = 1; /* we didn't grab the inode */
			}
		}
		ext4_unlock_group(sb, group);
		ino++;		/* the inode bitmap is zero-based */
		if (!ret2)
			goto got; /* we grabbed the inode! */

		if (ino < EXT4_INODES_PER_GROUP(sb))
			goto repeat_in_this_group;
next_group:
		if (++group == ngroups)
			group = 0;
	}
	err = -ENOSPC;
	goto out;

got:
	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	BUFFER_TRACE(group_desc_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, sb, group_desc_bh,
					    EXT4_JTR_NONE);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	/* We may have to initialize the block bitmap if it isn't already */
	if (ext4_has_group_desc_csum(sb) &&
	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
		struct buffer_head *block_bitmap_bh;

		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
		if (IS_ERR(block_bitmap_bh)) {
			err = PTR_ERR(block_bitmap_bh);
			goto out;
		}
		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
		err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh,
						    EXT4_JTR_NONE);
		if (err) {
			brelse(block_bitmap_bh);
			ext4_std_error(sb, err);
			goto out;
		}

		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
		err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);

		/* recheck and clear flag under lock if we still need to */
		ext4_lock_group(sb, group);
		if (ext4_has_group_desc_csum(sb) &&
		    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
			ext4_free_group_clusters_set(sb, gdp,
				ext4_free_clusters_after_init(sb, group, gdp));
			ext4_block_bitmap_csum_set(sb, group, gdp,
						   block_bitmap_bh);
			ext4_group_desc_csum_set(sb, group, gdp);
		}
		ext4_unlock_group(sb, group);
		brelse(block_bitmap_bh);

		if (err) {
			ext4_std_error(sb, err);
			goto out;
		}
	}

	/* Update the relevant bg descriptor fields */
	if (ext4_has_group_desc_csum(sb)) {
		int free;
		struct ext4_group_info *grp = NULL;

		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
			grp = ext4_get_group_info(sb, group);
			down_read(&grp->alloc_sem); /*
						     * protect vs itable
						     * lazyinit
						     */
		}
		ext4_lock_group(sb, group); /* while we modify the bg desc */
		free = EXT4_INODES_PER_GROUP(sb) -
			ext4_itable_unused_count(sb, gdp);
		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
			free = 0;
		}
		/*
		 * Check the relative inode number against the last used
		 * relative inode number in this group. If it is greater,
		 * we need to update the bg_itable_unused count.
		 */
		if (ino > free)
			ext4_itable_unused_set(sb, gdp,
					(EXT4_INODES_PER_GROUP(sb) - ino));
		if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
			up_read(&grp->alloc_sem);
	} else {
		ext4_lock_group(sb, group);
	}

	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
	if (S_ISDIR(mode)) {
		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
		if (sbi->s_log_groups_per_flex) {
			ext4_group_t f = ext4_flex_group(sbi, group);

			atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
							f)->used_dirs);
		}
	}
	if (ext4_has_group_desc_csum(sb)) {
		ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
					   EXT4_INODES_PER_GROUP(sb) / 8);
		ext4_group_desc_csum_set(sb, group, gdp);
	}
	ext4_unlock_group(sb, group);

	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	percpu_counter_dec(&sbi->s_freeinodes_counter);
	if (S_ISDIR(mode))
		percpu_counter_inc(&sbi->s_dirs_counter);

	if (sbi->s_log_groups_per_flex) {
		flex_group = ext4_flex_group(sbi, group);
		atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
						flex_group)->free_inodes);
	}

	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
	/* This is the optimal IO size (for stat), not the fs block size */
	inode->i_blocks = 0;
	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
	ei->i_crtime = inode->i_mtime;

	memset(ei->i_data, 0, sizeof(ei->i_data));
	ei->i_dir_start_lookup = 0;
	ei->i_disksize = 0;

	/* Don't inherit extent flag from directory, amongst others. */
	ei->i_flags =
		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
	ei->i_flags |= i_flags;
	ei->i_file_acl = 0;
	ei->i_dtime = 0;
	ei->i_block_group = group;
	ei->i_last_alloc_group = ~0;

	ext4_set_inode_flags(inode, true);
	if (IS_DIRSYNC(inode))
		ext4_handle_sync(handle);
	if (insert_inode_locked(inode) < 0) {
		/*
		 * Likely a bitmap corruption causing inode to be allocated
		 * twice.
		 */
		err = -EIO;
		ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
			   inode->i_ino);
		ext4_mark_group_bitmap_corrupted(sb, group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		goto out;
	}
	inode->i_generation = prandom_u32();

	/* Precompute checksum seed for inode metadata */
	if (ext4_has_metadata_csum(sb)) {
		__u32 csum;
		__le32 inum = cpu_to_le32(inode->i_ino);
		__le32 gen = cpu_to_le32(inode->i_generation);
		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
				   sizeof(inum));
		ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
					      sizeof(gen));
	}

	ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
	ext4_set_inode_state(inode, EXT4_STATE_NEW);

	ei->i_extra_isize = sbi->s_want_extra_isize;
	ei->i_inline_off = 0;
	if (ext4_has_feature_inline_data(sb) &&
	    (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
	ret = inode;
	err = dquot_alloc_inode(inode);
	if (err)
		goto fail_drop;

	/*
	 * Since the encryption xattr will always be unique, create it first so
	 * that it's less likely to end up in an external xattr block and
	 * prevent its deduplication.
	 */
	if (encrypt) {
		err = fscrypt_set_context(inode, handle);
		if (err)
			goto fail_free_drop;
	}

	if (!(ei->i_flags & EXT4_EA_INODE_FL)) {
		err = ext4_init_acl(handle, inode, dir);
		if (err)
			goto fail_free_drop;

		err = ext4_init_security(handle, inode, dir, qstr);
		if (err)
			goto fail_free_drop;
	}

	if (ext4_has_feature_extents(sb)) {
		/* set extent flag only for directory, file and normal symlink */
		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
			ext4_ext_tree_init(handle, inode);
		}
	}

	if (ext4_handle_valid(handle)) {
		ei->i_sync_tid = handle->h_transaction->t_tid;
		ei->i_datasync_tid = handle->h_transaction->t_tid;
	}

	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_std_error(sb, err);
		goto fail_free_drop;
	}

	ext4_debug("allocating inode %lu\n", inode->i_ino);
	trace_ext4_allocate_inode(inode, dir, mode);
	brelse(inode_bitmap_bh);
	return ret;

fail_free_drop:
	dquot_free_inode(inode);
fail_drop:
	clear_nlink(inode);
	unlock_new_inode(inode);
out:
	dquot_drop(inode);
	inode->i_flags |= S_NOQUOTA;
	iput(inode);
	brelse(inode_bitmap_bh);
	return ERR_PTR(err);
}
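
/*
 * Note: __ext4_new_inode() is presumably reached through wrapper macros
 * declared elsewhere (suggested by the double-underscore name and the
 * handle_type/line_no parameters); the wrappers themselves are not part
 * of this file, so this is an assumption rather than something shown
 * here.
 */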

/* Verify that we are loading a valid orphan from disk */
struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
{
	unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
	ext4_group_t block_group;
	int bit;
	struct buffer_head *bitmap_bh = NULL;
	struct inode *inode = NULL;
	int err = -EFSCORRUPTED;

	if (ino < EXT4_FIRST_INO(sb) || ino > max_ino)
		goto bad_orphan;

	block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
	bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
	bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
	if (IS_ERR(bitmap_bh))
		return ERR_CAST(bitmap_bh);

	/* Having the inode bit set should be a 100% indicator that this
	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
	 * inodes that were being truncated, so we can't check i_nlink==0.
	 */
	if (!ext4_test_bit(bit, bitmap_bh->b_data))
		goto bad_orphan;

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		ext4_error_err(sb, -err,
			       "couldn't read orphan inode %lu (err %d)",
			       ino, err);
		brelse(bitmap_bh);
		return inode;
	}

	/*
	 * If the orphan has i_nlink > 0 then it should be able to
	 * be truncated, otherwise it won't be removed from the orphan
	 * list during processing and an infinite loop will result.
	 * Similarly, it must not be a bad inode.
	 */
	if ((inode->i_nlink && !ext4_can_truncate(inode)) ||
	    is_bad_inode(inode))
		goto bad_orphan;

	if (NEXT_ORPHAN(inode) > max_ino)
		goto bad_orphan;
	brelse(bitmap_bh);
	return inode;

bad_orphan:
	ext4_error(sb, "bad orphan inode %lu", ino);
	if (bitmap_bh)
		printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n",
		       bit, (unsigned long long)bitmap_bh->b_blocknr,
		       ext4_test_bit(bit, bitmap_bh->b_data));
	if (inode) {
		printk(KERN_ERR "is_bad_inode(inode)=%d\n",
		       is_bad_inode(inode));
		printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n",
		       NEXT_ORPHAN(inode));
		printk(KERN_ERR "max_ino=%lu\n", max_ino);
		printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink);
		/* Avoid freeing blocks if we got a bad deleted inode */
		if (inode->i_nlink == 0)
			inode->i_blocks = 0;
		iput(inode);
	}
	brelse(bitmap_bh);
	return ERR_PTR(err);
}

unsigned long ext4_count_free_inodes(struct super_block *sb)
{
	unsigned long desc_count;
	struct ext4_group_desc *gdp;
	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
#ifdef EXT4FS_DEBUG
	struct ext4_super_block *es;
	unsigned long bitmap_count, x;
	struct buffer_head *bitmap_bh = NULL;

	es = EXT4_SB(sb)->s_es;
	desc_count = 0;
	bitmap_count = 0;
	gdp = NULL;
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += ext4_free_inodes_count(sb, gdp);
		brelse(bitmap_bh);
		bitmap_bh = ext4_read_inode_bitmap(sb, i);
		if (IS_ERR(bitmap_bh)) {
			bitmap_bh = NULL;
			continue;
		}

		x = ext4_count_free(bitmap_bh->b_data,
				    EXT4_INODES_PER_GROUP(sb) / 8);
		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
			(unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
		bitmap_count += x;
	}
	brelse(bitmap_bh);
	printk(KERN_DEBUG "ext4_count_free_inodes: "
	       "stored = %u, computed = %lu, %lu\n",
	       le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
	return desc_count;
#else
	desc_count = 0;
	for (i = 0; i < ngroups; i++) {
		gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += ext4_free_inodes_count(sb, gdp);
		cond_resched();
	}
	return desc_count;
#endif
}

/* Called at mount-time, super-block is locked */
unsigned long ext4_count_dirs(struct super_block * sb)
{
	unsigned long count = 0;
	ext4_group_t i, ngroups = ext4_get_groups_count(sb);

	for (i = 0; i < ngroups; i++) {
		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		count += ext4_used_dirs_count(sb, gdp);
	}
	return count;
}

/*
 * Zero out the not-yet-zeroed inode table by writing zeroes through the
 * whole inode table. Must be called without any spinlock held. On an
 * active filesystem the only caller is the ext4lazyinit thread, so no
 * special locking is needed; however, we do have to prevent inode
 * allocation from the current group, so we take the alloc_sem lock to
 * block ext4_new_inode() until we are finished.
 */
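/*
 * When barrier is nonzero, a block-device cache flush is issued after
 * the zeroing write (see blkdev_issue_flush() below), so the zeroed
 * blocks reach stable storage before the ZEROED group flag is set.
 */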
int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
				 int barrier)
{
	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_group_desc *gdp = NULL;
	struct buffer_head *group_desc_bh;
	handle_t *handle;
	ext4_fsblk_t blk;
	int num, ret = 0, used_blks = 0;
	unsigned long used_inos = 0;

	/* This should not happen, but just to be sure check this */
	if (sb_rdonly(sb)) {
		ret = 1;
		goto out;
	}

	gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
	if (!gdp)
		goto out;

	/*
	 * We do not need to lock this, because we are the only one
	 * handling this flag.
	 */
	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
		goto out;

	handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}

	down_write(&grp->alloc_sem);
	/*
	 * If the inode bitmap was already initialized, there may be some
	 * used inodes, so we need to skip the blocks with used inodes in
	 * the inode table.
	 */
	if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) {
		used_inos = EXT4_INODES_PER_GROUP(sb) -
			    ext4_itable_unused_count(sb, gdp);
		used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block);

		/* Bogus inode unused count? */
		if (used_blks < 0 || used_blks > sbi->s_itb_per_group) {
			ext4_error(sb, "Something is wrong with group %u: "
				   "used itable blocks: %d; "
				   "itable unused count: %u",
				   group, used_blks,
				   ext4_itable_unused_count(sb, gdp));
			ret = 1;
			goto err_out;
		}

		used_inos += group * EXT4_INODES_PER_GROUP(sb);
		/*
		 * Are there some uninitialized inodes in the inode table
		 * before the first normal inode?
		 */
		if ((used_blks != sbi->s_itb_per_group) &&
		     (used_inos < EXT4_FIRST_INO(sb))) {
			ext4_error(sb, "Something is wrong with group %u: "
				   "itable unused count: %u; "
				   "itables initialized count: %ld",
				   group, ext4_itable_unused_count(sb, gdp),
				   used_inos);
			ret = 1;
			goto err_out;
		}
	}

	blk = ext4_inode_table(sb, gdp) + used_blks;
	num = sbi->s_itb_per_group - used_blks;

	BUFFER_TRACE(group_desc_bh, "get_write_access");
	ret = ext4_journal_get_write_access(handle, sb, group_desc_bh,
					    EXT4_JTR_NONE);
	if (ret)
		goto err_out;

	/*
	 * Skip zeroout if the inode table is full. But we set the ZEROED
	 * flag anyway, because obviously, when it is full it does not need
	 * further zeroing.
	 */
	if (unlikely(num == 0))
		goto skip_zeroout;

	ext4_debug("going to zero out inode table in group %d\n",
		   group);
	ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
	if (ret < 0)
		goto err_out;
	if (barrier)
		blkdev_issue_flush(sb->s_bdev);

skip_zeroout:
	ext4_lock_group(sb, group);
	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
	ext4_group_desc_csum_set(sb, group, gdp);
	ext4_unlock_group(sb, group);

	BUFFER_TRACE(group_desc_bh,
		     "call ext4_handle_dirty_metadata");
	ret = ext4_handle_dirty_metadata(handle, NULL,
					 group_desc_bh);

err_out:
	up_write(&grp->alloc_sem);
	ext4_journal_stop(handle);
out:
	return ret;
}