cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

mballoc.c (189638B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
      4 * Written by Alex Tomas <alex@clusterfs.com>
      5 */
      6
      7
      8/*
      9 * mballoc.c contains the multiblocks allocation routines
     10 */
     11
     12#include "ext4_jbd2.h"
     13#include "mballoc.h"
     14#include <linux/log2.h>
     15#include <linux/module.h>
     16#include <linux/slab.h>
     17#include <linux/nospec.h>
     18#include <linux/backing-dev.h>
     19#include <trace/events/ext4.h>
     20
     21/*
     22 * MUSTDO:
     23 *   - test ext4_ext_search_left() and ext4_ext_search_right()
     24 *   - search for metadata in few groups
     25 *
     26 * TODO v4:
     27 *   - normalization should take into account whether file is still open
     28 *   - discard preallocations if no free space left (policy?)
     29 *   - don't normalize tails
     30 *   - quota
     31 *   - reservation for superuser
     32 *
     33 * TODO v3:
     34 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
     35 *   - track min/max extents in each group for better group selection
     36 *   - mb_mark_used() may allocate chunk right after splitting buddy
     37 *   - tree of groups sorted by number of free blocks
     38 *   - error handling
     39 */
     40
     41/*
     42 * An allocation request asks for a number of blocks close to the
     43 * specified goal (block) value.
     44 *
     45 * During the initialization phase of the allocator we decide whether to
     46 * use group preallocation or inode preallocation depending on the size of
     47 * the file. The size of the file could be the resulting file size we
     48 * would have after allocation, or the current file size, whichever
     49 * is larger. If the size is less than sbi->s_mb_stream_request we
     50 * select group preallocation. The default value of
     51 * s_mb_stream_request is 16 blocks. This can also be tuned via
     52 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
     53 * terms of number of blocks.
     54 *
     55 * The main motivation for having small files use group preallocation is to
     56 * ensure that we keep small files close together on the disk.
     57 *
     58 * In the first stage the allocator looks at the inode prealloc list,
     59 * ext4_inode_info->i_prealloc_list, which contains a list of prealloc
     60 * spaces for this particular inode. The inode prealloc space is
     61 * represented as:
     62 *
     63 * pa_lstart -> the logical start block for this prealloc space
     64 * pa_pstart -> the physical start block for this prealloc space
     65 * pa_len    -> length for this prealloc space (in clusters)
     66 * pa_free   ->  free space available in this prealloc space (in clusters)
     67 *
     68 * The inode preallocation space is selected by looking at the _logical_
     69 * start block. Only if the logical file block falls within the range of a
     70 * prealloc space do we consume that particular prealloc space. This makes
     71 * sure that we have contiguous physical blocks representing the file blocks.
     72 *
     73 * The important thing to note about inode prealloc space is that we
     74 * don't modify the values associated with the inode prealloc space except
     75 * pa_free.
     76 *
     77 * If we are not able to find blocks in the inode prealloc space and if we
     78 * have the group allocation flag set then we look at the locality group
     79 * prealloc space. These are per-CPU prealloc lists, represented as
     80 *
     81 * ext4_sb_info.s_locality_groups[smp_processor_id()]
     82 *
     83 * The reason for having a per cpu locality group is to reduce the contention
     84 * between CPUs. It is possible to get scheduled at this point.
     85 *
     86 * The locality group prealloc space is used looking at whether we have
     87 * enough free space (pa_free) within the prealloc space.
     88 *
     89 * If we can't allocate blocks via the inode prealloc and/or locality group
     90 * prealloc then we look at the buddy cache. The buddy cache is represented
     91 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
     92 * mapped to the buddy and bitmap information regarding different
     93 * groups. The buddy information is attached to buddy cache inode so that
     94 * we can access them through the page cache. The information regarding
     95 * each group is loaded via ext4_mb_load_buddy.  The information consists
     96 * of the block bitmap and the buddy information, and it is stored in the
     97 * inode as:
     98 *
     99 *  {                        page                        }
    100 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
    101 *
    102 *
    103 * one block each for bitmap and buddy information.  So for each group we
    104 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
    105 * blocksize) blocks.  So it can hold information for groups_per_page
    106 * groups, which is blocks_per_page/2.
    107 *
    108 * The buddy cache inode is not stored on disk. The inode is thrown
    109 * away when the filesystem is unmounted.
    110 *
    111 * We look for count number of blocks in the buddy cache. If we were able
    112 * to locate that many free blocks we return with additional information
    113 * regarding the rest of the contiguous physical blocks available.
    114 *
    115 * Before allocating blocks via buddy cache we normalize the request
    116 * blocks. This ensures we ask for more blocks than we need. The extra
    117 * blocks that we get after allocation are added to the respective prealloc
    118 * list. In case of inode preallocation we follow a list of heuristics
    119 * based on file size. This can be found in ext4_mb_normalize_request. If
    120 * we are doing a group prealloc we try to normalize the request to
    121 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
    122 * dependent on the cluster size; for non-bigalloc file systems, it is
    123 * 512 blocks. This can be tuned via
    124 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
    125 * terms of number of blocks. If we have mounted the file system with -O
    126 * stripe=<value> option the group prealloc request is normalized to the
    127 * smallest multiple of the stripe value (sbi->s_stripe) which is
    128 * greater than the default mb_group_prealloc.
    129 *
    130 * If "mb_optimize_scan" mount option is set, we maintain in memory group info
    131 * structures in two data structures:
    132 *
    133 * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
    134 *
    135 *    Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
    136 *
    137 *    This is an array of lists where the index in the array represents the
    138 *    largest free order in the buddy bitmap of the participating group infos of
    139 *    that list. So, there are exactly MB_NUM_ORDERS(sb) lists (the total
    140 *    number of possible buddy bitmap orders). Group infos are placed in the
    141 *    appropriate lists.
    142 *
    143 * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root)
    144 *
    145 *    Locking: sbi->s_mb_rb_lock (rwlock)
    146 *
    147 *    This is a red black tree consisting of group infos and the tree is sorted
    148 *    by average fragment sizes (which is calculated as ext4_group_info->bb_free
    149 *    / ext4_group_info->bb_fragments).
    150 *
    151 * When "mb_optimize_scan" mount option is set, mballoc consults the above data
    152 * structures to decide the order in which groups are to be traversed for
    153 * fulfilling an allocation request.
    154 *
    155 * At CR = 0, we look for groups which have the largest_free_order >= the order
    156 * of the request. We directly look at the largest free order list in the data
    157 * structure (1) above where largest_free_order = order of the request. If that
    158 * list is empty, we look at the remaining lists in increasing order of
    159 * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time.
    160 *
    161 * At CR = 1, we only consider groups where average fragment size > request
    162 * size. So, we look up a group which has average fragment size just above or
    163 * equal to request size using our rb tree (data structure 2) in O(log N) time.
    164 *
    165 * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
    166 * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
    167 *
    168 * The regular allocator (using the buddy cache) supports a few tunables.
    169 *
    170 * /sys/fs/ext4/<partition>/mb_min_to_scan
    171 * /sys/fs/ext4/<partition>/mb_max_to_scan
    172 * /sys/fs/ext4/<partition>/mb_order2_req
    173 * /sys/fs/ext4/<partition>/mb_linear_limit
    174 *
    175 * The regular allocator uses buddy scan only if the request len is a power of
    176 * 2 blocks and the order of the allocation is >= sbi->s_mb_order2_reqs. The
    177 * value of s_mb_order2_reqs can be tuned via
    178 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
    179 * the stripe size (sbi->s_stripe), we try to search for contiguous blocks in
    180 * stripe-sized units. This should result in better allocation on RAID setups. If
    181 * not, we search in the specific group using the bitmap for best extents. The
    182 * tunables min_to_scan and max_to_scan control the behaviour here.
    183 * min_to_scan indicates how long mballoc __must__ look for a best
    184 * extent and max_to_scan indicates how long mballoc __can__ look for a
    185 * best extent in the found extents. Searching for the blocks starts with
    186 * the group specified as the goal value in allocation context via
    187 * ac_g_ex. Each group is first checked based on the criteria whether it
    188 * can be used for allocation. ext4_mb_good_group explains how the groups are
    189 * checked.
    190 *
    191 * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not
    192 * get traversed linearly. That may result in subsequent allocations not being
    193 * close to each other. As a result, the underlying device may get filled up in a
    194 * non-linear fashion. While that may not matter on non-rotational devices, for
    195 * rotational devices that may result in higher seek times. "mb_linear_limit"
    196 * tells mballoc how many groups mballoc should search linearly before
    197 * consulting the above data structures for more efficient lookups. For
    198 * non-rotational devices, this value defaults to 0 and for rotational devices
    199 * this is set to MB_DEFAULT_LINEAR_LIMIT.
    200 *
    201 * Both prealloc spaces are populated as described above. So for the first
    202 * request we will hit the buddy cache, which will result in this prealloc
    203 * space getting filled. The prealloc space is then later used for
    204 * subsequent requests.
    205 */
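/*
 * Editorial sketch, not part of mballoc.c: the size-based choice between
 * locality-group and inode preallocation described in the comment above.
 * The helper and its parameter names are illustrative only; in the kernel
 * the decision is made later in this file (ext4_mb_group_or_file()) based
 * on the allocation context.
 */
static int toy_use_group_preallocation(unsigned long cur_size,
				       unsigned long alloc_result_size,
				       unsigned long mb_stream_request)
{
	/* "size of the file": current size or post-allocation size,
	 * whichever is larger */
	unsigned long size = cur_size > alloc_result_size ?
				cur_size : alloc_result_size;

	/* small (streaming) files use the per-CPU locality group PA;
	 * mb_stream_request defaults to 16 blocks */
	return size < mb_stream_request;
}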
    206
    207/*
    208 * mballoc operates on the following data:
    209 *  - on-disk bitmap
    210 *  - in-core buddy (actually includes buddy and bitmap)
    211 *  - preallocation descriptors (PAs)
    212 *
    213 * there are two types of preallocations:
    214 *  - inode
    215 *    assigned to a specific inode and can be used for this inode only.
    216 *    it describes part of the inode's space preallocated to specific
    217 *    physical blocks. any block from that preallocation can be used
    218 *    independently. the descriptor just tracks the number of blocks left
    219 *    unused. so, before taking some block from the descriptor, one must
    220 *    make sure the corresponding logical block isn't allocated yet. this
    221 *    also means that freeing any block within descriptor's range
    222 *    must discard all preallocated blocks.
    223 *  - locality group
    224 *    assigned to specific locality group which does not translate to
    225 *    permanent set of inodes: inode can join and leave group. space
    226 *    from this type of preallocation can be used for any inode. thus
    227 *    it's consumed from the beginning to the end.
    228 *
    229 * relation between them can be expressed as:
    230 *    in-core buddy = on-disk bitmap + preallocation descriptors
    231 *
    232 * this means blocks mballoc considers used are:
    233 *  - allocated blocks (persistent)
    234 *  - preallocated blocks (non-persistent)
    235 *
    236 * consistency in mballoc world means that at any time a block is either
    237 * free or used in ALL structures. notice: "any time" should not be read
    238 * literally -- time is discrete and delimited by locks.
    239 *
    240 *  to keep it simple, we don't use block numbers, instead we count number of
    241 *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
    242 *
    243 * all operations can be expressed as:
    244 *  - init buddy:			buddy = on-disk + PAs
    245 *  - new PA:				buddy += N; PA = N
    246 *  - use inode PA:			on-disk += N; PA -= N
    247 *  - discard inode PA			buddy -= on-disk - PA; PA = 0
    248 *  - use locality group PA		on-disk += N; PA -= N
    249 *  - discard locality group PA		buddy -= PA; PA = 0
    250 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
    251 *        is used in real operation because we can't know actual used
    252 *        bits from PA, only from on-disk bitmap
    253 *
    254 * if we follow this strict logic, then all operations above should be atomic.
    255 * given some of them can block, we'd have to use something like semaphores
    256 * killing performance on high-end SMP hardware. let's try to relax it using
    257 * the following knowledge:
    258 *  1) if buddy is referenced, it's already initialized
    259 *  2) while block is used in buddy and the buddy is referenced,
    260 *     nobody can re-allocate that block
    261 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
    262 *     bit set and PA claims same block, it's OK. IOW, one can set bit in
    263 *     on-disk bitmap if buddy has same bit set and/or PA covers the corresponding
    264 *     block
    265 *
    266 * so, now we're building a concurrency table:
    267 *  - init buddy vs.
    268 *    - new PA
    269 *      blocks for PA are allocated in the buddy, buddy must be referenced
    270 *      until PA is linked to allocation group to avoid concurrent buddy init
    271 *    - use inode PA
    272 *      we need to make sure that either on-disk bitmap or PA has uptodate data
    273 *      given (3) we care that PA-=N operation doesn't interfere with init
    274 *    - discard inode PA
    275 *      the simplest way would be to have buddy initialized by the discard
    276 *    - use locality group PA
    277 *      again PA-=N must be serialized with init
    278 *    - discard locality group PA
    279 *      the simplest way would be to have buddy initialized by the discard
    280 *  - new PA vs.
    281 *    - use inode PA
    282 *      i_data_sem serializes them
    283 *    - discard inode PA
    284 *      discard process must wait until PA isn't used by another process
    285 *    - use locality group PA
    286 *      some mutex should serialize them
    287 *    - discard locality group PA
    288 *      discard process must wait until PA isn't used by another process
    289 *  - use inode PA
    290 *    - use inode PA
    291 *      i_data_sem or another mutex should serialize them
    292 *    - discard inode PA
    293 *      discard process must wait until PA isn't used by another process
    294 *    - use locality group PA
    295 *      nothing wrong here -- they're different PAs covering different blocks
    296 *    - discard locality group PA
    297 *      discard process must wait until PA isn't used by another process
    298 *
    299 * now we're ready to draw a few consequences:
    300 *  - while a PA is referenced, no discard is possible
    301 *  - a PA is referenced until the block is marked in the on-disk bitmap
    302 *  - PA changes only after on-disk bitmap
    303 *  - discard must not compete with init. either init is done before
    304 *    any discard or they're serialized somehow
    305 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
    306 *
    307 * a special case is when we've used a PA down to emptiness. no need to modify
    308 * the buddy in this case, but we should care about concurrent init
    309 *
    310 */
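/*
 * Editorial sketch, not part of mballoc.c: a toy model of the block-count
 * bookkeeping in the operation table above. Only counts are tracked, as the
 * comment suggests: blocks marked used in the in-core buddy, blocks marked
 * used on disk, and blocks sitting unused in a PA. The invariant at every
 * quiescent point is buddy == ondisk + pa.
 */
struct mb_toy_counts {
	unsigned int buddy;	/* used in the in-core buddy bitmap */
	unsigned int ondisk;	/* used in the on-disk bitmap */
	unsigned int pa;	/* preallocated but not yet used */
};

static void toy_new_pa(struct mb_toy_counts *c, unsigned int n)
{
	c->buddy += n;		/* buddy += N ... */
	c->pa += n;		/* ... PA = N */
}

static void toy_use_inode_pa(struct mb_toy_counts *c, unsigned int n)
{
	c->ondisk += n;		/* on-disk += N */
	c->pa -= n;		/* PA -= N */
}

static void toy_discard_locality_group_pa(struct mb_toy_counts *c)
{
	c->buddy -= c->pa;	/* buddy -= PA */
	c->pa = 0;		/* PA = 0 */
}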
    311
    312/*
    313 * Logic in a few words:
    314 *
    315 *  - allocation:
    316 *    load group
    317 *    find blocks
    318 *    mark bits in on-disk bitmap
    319 *    release group
    320 *
    321 *  - use preallocation:
    322 *    find proper PA (per-inode or group)
    323 *    load group
    324 *    mark bits in on-disk bitmap
    325 *    release group
    326 *    release PA
    327 *
    328 *  - free:
    329 *    load group
    330 *    mark bits in on-disk bitmap
    331 *    release group
    332 *
    333 *  - discard preallocations in group:
    334 *    mark PAs deleted
    335 *    move them onto local list
    336 *    load on-disk bitmap
    337 *    load group
    338 *    remove PA from object (inode or locality group)
    339 *    mark free blocks in-core
    340 *
    341 *  - discard inode's preallocations:
    342 */
    343
    344/*
    345 * Locking rules
    346 *
    347 * Locks:
    348 *  - bitlock on a group	(group)
    349 *  - object (inode/locality)	(object)
    350 *  - per-pa lock		(pa)
    351 *  - cr0 lists lock		(cr0)
    352 *  - cr1 tree lock		(cr1)
    353 *
    354 * Paths:
    355 *  - new pa
    356 *    object
    357 *    group
    358 *
    359 *  - find and use pa:
    360 *    pa
    361 *
    362 *  - release consumed pa:
    363 *    pa
    364 *    group
    365 *    object
    366 *
    367 *  - generate in-core bitmap:
    368 *    group
    369 *        pa
    370 *
    371 *  - discard all for given object (inode, locality group):
    372 *    object
    373 *        pa
    374 *    group
    375 *
    376 *  - discard all for given group:
    377 *    group
    378 *        pa
    379 *    group
    380 *        object
    381 *
    382 *  - allocation path (ext4_mb_regular_allocator)
    383 *    group
    384 *    cr0/cr1
    385 */
    386static struct kmem_cache *ext4_pspace_cachep;
    387static struct kmem_cache *ext4_ac_cachep;
    388static struct kmem_cache *ext4_free_data_cachep;
    389
    390/* We create slab caches for groupinfo data structures based on the
    391 * superblock block size.  There will be one per mounted filesystem for
    392 * each unique s_blocksize_bits */
    393#define NR_GRPINFO_CACHES 8
    394static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
    395
    396static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
    397	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
    398	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
    399	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
    400};
    401
    402static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
    403					ext4_group_t group);
    404static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
    405						ext4_group_t group);
    406static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
    407
    408static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
    409			       ext4_group_t group, int cr);
    410
    411static int ext4_try_to_trim_range(struct super_block *sb,
    412		struct ext4_buddy *e4b, ext4_grpblk_t start,
    413		ext4_grpblk_t max, ext4_grpblk_t minblocks);
    414
    415/*
    416 * The algorithm using this percpu seq counter is as follows:
    417 * 1. We sample the percpu discard_pa_seq counter before trying for block
    418 *    allocation in ext4_mb_new_blocks().
    419 * 2. We increment this percpu discard_pa_seq counter when we either allocate
    420 *    or free these blocks i.e. while marking those blocks as used/free in
    421 *    mb_mark_used()/mb_free_blocks().
    422 * 3. We also increment this percpu seq counter when we successfully identify
    423 *    that the bb_prealloc_list is not empty and hence proceed for discarding
    424 *    of those PAs inside ext4_mb_discard_group_preallocations().
    425 *
    426 * Now to make sure that the regular fast path of block allocation is not
    427 * affected, as a small optimization we only sample the percpu seq counter
    428 * on that cpu. Only when the block allocation fails and the number of freed
    429 * blocks found was 0 do we sample the percpu seq counter for all cpus using
    430 * the function ext4_get_discard_pa_seq_sum() below. This happens after making
    431 * sure that all the PAs on grp->bb_prealloc_list got freed or that the list is empty.
    432 */
    433static DEFINE_PER_CPU(u64, discard_pa_seq);
    434static inline u64 ext4_get_discard_pa_seq_sum(void)
    435{
    436	int __cpu;
    437	u64 __seq = 0;
    438
    439	for_each_possible_cpu(__cpu)
    440		__seq += per_cpu(discard_pa_seq, __cpu);
    441	return __seq;
    442}
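/*
 * Editorial sketch, not part of mballoc.c: the retry protocol the comment
 * above describes. The two callbacks are hypothetical stand-ins for the real
 * allocation and group-PA-discard paths. The per-CPU counter is sampled
 * cheaply up front; only after a failed attempt that discarded nothing do we
 * pay for the all-CPU sum, and because the first comparison is against the
 * local sample, a first failure typically earns one retry before full sums
 * are compared.
 */
static int toy_alloc_with_discard_retry(bool (*try_allocate)(void),
					int (*discard_group_pas)(void))
{
	u64 seq = raw_cpu_read(discard_pa_seq);	/* cheap: this CPU only */
	u64 seq_now;

	for (;;) {
		if (try_allocate())
			return 0;
		if (discard_group_pas() > 0)
			continue;	/* we freed something ourselves, retry */
		/* nothing discarded here: only the (more expensive) all-CPU
		 * sum can tell whether some other CPU freed or discarded
		 * blocks since we took our sample */
		seq_now = ext4_get_discard_pa_seq_sum();
		if (seq_now == seq)
			return -ENOSPC;	/* no progress anywhere: give up */
		seq = seq_now;		/* progress elsewhere: retry */
	}
}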
    443
    444static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
    445{
    446#if BITS_PER_LONG == 64
    447	*bit += ((unsigned long) addr & 7UL) << 3;
    448	addr = (void *) ((unsigned long) addr & ~7UL);
    449#elif BITS_PER_LONG == 32
    450	*bit += ((unsigned long) addr & 3UL) << 3;
    451	addr = (void *) ((unsigned long) addr & ~3UL);
    452#else
    453#error "how many bits you are?!"
    454#endif
    455	return addr;
    456}
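/*
 * Editorial example, not part of mballoc.c: what mb_correct_addr_and_bit()
 * above does on a 64-bit build. The address is rounded down to an 8-byte
 * boundary and the dropped bytes are folded into the bit index, so the
 * (bit, addr) pair still names the same bit while satisfying the alignment
 * that ext4_test_bit()/ext4_set_bit() need on architectures like powerpc.
 */
static void mb_correct_addr_demo(void)
{
	char buf[16] __aligned(8);
	void *addr = buf + 5;		/* unaligned by 5 bytes */
	int bit = 2;

	addr = mb_correct_addr_and_bit(&bit, addr);
	/* now addr == buf and bit == 2 + 5 * 8 == 42 */
	(void)addr;
}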
    457
    458static inline int mb_test_bit(int bit, void *addr)
    459{
    460	/*
    461	 * ext4_test_bit on architectures like powerpc
    462	 * needs an unsigned long aligned address
    463	 */
    464	addr = mb_correct_addr_and_bit(&bit, addr);
    465	return ext4_test_bit(bit, addr);
    466}
    467
    468static inline void mb_set_bit(int bit, void *addr)
    469{
    470	addr = mb_correct_addr_and_bit(&bit, addr);
    471	ext4_set_bit(bit, addr);
    472}
    473
    474static inline void mb_clear_bit(int bit, void *addr)
    475{
    476	addr = mb_correct_addr_and_bit(&bit, addr);
    477	ext4_clear_bit(bit, addr);
    478}
    479
    480static inline int mb_test_and_clear_bit(int bit, void *addr)
    481{
    482	addr = mb_correct_addr_and_bit(&bit, addr);
    483	return ext4_test_and_clear_bit(bit, addr);
    484}
    485
    486static inline int mb_find_next_zero_bit(void *addr, int max, int start)
    487{
    488	int fix = 0, ret, tmpmax;
    489	addr = mb_correct_addr_and_bit(&fix, addr);
    490	tmpmax = max + fix;
    491	start += fix;
    492
    493	ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
    494	if (ret > max)
    495		return max;
    496	return ret;
    497}
    498
    499static inline int mb_find_next_bit(void *addr, int max, int start)
    500{
    501	int fix = 0, ret, tmpmax;
    502	addr = mb_correct_addr_and_bit(&fix, addr);
    503	tmpmax = max + fix;
    504	start += fix;
    505
    506	ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
    507	if (ret > max)
    508		return max;
    509	return ret;
    510}
    511
    512static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
    513{
    514	char *bb;
    515
    516	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
    517	BUG_ON(max == NULL);
    518
    519	if (order > e4b->bd_blkbits + 1) {
    520		*max = 0;
    521		return NULL;
    522	}
    523
    524	/* at order 0 we see each particular block */
    525	if (order == 0) {
    526		*max = 1 << (e4b->bd_blkbits + 3);
    527		return e4b->bd_bitmap;
    528	}
    529
    530	bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
    531	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
    532
    533	return bb;
    534}
    535
    536#ifdef DOUBLE_CHECK
    537static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
    538			   int first, int count)
    539{
    540	int i;
    541	struct super_block *sb = e4b->bd_sb;
    542
    543	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
    544		return;
    545	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
    546	for (i = 0; i < count; i++) {
    547		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
    548			ext4_fsblk_t blocknr;
    549
    550			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
    551			blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
    552			ext4_grp_locked_error(sb, e4b->bd_group,
    553					      inode ? inode->i_ino : 0,
    554					      blocknr,
    555					      "freeing block already freed "
    556					      "(bit %u)",
    557					      first + i);
    558			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
    559					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
    560		}
    561		mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
    562	}
    563}
    564
    565static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
    566{
    567	int i;
    568
    569	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
    570		return;
    571	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
    572	for (i = 0; i < count; i++) {
    573		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
    574		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
    575	}
    576}
    577
    578static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
    579{
    580	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
    581		return;
    582	if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
    583		unsigned char *b1, *b2;
    584		int i;
    585		b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
    586		b2 = (unsigned char *) bitmap;
    587		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
    588			if (b1[i] != b2[i]) {
    589				ext4_msg(e4b->bd_sb, KERN_ERR,
    590					 "corruption in group %u "
    591					 "at byte %u(%u): %x in copy != %x "
    592					 "on disk/prealloc",
    593					 e4b->bd_group, i, i * 8, b1[i], b2[i]);
    594				BUG();
    595			}
    596		}
    597	}
    598}
    599
    600static void mb_group_bb_bitmap_alloc(struct super_block *sb,
    601			struct ext4_group_info *grp, ext4_group_t group)
    602{
    603	struct buffer_head *bh;
    604
    605	grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
    606	if (!grp->bb_bitmap)
    607		return;
    608
    609	bh = ext4_read_block_bitmap(sb, group);
    610	if (IS_ERR_OR_NULL(bh)) {
    611		kfree(grp->bb_bitmap);
    612		grp->bb_bitmap = NULL;
    613		return;
    614	}
    615
    616	memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize);
    617	put_bh(bh);
    618}
    619
    620static void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
    621{
    622	kfree(grp->bb_bitmap);
    623}
    624
    625#else
    626static inline void mb_free_blocks_double(struct inode *inode,
    627				struct ext4_buddy *e4b, int first, int count)
    628{
    629	return;
    630}
    631static inline void mb_mark_used_double(struct ext4_buddy *e4b,
    632						int first, int count)
    633{
    634	return;
    635}
    636static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
    637{
    638	return;
    639}
    640
    641static inline void mb_group_bb_bitmap_alloc(struct super_block *sb,
    642			struct ext4_group_info *grp, ext4_group_t group)
    643{
    644	return;
    645}
    646
    647static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp)
    648{
    649	return;
    650}
    651#endif
    652
    653#ifdef AGGRESSIVE_CHECK
    654
    655#define MB_CHECK_ASSERT(assert)						\
    656do {									\
    657	if (!(assert)) {						\
    658		printk(KERN_EMERG					\
    659			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
    660			function, file, line, # assert);		\
    661		BUG();							\
    662	}								\
    663} while (0)
    664
    665static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
    666				const char *function, int line)
    667{
    668	struct super_block *sb = e4b->bd_sb;
    669	int order = e4b->bd_blkbits + 1;
    670	int max;
    671	int max2;
    672	int i;
    673	int j;
    674	int k;
    675	int count;
    676	struct ext4_group_info *grp;
    677	int fragments = 0;
    678	int fstart;
    679	struct list_head *cur;
    680	void *buddy;
    681	void *buddy2;
    682
    683	if (e4b->bd_info->bb_check_counter++ % 10)
    684		return 0;
    685
    686	while (order > 1) {
    687		buddy = mb_find_buddy(e4b, order, &max);
    688		MB_CHECK_ASSERT(buddy);
    689		buddy2 = mb_find_buddy(e4b, order - 1, &max2);
    690		MB_CHECK_ASSERT(buddy2);
    691		MB_CHECK_ASSERT(buddy != buddy2);
    692		MB_CHECK_ASSERT(max * 2 == max2);
    693
    694		count = 0;
    695		for (i = 0; i < max; i++) {
    696
    697			if (mb_test_bit(i, buddy)) {
    698				/* only single bit in buddy2 may be 0 */
    699				if (!mb_test_bit(i << 1, buddy2)) {
    700					MB_CHECK_ASSERT(
    701						mb_test_bit((i<<1)+1, buddy2));
    702				}
    703				continue;
    704			}
    705
    706			/* both bits in buddy2 must be 1 */
    707			MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
    708			MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
    709
    710			for (j = 0; j < (1 << order); j++) {
    711				k = (i * (1 << order)) + j;
    712				MB_CHECK_ASSERT(
    713					!mb_test_bit(k, e4b->bd_bitmap));
    714			}
    715			count++;
    716		}
    717		MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
    718		order--;
    719	}
    720
    721	fstart = -1;
    722	buddy = mb_find_buddy(e4b, 0, &max);
    723	for (i = 0; i < max; i++) {
    724		if (!mb_test_bit(i, buddy)) {
    725			MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
    726			if (fstart == -1) {
    727				fragments++;
    728				fstart = i;
    729			}
    730			continue;
    731		}
    732		fstart = -1;
    733		/* check used bits only */
    734		for (j = 0; j < e4b->bd_blkbits + 1; j++) {
    735			buddy2 = mb_find_buddy(e4b, j, &max2);
    736			k = i >> j;
    737			MB_CHECK_ASSERT(k < max2);
    738			MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
    739		}
    740	}
    741	MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
    742	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
    743
    744	grp = ext4_get_group_info(sb, e4b->bd_group);
    745	list_for_each(cur, &grp->bb_prealloc_list) {
    746		ext4_group_t groupnr;
    747		struct ext4_prealloc_space *pa;
    748		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
    749		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
    750		MB_CHECK_ASSERT(groupnr == e4b->bd_group);
    751		for (i = 0; i < pa->pa_len; i++)
    752			MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
    753	}
    754	return 0;
    755}
    756#undef MB_CHECK_ASSERT
    757#define mb_check_buddy(e4b) __mb_check_buddy(e4b,	\
    758					__FILE__, __func__, __LINE__)
    759#else
    760#define mb_check_buddy(e4b)
    761#endif
    762
    763/*
    764 * Divide blocks starting from @first with length @len into
    765 * smaller chunks with power of 2 block counts.
    766 * Clear the bits in the bitmap that the blocks of the chunk(s) cover,
    767 * then increase bb_counters[] for the corresponding chunk size.
    768 */
    769static void ext4_mb_mark_free_simple(struct super_block *sb,
    770				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
    771					struct ext4_group_info *grp)
    772{
    773	struct ext4_sb_info *sbi = EXT4_SB(sb);
    774	ext4_grpblk_t min;
    775	ext4_grpblk_t max;
    776	ext4_grpblk_t chunk;
    777	unsigned int border;
    778
    779	BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
    780
    781	border = 2 << sb->s_blocksize_bits;
    782
    783	while (len > 0) {
    784		/* find how many blocks can be covered since this position */
    785		max = ffs(first | border) - 1;
    786
    787		/* find how many blocks of power 2 we need to mark */
    788		min = fls(len) - 1;
    789
    790		if (max < min)
    791			min = max;
    792		chunk = 1 << min;
    793
    794		/* mark multiblock chunks only */
    795		grp->bb_counters[min]++;
    796		if (min > 0)
    797			mb_clear_bit(first >> min,
    798				     buddy + sbi->s_mb_offsets[min]);
    799
    800		len -= chunk;
    801		first += chunk;
    802	}
    803}
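/*
 * Editorial sketch, not part of mballoc.c: the chunk-splitting loop of
 * ext4_mb_mark_free_simple() in isolation, so the arithmetic can be followed
 * on a concrete input. With an assumed 4 KiB block size (border = 2 << 12),
 * first = 5 and len = 11 split into chunks of 1 @ 5, 2 @ 6 and 8 @ 8: each
 * chunk is the largest power of two that both fits in the remaining length
 * and keeps `first` aligned to it.
 */
static void mb_mark_free_simple_demo(void)
{
	unsigned int first = 5, len = 11;
	unsigned int border = 2 << 12;	/* assumed 4 KiB block size */

	while (len > 0) {
		int max = ffs(first | border) - 1;	/* alignment limit */
		int min = fls(len) - 1;			/* size limit */
		unsigned int chunk;

		if (max < min)
			min = max;
		chunk = 1U << min;
		/* the real code bumps grp->bb_counters[min] here and, for
		 * min > 0, clears one bit in the order-`min` buddy bitmap */
		len -= chunk;
		first += chunk;
	}
}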
    804
    805static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new,
    806			int (*cmp)(struct rb_node *, struct rb_node *))
    807{
    808	struct rb_node **iter = &root->rb_node, *parent = NULL;
    809
    810	while (*iter) {
    811		parent = *iter;
    812		if (cmp(new, *iter) > 0)
    813			iter = &((*iter)->rb_left);
    814		else
    815			iter = &((*iter)->rb_right);
    816	}
    817
    818	rb_link_node(new, parent, iter);
    819	rb_insert_color(new, root);
    820}
    821
    822static int
    823ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2)
    824{
    825	struct ext4_group_info *grp1 = rb_entry(rb1,
    826						struct ext4_group_info,
    827						bb_avg_fragment_size_rb);
    828	struct ext4_group_info *grp2 = rb_entry(rb2,
    829						struct ext4_group_info,
    830						bb_avg_fragment_size_rb);
    831	int num_frags_1, num_frags_2;
    832
    833	num_frags_1 = grp1->bb_fragments ?
    834		grp1->bb_free / grp1->bb_fragments : 0;
    835	num_frags_2 = grp2->bb_fragments ?
    836		grp2->bb_free / grp2->bb_fragments : 0;
    837
    838	return (num_frags_2 - num_frags_1);
    839}
    840
    841/*
    842 * Reinsert grpinfo into the avg_fragment_size tree with new average
    843 * fragment size.
    844 */
    845static void
    846mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
    847{
    848	struct ext4_sb_info *sbi = EXT4_SB(sb);
    849
    850	if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
    851		return;
    852
    853	write_lock(&sbi->s_mb_rb_lock);
    854	if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) {
    855		rb_erase(&grp->bb_avg_fragment_size_rb,
    856				&sbi->s_mb_avg_fragment_size_root);
    857		RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb);
    858	}
    859
    860	ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root,
    861		&grp->bb_avg_fragment_size_rb,
    862		ext4_mb_avg_fragment_size_cmp);
    863	write_unlock(&sbi->s_mb_rb_lock);
    864}
    865
    866/*
    867 * Choose next group by traversing largest_free_order lists. Updates *new_cr if
    868 * cr level needs an update.
    869 */
    870static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
    871			int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
    872{
    873	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    874	struct ext4_group_info *iter, *grp;
    875	int i;
    876
    877	if (ac->ac_status == AC_STATUS_FOUND)
    878		return;
    879
    880	if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
    881		atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
    882
    883	grp = NULL;
    884	for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
    885		if (list_empty(&sbi->s_mb_largest_free_orders[i]))
    886			continue;
    887		read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
    888		if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
    889			read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
    890			continue;
    891		}
    892		grp = NULL;
    893		list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
    894				    bb_largest_free_order_node) {
    895			if (sbi->s_mb_stats)
    896				atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
    897			if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
    898				grp = iter;
    899				break;
    900			}
    901		}
    902		read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
    903		if (grp)
    904			break;
    905	}
    906
    907	if (!grp) {
    908		/* Increment cr and search again */
    909		*new_cr = 1;
    910	} else {
    911		*group = grp->bb_group;
    912		ac->ac_last_optimal_group = *group;
    913		ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
    914	}
    915}
    916
    917/*
    918 * Choose next group by traversing average fragment size tree. Updates *new_cr
    919 * if cr level needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that
    920 * the linear search should continue for one iteration since there's lock
    921 * contention on the rb tree lock.
    922 */
    923static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
    924		int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
    925{
    926	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
    927	int avg_fragment_size, best_so_far;
    928	struct rb_node *node, *found;
    929	struct ext4_group_info *grp;
    930
    931	/*
    932	 * If there is contention on the lock, instead of waiting for the lock
    933	 * to become available, just continue searching linearly. We'll resume
    934	 * our rb tree search later starting at ac->ac_last_optimal_group.
    935	 */
    936	if (!read_trylock(&sbi->s_mb_rb_lock)) {
    937		ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR;
    938		return;
    939	}
    940
    941	if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
    942		if (sbi->s_mb_stats)
    943			atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
    944		/* We have found something at CR 1 in the past */
    945		grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group);
    946		for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL;
    947		     found = rb_next(found)) {
    948			grp = rb_entry(found, struct ext4_group_info,
    949				       bb_avg_fragment_size_rb);
    950			if (sbi->s_mb_stats)
    951				atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
    952			if (likely(ext4_mb_good_group(ac, grp->bb_group, 1)))
    953				break;
    954		}
    955		goto done;
    956	}
    957
    958	node = sbi->s_mb_avg_fragment_size_root.rb_node;
    959	best_so_far = 0;
    960	found = NULL;
    961
    962	while (node) {
    963		grp = rb_entry(node, struct ext4_group_info,
    964			       bb_avg_fragment_size_rb);
    965		avg_fragment_size = 0;
    966		if (ext4_mb_good_group(ac, grp->bb_group, 1)) {
    967			avg_fragment_size = grp->bb_fragments ?
    968				grp->bb_free / grp->bb_fragments : 0;
    969			if (!best_so_far || avg_fragment_size < best_so_far) {
    970				best_so_far = avg_fragment_size;
    971				found = node;
    972			}
    973		}
    974		if (avg_fragment_size > ac->ac_g_ex.fe_len)
    975			node = node->rb_right;
    976		else
    977			node = node->rb_left;
    978	}
    979
    980done:
    981	if (found) {
    982		grp = rb_entry(found, struct ext4_group_info,
    983			       bb_avg_fragment_size_rb);
    984		*group = grp->bb_group;
    985		ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
    986	} else {
    987		*new_cr = 2;
    988	}
    989
    990	read_unlock(&sbi->s_mb_rb_lock);
    991	ac->ac_last_optimal_group = *group;
    992}
    993
    994static inline int should_optimize_scan(struct ext4_allocation_context *ac)
    995{
    996	if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
    997		return 0;
    998	if (ac->ac_criteria >= 2)
    999		return 0;
   1000	if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
   1001		return 0;
   1002	return 1;
   1003}
   1004
   1005/*
   1006 * Return next linear group for allocation. If linear traversal should not be
   1007 * performed, this function just returns the same group
   1008 */
   1009static int
   1010next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
   1011{
   1012	if (!should_optimize_scan(ac))
   1013		goto inc_and_return;
   1014
   1015	if (ac->ac_groups_linear_remaining) {
   1016		ac->ac_groups_linear_remaining--;
   1017		goto inc_and_return;
   1018	}
   1019
   1020	if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) {
   1021		ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR;
   1022		goto inc_and_return;
   1023	}
   1024
   1025	return group;
   1026inc_and_return:
   1027	/*
   1028	 * Artificially restricted ngroups for non-extent
   1029	 * files makes group > ngroups possible on first loop.
   1030	 */
   1031	return group + 1 >= ngroups ? 0 : group + 1;
   1032}
   1033
   1034/*
   1035 * ext4_mb_choose_next_group: choose next group for allocation.
   1036 *
   1037 * @ac        Allocation Context
   1038 * @new_cr    This is an output parameter. If there is no good group
   1039 *            available at current CR level, this field is updated to indicate
   1040 *            the new cr level that should be used.
   1041 * @group     This is an input / output parameter. As an input it indicates the
   1042 *            next group that the allocator intends to use for allocation. As
   1043 *            output, this field indicates the next group that should be used as
   1044 *            determined by the optimization functions.
   1045 * @ngroups   Total number of groups
   1046 */
   1047static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
   1048		int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
   1049{
   1050	*new_cr = ac->ac_criteria;
   1051
   1052	if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining)
   1053		return;
   1054
   1055	if (*new_cr == 0) {
   1056		ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
   1057	} else if (*new_cr == 1) {
   1058		ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
   1059	} else {
   1060		/*
   1061		 * TODO: For CR=2, we can arrange groups in an rb tree sorted by
   1062		 * bb_free. But until that happens, we should never come here.
   1063		 */
   1064		WARN_ON(1);
   1065	}
   1066}
   1067
   1068/*
   1069 * Cache the order of the largest free extent we have available in this block
   1070 * group.
   1071 */
   1072static void
   1073mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
   1074{
   1075	struct ext4_sb_info *sbi = EXT4_SB(sb);
   1076	int i;
   1077
   1078	if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) {
   1079		write_lock(&sbi->s_mb_largest_free_orders_locks[
   1080					      grp->bb_largest_free_order]);
   1081		list_del_init(&grp->bb_largest_free_order_node);
   1082		write_unlock(&sbi->s_mb_largest_free_orders_locks[
   1083					      grp->bb_largest_free_order]);
   1084	}
   1085	grp->bb_largest_free_order = -1; /* uninit */
   1086
   1087	for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) {
   1088		if (grp->bb_counters[i] > 0) {
   1089			grp->bb_largest_free_order = i;
   1090			break;
   1091		}
   1092	}
   1093	if (test_opt2(sb, MB_OPTIMIZE_SCAN) &&
   1094	    grp->bb_largest_free_order >= 0 && grp->bb_free) {
   1095		write_lock(&sbi->s_mb_largest_free_orders_locks[
   1096					      grp->bb_largest_free_order]);
   1097		list_add_tail(&grp->bb_largest_free_order_node,
   1098		      &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
   1099		write_unlock(&sbi->s_mb_largest_free_orders_locks[
   1100					      grp->bb_largest_free_order]);
   1101	}
   1102}
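/*
 * Editorial sketch, not part of mballoc.c: what "largest free order" means
 * for mb_set_largest_free_order() above. bb_counters[i] counts free chunks
 * of 2^i clusters, and the largest i with a non-zero count is the order the
 * group is filed under in sbi->s_mb_largest_free_orders.
 */
static int toy_largest_free_order(const unsigned int *bb_counters, int nr_orders)
{
	int i;

	for (i = nr_orders - 1; i >= 0; i--)
		if (bb_counters[i] > 0)
			return i;
	return -1;	/* group has no free clusters at all */
}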
   1103
   1104static noinline_for_stack
   1105void ext4_mb_generate_buddy(struct super_block *sb,
   1106				void *buddy, void *bitmap, ext4_group_t group)
   1107{
   1108	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
   1109	struct ext4_sb_info *sbi = EXT4_SB(sb);
   1110	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
   1111	ext4_grpblk_t i = 0;
   1112	ext4_grpblk_t first;
   1113	ext4_grpblk_t len;
   1114	unsigned free = 0;
   1115	unsigned fragments = 0;
   1116	unsigned long long period = get_cycles();
   1117
   1118	/* initialize buddy from bitmap which is aggregation
   1119	 * of on-disk bitmap and preallocations */
   1120	i = mb_find_next_zero_bit(bitmap, max, 0);
   1121	grp->bb_first_free = i;
   1122	while (i < max) {
   1123		fragments++;
   1124		first = i;
   1125		i = mb_find_next_bit(bitmap, max, i);
   1126		len = i - first;
   1127		free += len;
   1128		if (len > 1)
   1129			ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
   1130		else
   1131			grp->bb_counters[0]++;
   1132		if (i < max)
   1133			i = mb_find_next_zero_bit(bitmap, max, i);
   1134	}
   1135	grp->bb_fragments = fragments;
   1136
   1137	if (free != grp->bb_free) {
   1138		ext4_grp_locked_error(sb, group, 0, 0,
   1139				      "block bitmap and bg descriptor "
   1140				      "inconsistent: %u vs %u free clusters",
   1141				      free, grp->bb_free);
   1142		/*
   1143		 * If we intend to continue, we consider group descriptor
   1144		 * corrupt and update bb_free using bitmap value
   1145		 */
   1146		grp->bb_free = free;
   1147		ext4_mark_group_bitmap_corrupted(sb, group,
   1148					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
   1149	}
   1150	mb_set_largest_free_order(sb, grp);
   1151
   1152	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
   1153
   1154	period = get_cycles() - period;
   1155	atomic_inc(&sbi->s_mb_buddies_generated);
   1156	atomic64_add(period, &sbi->s_mb_generation_time);
   1157	mb_update_avg_fragment_size(sb, grp);
   1158}
   1159
   1160/* The buddy information is attached to the buddy cache inode
   1161 * for convenience. The information regarding each group
   1162 * is loaded via ext4_mb_load_buddy. It consists of the
   1163 * block bitmap and the buddy information, and it is
   1164 * stored in the inode as
   1165 *
   1166 * {                        page                        }
   1167 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
   1168 *
   1169 *
   1170 * one block each for bitmap and buddy information.
   1171 * So for each group we take up 2 blocks. A page can
   1172 * contain blocks_per_page (PAGE_SIZE / blocksize)  blocks.
   1173 * So it can hold information for groups_per_page groups,
   1174 * which is blocks_per_page/2.
   1175 *
   1176 * Locking note:  This routine takes the block group lock of all groups
   1177 * for this page; do not hold this lock when calling this routine!
   1178 */
   1179
   1180static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
   1181{
   1182	ext4_group_t ngroups;
   1183	int blocksize;
   1184	int blocks_per_page;
   1185	int groups_per_page;
   1186	int err = 0;
   1187	int i;
   1188	ext4_group_t first_group, group;
   1189	int first_block;
   1190	struct super_block *sb;
   1191	struct buffer_head *bhs;
   1192	struct buffer_head **bh = NULL;
   1193	struct inode *inode;
   1194	char *data;
   1195	char *bitmap;
   1196	struct ext4_group_info *grinfo;
   1197
   1198	inode = page->mapping->host;
   1199	sb = inode->i_sb;
   1200	ngroups = ext4_get_groups_count(sb);
   1201	blocksize = i_blocksize(inode);
   1202	blocks_per_page = PAGE_SIZE / blocksize;
   1203
   1204	mb_debug(sb, "init page %lu\n", page->index);
   1205
   1206	groups_per_page = blocks_per_page >> 1;
   1207	if (groups_per_page == 0)
   1208		groups_per_page = 1;
   1209
   1210	/* allocate buffer_heads to read bitmaps */
   1211	if (groups_per_page > 1) {
   1212		i = sizeof(struct buffer_head *) * groups_per_page;
   1213		bh = kzalloc(i, gfp);
   1214		if (bh == NULL) {
   1215			err = -ENOMEM;
   1216			goto out;
   1217		}
   1218	} else
   1219		bh = &bhs;
   1220
   1221	first_group = page->index * blocks_per_page / 2;
   1222
   1223	/* read all groups the page covers into the cache */
   1224	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
   1225		if (group >= ngroups)
   1226			break;
   1227
   1228		grinfo = ext4_get_group_info(sb, group);
   1229		/*
   1230		 * If page is uptodate then we came here after online resize
   1231		 * which added some new uninitialized group info structs, so
   1232		 * we must skip all initialized uptodate buddies on the page,
   1233		 * which may be currently in use by an allocating task.
   1234		 */
   1235		if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
   1236			bh[i] = NULL;
   1237			continue;
   1238		}
   1239		bh[i] = ext4_read_block_bitmap_nowait(sb, group, false);
   1240		if (IS_ERR(bh[i])) {
   1241			err = PTR_ERR(bh[i]);
   1242			bh[i] = NULL;
   1243			goto out;
   1244		}
   1245		mb_debug(sb, "read bitmap for group %u\n", group);
   1246	}
   1247
   1248	/* wait for I/O completion */
   1249	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
   1250		int err2;
   1251
   1252		if (!bh[i])
   1253			continue;
   1254		err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
   1255		if (!err)
   1256			err = err2;
   1257	}
   1258
   1259	first_block = page->index * blocks_per_page;
   1260	for (i = 0; i < blocks_per_page; i++) {
   1261		group = (first_block + i) >> 1;
   1262		if (group >= ngroups)
   1263			break;
   1264
   1265		if (!bh[group - first_group])
   1266			/* skip initialized uptodate buddy */
   1267			continue;
   1268
   1269		if (!buffer_verified(bh[group - first_group]))
   1270			/* Skip faulty bitmaps */
   1271			continue;
   1272		err = 0;
   1273
   1274		/*
   1275		 * data carries information regarding this
   1276		 * particular group in the format specified
   1277		 * above
   1278		 *
   1279		 */
   1280		data = page_address(page) + (i * blocksize);
   1281		bitmap = bh[group - first_group]->b_data;
   1282
   1283		/*
   1284		 * We place the buddy block and bitmap block
   1285		 * close together
   1286		 */
   1287		if ((first_block + i) & 1) {
   1288			/* this is block of buddy */
   1289			BUG_ON(incore == NULL);
   1290			mb_debug(sb, "put buddy for group %u in page %lu/%x\n",
   1291				group, page->index, i * blocksize);
   1292			trace_ext4_mb_buddy_bitmap_load(sb, group);
   1293			grinfo = ext4_get_group_info(sb, group);
   1294			grinfo->bb_fragments = 0;
   1295			memset(grinfo->bb_counters, 0,
   1296			       sizeof(*grinfo->bb_counters) *
   1297			       (MB_NUM_ORDERS(sb)));
   1298			/*
   1299			 * incore got set to the group block bitmap below
   1300			 */
   1301			ext4_lock_group(sb, group);
   1302			/* init the buddy */
   1303			memset(data, 0xff, blocksize);
   1304			ext4_mb_generate_buddy(sb, data, incore, group);
   1305			ext4_unlock_group(sb, group);
   1306			incore = NULL;
   1307		} else {
   1308			/* this is block of bitmap */
   1309			BUG_ON(incore != NULL);
   1310			mb_debug(sb, "put bitmap for group %u in page %lu/%x\n",
   1311				group, page->index, i * blocksize);
   1312			trace_ext4_mb_bitmap_load(sb, group);
   1313
   1314			/* see comments in ext4_mb_put_pa() */
   1315			ext4_lock_group(sb, group);
   1316			memcpy(data, bitmap, blocksize);
   1317
   1318			/* mark all preallocated blks used in in-core bitmap */
   1319			ext4_mb_generate_from_pa(sb, data, group);
   1320			ext4_mb_generate_from_freelist(sb, data, group);
   1321			ext4_unlock_group(sb, group);
   1322
   1323			/* set incore so that the buddy information can be
   1324			 * generated using this
   1325			 */
   1326			incore = data;
   1327		}
   1328	}
   1329	SetPageUptodate(page);
   1330
   1331out:
   1332	if (bh) {
   1333		for (i = 0; i < groups_per_page; i++)
   1334			brelse(bh[i]);
   1335		if (bh != &bhs)
   1336			kfree(bh);
   1337	}
   1338	return err;
   1339}
   1340
   1341/*
   1342 * Lock the buddy and bitmap pages. This makes sure that a parallel init_group
   1343 * on the same buddy page doesn't happen while holding the buddy page lock.
   1344 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
   1345 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
   1346 */
   1347static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
   1348		ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
   1349{
   1350	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
   1351	int block, pnum, poff;
   1352	int blocks_per_page;
   1353	struct page *page;
   1354
   1355	e4b->bd_buddy_page = NULL;
   1356	e4b->bd_bitmap_page = NULL;
   1357
   1358	blocks_per_page = PAGE_SIZE / sb->s_blocksize;
   1359	/*
   1360	 * the buddy cache inode stores the block bitmap
   1361	 * and buddy information in consecutive blocks.
   1362	 * So for each group we need two blocks.
   1363	 */
   1364	block = group * 2;
   1365	pnum = block / blocks_per_page;
   1366	poff = block % blocks_per_page;
   1367	page = find_or_create_page(inode->i_mapping, pnum, gfp);
   1368	if (!page)
   1369		return -ENOMEM;
   1370	BUG_ON(page->mapping != inode->i_mapping);
   1371	e4b->bd_bitmap_page = page;
   1372	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
   1373
   1374	if (blocks_per_page >= 2) {
   1375		/* buddy and bitmap are on the same page */
   1376		return 0;
   1377	}
   1378
   1379	block++;
   1380	pnum = block / blocks_per_page;
   1381	page = find_or_create_page(inode->i_mapping, pnum, gfp);
   1382	if (!page)
   1383		return -ENOMEM;
   1384	BUG_ON(page->mapping != inode->i_mapping);
   1385	e4b->bd_buddy_page = page;
   1386	return 0;
   1387}
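/*
 * Editorial example, not part of mballoc.c: the page arithmetic used by
 * ext4_mb_get_buddy_page_lock() above, for two assumed configurations.
 * With 4 KiB pages and 1 KiB blocks (blocks_per_page = 4), group 5 maps to
 * bitmap block 10, i.e. page 2 at offset 2, and buddy block 11 lands on the
 * same page, so bd_buddy_page stays NULL. With 4 KiB blocks
 * (blocks_per_page = 1), the bitmap of group 5 sits alone on page 10 and
 * the buddy needs a second page, page 11, so both pages must be locked.
 */
static void toy_buddy_page_math(unsigned int group, unsigned int blocks_per_page)
{
	unsigned int block = group * 2;			/* bitmap block index */
	unsigned int bitmap_pnum = block / blocks_per_page;
	unsigned int bitmap_poff = block % blocks_per_page;
	unsigned int buddy_pnum = (block + 1) / blocks_per_page;
	bool same_page = (bitmap_pnum == buddy_pnum);

	(void)bitmap_poff;
	(void)same_page;
}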
   1388
   1389static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
   1390{
   1391	if (e4b->bd_bitmap_page) {
   1392		unlock_page(e4b->bd_bitmap_page);
   1393		put_page(e4b->bd_bitmap_page);
   1394	}
   1395	if (e4b->bd_buddy_page) {
   1396		unlock_page(e4b->bd_buddy_page);
   1397		put_page(e4b->bd_buddy_page);
   1398	}
   1399}
   1400
   1401/*
   1402 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
   1403 * block group lock of all groups for this page; do not hold the BG lock when
   1404 * calling this routine!
   1405 */
   1406static noinline_for_stack
   1407int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
   1408{
   1409
   1410	struct ext4_group_info *this_grp;
   1411	struct ext4_buddy e4b;
   1412	struct page *page;
   1413	int ret = 0;
   1414
   1415	might_sleep();
   1416	mb_debug(sb, "init group %u\n", group);
   1417	this_grp = ext4_get_group_info(sb, group);
   1418	/*
   1419	 * This ensures that we don't reinit the buddy cache
   1420	 * page which maps to the group from which we are already
   1421	 * allocating. If we are looking at the buddy cache we would
   1422	 * have taken a reference using ext4_mb_load_buddy and that
   1423	 * would have pinned the buddy page to the page cache.
   1424	 * The call to ext4_mb_get_buddy_page_lock will mark the
   1425	 * page accessed.
   1426	 */
   1427	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
   1428	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
   1429		/*
   1430		 * somebody initialized the group
   1431		 * return without doing anything
   1432		 */
   1433		goto err;
   1434	}
   1435
   1436	page = e4b.bd_bitmap_page;
   1437	ret = ext4_mb_init_cache(page, NULL, gfp);
   1438	if (ret)
   1439		goto err;
   1440	if (!PageUptodate(page)) {
   1441		ret = -EIO;
   1442		goto err;
   1443	}
   1444
   1445	if (e4b.bd_buddy_page == NULL) {
   1446		/*
   1447		 * If both the bitmap and buddy are in
   1448		 * the same page we don't need to force
   1449		 * init the buddy
   1450		 */
   1451		ret = 0;
   1452		goto err;
   1453	}
   1454	/* init buddy cache */
   1455	page = e4b.bd_buddy_page;
   1456	ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
   1457	if (ret)
   1458		goto err;
   1459	if (!PageUptodate(page)) {
   1460		ret = -EIO;
   1461		goto err;
   1462	}
   1463err:
   1464	ext4_mb_put_buddy_page_lock(&e4b);
   1465	return ret;
   1466}
   1467
   1468/*
   1469 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
   1470 * block group lock of all groups for this page; do not hold the BG lock when
   1471 * calling this routine!
   1472 */
   1473static noinline_for_stack int
   1474ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
   1475		       struct ext4_buddy *e4b, gfp_t gfp)
   1476{
   1477	int blocks_per_page;
   1478	int block;
   1479	int pnum;
   1480	int poff;
   1481	struct page *page;
   1482	int ret;
   1483	struct ext4_group_info *grp;
   1484	struct ext4_sb_info *sbi = EXT4_SB(sb);
   1485	struct inode *inode = sbi->s_buddy_cache;
   1486
   1487	might_sleep();
   1488	mb_debug(sb, "load group %u\n", group);
   1489
   1490	blocks_per_page = PAGE_SIZE / sb->s_blocksize;
   1491	grp = ext4_get_group_info(sb, group);
   1492
   1493	e4b->bd_blkbits = sb->s_blocksize_bits;
   1494	e4b->bd_info = grp;
   1495	e4b->bd_sb = sb;
   1496	e4b->bd_group = group;
   1497	e4b->bd_buddy_page = NULL;
   1498	e4b->bd_bitmap_page = NULL;
   1499
   1500	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
   1501		/*
   1502		 * we need full data about the group
   1503		 * to make a good selection
   1504		 */
   1505		ret = ext4_mb_init_group(sb, group, gfp);
   1506		if (ret)
   1507			return ret;
   1508	}
   1509
   1510	/*
   1511	 * the buddy cache inode stores the block bitmap
   1512	 * and buddy information in consecutive blocks.
   1513	 * So for each group we need two blocks.
   1514	 */
   1515	block = group * 2;
   1516	pnum = block / blocks_per_page;
   1517	poff = block % blocks_per_page;
   1518
    1519	/* we could use find_or_create_page(), but it locks the page,
    1520	 * which we'd like to avoid in the fast path ... */
   1521	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
   1522	if (page == NULL || !PageUptodate(page)) {
   1523		if (page)
    1524			/*
    1525			 * drop the page reference and try to
    1526			 * get the page with the lock held. If
    1527			 * the page is not uptodate, somebody
    1528			 * just created it but has not yet
    1529			 * initialized it, so wait for the
    1530			 * initialization to complete.
    1531			 */
   1532			put_page(page);
   1533		page = find_or_create_page(inode->i_mapping, pnum, gfp);
   1534		if (page) {
   1535			BUG_ON(page->mapping != inode->i_mapping);
   1536			if (!PageUptodate(page)) {
   1537				ret = ext4_mb_init_cache(page, NULL, gfp);
   1538				if (ret) {
   1539					unlock_page(page);
   1540					goto err;
   1541				}
   1542				mb_cmp_bitmaps(e4b, page_address(page) +
   1543					       (poff * sb->s_blocksize));
   1544			}
   1545			unlock_page(page);
   1546		}
   1547	}
   1548	if (page == NULL) {
   1549		ret = -ENOMEM;
   1550		goto err;
   1551	}
   1552	if (!PageUptodate(page)) {
   1553		ret = -EIO;
   1554		goto err;
   1555	}
   1556
   1557	/* Pages marked accessed already */
   1558	e4b->bd_bitmap_page = page;
   1559	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
   1560
   1561	block++;
   1562	pnum = block / blocks_per_page;
   1563	poff = block % blocks_per_page;
   1564
   1565	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
   1566	if (page == NULL || !PageUptodate(page)) {
   1567		if (page)
   1568			put_page(page);
   1569		page = find_or_create_page(inode->i_mapping, pnum, gfp);
   1570		if (page) {
   1571			BUG_ON(page->mapping != inode->i_mapping);
   1572			if (!PageUptodate(page)) {
   1573				ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
   1574							 gfp);
   1575				if (ret) {
   1576					unlock_page(page);
   1577					goto err;
   1578				}
   1579			}
   1580			unlock_page(page);
   1581		}
   1582	}
   1583	if (page == NULL) {
   1584		ret = -ENOMEM;
   1585		goto err;
   1586	}
   1587	if (!PageUptodate(page)) {
   1588		ret = -EIO;
   1589		goto err;
   1590	}
   1591
   1592	/* Pages marked accessed already */
   1593	e4b->bd_buddy_page = page;
   1594	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
   1595
   1596	return 0;
   1597
   1598err:
   1599	if (page)
   1600		put_page(page);
   1601	if (e4b->bd_bitmap_page)
   1602		put_page(e4b->bd_bitmap_page);
   1603	if (e4b->bd_buddy_page)
   1604		put_page(e4b->bd_buddy_page);
   1605	e4b->bd_buddy = NULL;
   1606	e4b->bd_bitmap = NULL;
   1607	return ret;
   1608}
   1609
   1610static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
   1611			      struct ext4_buddy *e4b)
   1612{
   1613	return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
   1614}
   1615
   1616static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
   1617{
   1618	if (e4b->bd_bitmap_page)
   1619		put_page(e4b->bd_bitmap_page);
   1620	if (e4b->bd_buddy_page)
   1621		put_page(e4b->bd_buddy_page);
   1622}
   1623
   1624
   1625static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
   1626{
   1627	int order = 1, max;
   1628	void *bb;
   1629
   1630	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
   1631	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
   1632
   1633	while (order <= e4b->bd_blkbits + 1) {
   1634		bb = mb_find_buddy(e4b, order, &max);
   1635		if (!mb_test_bit(block >> order, bb)) {
   1636			/* this block is part of buddy of order 'order' */
   1637			return order;
   1638		}
   1639		order++;
   1640	}
   1641	return 0;
   1642}
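
/*
 * Illustrative note (added for exposition, not part of the original
 * source): for block 12 the loop above tests bit 12 >> 1 = 6 in the
 * order-1 buddy, then bit 12 >> 2 = 3 in the order-2 buddy, and so on,
 * returning the first order whose buddy bit is clear, i.e. the order at
 * which the free chunk containing this block is currently tracked.
 */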
   1643
   1644static void mb_clear_bits(void *bm, int cur, int len)
   1645{
   1646	__u32 *addr;
   1647
   1648	len = cur + len;
   1649	while (cur < len) {
   1650		if ((cur & 31) == 0 && (len - cur) >= 32) {
   1651			/* fast path: clear whole word at once */
   1652			addr = bm + (cur >> 3);
   1653			*addr = 0;
   1654			cur += 32;
   1655			continue;
   1656		}
   1657		mb_clear_bit(cur, bm);
   1658		cur++;
   1659	}
   1660}
   1661
    1662/* Clear bits in the given range; return the first bit that was
    1663 * already zero, or -1 if all bits in the range were set.
    1664 */
   1665static int mb_test_and_clear_bits(void *bm, int cur, int len)
   1666{
   1667	__u32 *addr;
   1668	int zero_bit = -1;
   1669
   1670	len = cur + len;
   1671	while (cur < len) {
   1672		if ((cur & 31) == 0 && (len - cur) >= 32) {
   1673			/* fast path: clear whole word at once */
   1674			addr = bm + (cur >> 3);
   1675			if (*addr != (__u32)(-1) && zero_bit == -1)
   1676				zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
   1677			*addr = 0;
   1678			cur += 32;
   1679			continue;
   1680		}
   1681		if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
   1682			zero_bit = cur;
   1683		cur++;
   1684	}
   1685
   1686	return zero_bit;
   1687}
   1688
   1689void mb_set_bits(void *bm, int cur, int len)
   1690{
   1691	__u32 *addr;
   1692
   1693	len = cur + len;
   1694	while (cur < len) {
   1695		if ((cur & 31) == 0 && (len - cur) >= 32) {
   1696			/* fast path: set whole word at once */
   1697			addr = bm + (cur >> 3);
   1698			*addr = 0xffffffff;
   1699			cur += 32;
   1700			continue;
   1701		}
   1702		mb_set_bit(cur, bm);
   1703		cur++;
   1704	}
   1705}
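
/*
 * Illustrative walk-through of the word-at-a-time fast path shared by
 * mb_clear_bits(), mb_test_and_clear_bits() and mb_set_bits() above
 * (added for exposition, not part of the original source).
 * For cur = 5, len = 70 the loop covers bits [5, 75):
 *
 *   bits  5..31 : handled one at a time by the per-bit helpers;
 *   bits 32..63 : cur is 32-bit aligned and at least 32 bits remain, so
 *                 the whole __u32 at byte offset (32 >> 3) == 4 is
 *                 handled with a single word access (0 or 0xffffffff);
 *   bits 64..74 : fewer than 32 bits remain, back to the per-bit path.
 *
 * The (cur & 31) == 0 check is what keeps the "bm + (cur >> 3)" pointer
 * word-aligned, so the single-word path never straddles a word.
 */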
   1706
   1707static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
   1708{
   1709	if (mb_test_bit(*bit + side, bitmap)) {
   1710		mb_clear_bit(*bit, bitmap);
   1711		(*bit) -= side;
   1712		return 1;
   1713	}
   1714	else {
   1715		(*bit) += side;
   1716		mb_set_bit(*bit, bitmap);
   1717		return -1;
   1718	}
   1719}
   1720
   1721static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
   1722{
   1723	int max;
   1724	int order = 1;
   1725	void *buddy = mb_find_buddy(e4b, order, &max);
   1726
   1727	while (buddy) {
   1728		void *buddy2;
   1729
   1730		/* Bits in range [first; last] are known to be set since
   1731		 * corresponding blocks were allocated. Bits in range
   1732		 * (first; last) will stay set because they form buddies on
   1733		 * upper layer. We just deal with borders if they don't
   1734		 * align with upper layer and then go up.
   1735		 * Releasing entire group is all about clearing
   1736		 * single bit of highest order buddy.
   1737		 */
   1738
   1739		/* Example:
   1740		 * ---------------------------------
   1741		 * |   1   |   1   |   1   |   1   |
   1742		 * ---------------------------------
   1743		 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
   1744		 * ---------------------------------
   1745		 *   0   1   2   3   4   5   6   7
   1746		 *      \_____________________/
   1747		 *
    1748		 * Neither [1] nor [6] is aligned to the layer above.
    1749		 * Left neighbour [0] is free, so mark it busy,
    1750		 * decrease bb_counters and extend the range to
    1751		 * [0; 6].
    1752		 * Right neighbour [7] is busy. It can't be coalesced with [6], so
    1753		 * mark [6] free, increase bb_counters and shrink the range to
    1754		 * [0; 5].
    1755		 * Then shift the range to [0; 2], go up and do the same.
   1756		 */
   1757
   1758
   1759		if (first & 1)
   1760			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
   1761		if (!(last & 1))
   1762			e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
   1763		if (first > last)
   1764			break;
   1765		order++;
   1766
   1767		if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
   1768			mb_clear_bits(buddy, first, last - first + 1);
   1769			e4b->bd_info->bb_counters[order - 1] += last - first + 1;
   1770			break;
   1771		}
   1772		first >>= 1;
   1773		last >>= 1;
   1774		buddy = buddy2;
   1775	}
   1776}
   1777
   1778static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
   1779			   int first, int count)
   1780{
   1781	int left_is_free = 0;
   1782	int right_is_free = 0;
   1783	int block;
   1784	int last = first + count - 1;
   1785	struct super_block *sb = e4b->bd_sb;
   1786
   1787	if (WARN_ON(count == 0))
   1788		return;
   1789	BUG_ON(last >= (sb->s_blocksize << 3));
   1790	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
   1791	/* Don't bother if the block group is corrupt. */
   1792	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
   1793		return;
   1794
   1795	mb_check_buddy(e4b);
   1796	mb_free_blocks_double(inode, e4b, first, count);
   1797
   1798	this_cpu_inc(discard_pa_seq);
   1799	e4b->bd_info->bb_free += count;
   1800	if (first < e4b->bd_info->bb_first_free)
   1801		e4b->bd_info->bb_first_free = first;
   1802
   1803	/* access memory sequentially: check left neighbour,
   1804	 * clear range and then check right neighbour
   1805	 */
   1806	if (first != 0)
   1807		left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
   1808	block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
   1809	if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
   1810		right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
   1811
   1812	if (unlikely(block != -1)) {
   1813		struct ext4_sb_info *sbi = EXT4_SB(sb);
   1814		ext4_fsblk_t blocknr;
   1815
   1816		blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
   1817		blocknr += EXT4_C2B(sbi, block);
   1818		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
   1819			ext4_grp_locked_error(sb, e4b->bd_group,
   1820					      inode ? inode->i_ino : 0,
   1821					      blocknr,
   1822					      "freeing already freed block (bit %u); block bitmap corrupt.",
   1823					      block);
   1824			ext4_mark_group_bitmap_corrupted(
   1825				sb, e4b->bd_group,
   1826				EXT4_GROUP_INFO_BBITMAP_CORRUPT);
   1827		}
   1828		goto done;
   1829	}
   1830
   1831	/* let's maintain fragments counter */
   1832	if (left_is_free && right_is_free)
   1833		e4b->bd_info->bb_fragments--;
   1834	else if (!left_is_free && !right_is_free)
   1835		e4b->bd_info->bb_fragments++;
   1836
   1837	/* buddy[0] == bd_bitmap is a special case, so handle
   1838	 * it right away and let mb_buddy_mark_free stay free of
   1839	 * zero order checks.
    1840	 * Check if neighbours are to be coalesced, and
    1841	 * adjust the bitmap, bb_counters and borders appropriately.
   1842	 */
   1843	if (first & 1) {
   1844		first += !left_is_free;
   1845		e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
   1846	}
   1847	if (!(last & 1)) {
   1848		last -= !right_is_free;
   1849		e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
   1850	}
   1851
   1852	if (first <= last)
   1853		mb_buddy_mark_free(e4b, first >> 1, last >> 1);
   1854
   1855done:
   1856	mb_set_largest_free_order(sb, e4b->bd_info);
   1857	mb_update_avg_fragment_size(sb, e4b->bd_info);
   1858	mb_check_buddy(e4b);
   1859}
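
/*
 * Worked example for the order-0 border handling in mb_free_blocks()
 * (added for exposition, not part of the original source). Suppose
 * clusters 3..8 are freed while both neighbours (2 and 9) are still
 * allocated:
 *
 *   first = 3 (odd)  : its order-0 buddy, cluster 2, is busy, so cluster
 *                      3 stays a lone order-0 free block; bb_counters[0]
 *                      is incremented and first becomes 4.
 *   last  = 8 (even) : its order-0 buddy, cluster 9, is busy, so cluster
 *                      8 also stays at order 0; bb_counters[0] is
 *                      incremented and last becomes 7.
 *
 * The remaining range [4; 7] is buddy-aligned on both sides and is handed
 * to mb_buddy_mark_free() as the order-1 range [2; 3].
 */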
   1860
   1861static int mb_find_extent(struct ext4_buddy *e4b, int block,
   1862				int needed, struct ext4_free_extent *ex)
   1863{
   1864	int next = block;
   1865	int max, order;
   1866	void *buddy;
   1867
   1868	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
   1869	BUG_ON(ex == NULL);
   1870
   1871	buddy = mb_find_buddy(e4b, 0, &max);
   1872	BUG_ON(buddy == NULL);
   1873	BUG_ON(block >= max);
   1874	if (mb_test_bit(block, buddy)) {
   1875		ex->fe_len = 0;
   1876		ex->fe_start = 0;
   1877		ex->fe_group = 0;
   1878		return 0;
   1879	}
   1880
   1881	/* find actual order */
   1882	order = mb_find_order_for_block(e4b, block);
   1883	block = block >> order;
   1884
   1885	ex->fe_len = 1 << order;
   1886	ex->fe_start = block << order;
   1887	ex->fe_group = e4b->bd_group;
   1888
   1889	/* calc difference from given start */
   1890	next = next - ex->fe_start;
   1891	ex->fe_len -= next;
   1892	ex->fe_start += next;
   1893
   1894	while (needed > ex->fe_len &&
   1895	       mb_find_buddy(e4b, order, &max)) {
   1896
   1897		if (block + 1 >= max)
   1898			break;
   1899
   1900		next = (block + 1) * (1 << order);
   1901		if (mb_test_bit(next, e4b->bd_bitmap))
   1902			break;
   1903
   1904		order = mb_find_order_for_block(e4b, next);
   1905
   1906		block = next >> order;
   1907		ex->fe_len += 1 << order;
   1908	}
   1909
   1910	if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) {
   1911		/* Should never happen! (but apparently sometimes does?!?) */
   1912		WARN_ON(1);
   1913		ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0,
   1914			"corruption or bug in mb_find_extent "
   1915			"block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
   1916			block, order, needed, ex->fe_group, ex->fe_start,
   1917			ex->fe_len, ex->fe_logical);
   1918		ex->fe_len = 0;
   1919		ex->fe_start = 0;
   1920		ex->fe_group = 0;
   1921	}
   1922	return ex->fe_len;
   1923}
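
/*
 * Example of the order arithmetic in mb_find_extent() (added for
 * exposition, not part of the original source). If cluster 22 is free and
 * mb_find_order_for_block() reports order 2, the containing aligned chunk
 * is [20; 23]: block >> 2 == 5, so fe_start = 5 << 2 = 20 and fe_len = 4.
 * The "next = next - ex->fe_start" adjustment then trims the extent to
 * [22; 23] so that it always begins at the cluster the caller asked
 * about; further free chunks to the right are appended only while
 * "needed" has not yet been satisfied.
 */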
   1924
   1925static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
   1926{
   1927	int ord;
   1928	int mlen = 0;
   1929	int max = 0;
   1930	int cur;
   1931	int start = ex->fe_start;
   1932	int len = ex->fe_len;
   1933	unsigned ret = 0;
   1934	int len0 = len;
   1935	void *buddy;
   1936
   1937	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
   1938	BUG_ON(e4b->bd_group != ex->fe_group);
   1939	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
   1940	mb_check_buddy(e4b);
   1941	mb_mark_used_double(e4b, start, len);
   1942
   1943	this_cpu_inc(discard_pa_seq);
   1944	e4b->bd_info->bb_free -= len;
   1945	if (e4b->bd_info->bb_first_free == start)
   1946		e4b->bd_info->bb_first_free += len;
   1947
   1948	/* let's maintain fragments counter */
   1949	if (start != 0)
   1950		mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
   1951	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
   1952		max = !mb_test_bit(start + len, e4b->bd_bitmap);
   1953	if (mlen && max)
   1954		e4b->bd_info->bb_fragments++;
   1955	else if (!mlen && !max)
   1956		e4b->bd_info->bb_fragments--;
   1957
   1958	/* let's maintain buddy itself */
   1959	while (len) {
   1960		ord = mb_find_order_for_block(e4b, start);
   1961
   1962		if (((start >> ord) << ord) == start && len >= (1 << ord)) {
   1963			/* the whole chunk may be allocated at once! */
   1964			mlen = 1 << ord;
   1965			buddy = mb_find_buddy(e4b, ord, &max);
   1966			BUG_ON((start >> ord) >= max);
   1967			mb_set_bit(start >> ord, buddy);
   1968			e4b->bd_info->bb_counters[ord]--;
   1969			start += mlen;
   1970			len -= mlen;
   1971			BUG_ON(len < 0);
   1972			continue;
   1973		}
   1974
   1975		/* store for history */
   1976		if (ret == 0)
   1977			ret = len | (ord << 16);
   1978
   1979		/* we have to split large buddy */
   1980		BUG_ON(ord <= 0);
   1981		buddy = mb_find_buddy(e4b, ord, &max);
   1982		mb_set_bit(start >> ord, buddy);
   1983		e4b->bd_info->bb_counters[ord]--;
   1984
   1985		ord--;
   1986		cur = (start >> ord) & ~1U;
   1987		buddy = mb_find_buddy(e4b, ord, &max);
   1988		mb_clear_bit(cur, buddy);
   1989		mb_clear_bit(cur + 1, buddy);
   1990		e4b->bd_info->bb_counters[ord]++;
   1991		e4b->bd_info->bb_counters[ord]++;
   1992	}
   1993	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
   1994
   1995	mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
   1996	mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
   1997	mb_check_buddy(e4b);
   1998
   1999	return ret;
   2000}
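
/*
 * Note on the return value of mb_mark_used() (added for exposition, not
 * part of the original source): the first time a larger buddy has to be
 * split, the still-unmarked length and the order being split are packed
 * into one word, ret = len | (ord << 16). ext4_mb_use_best_found() later
 * unpacks this as ac_tail = ret & 0xffff and ac_buddy = ret >> 16; e.g.
 * splitting an order-3 buddy with 5 clusters left to mark yields
 * ret = 5 | (3 << 16) = 0x30005, i.e. ac_tail = 5, ac_buddy = 3.
 */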
   2001
   2002/*
   2003 * Must be called under group lock!
   2004 */
   2005static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
   2006					struct ext4_buddy *e4b)
   2007{
   2008	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
   2009	int ret;
   2010
   2011	BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
   2012	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
   2013
   2014	ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
   2015	ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
   2016	ret = mb_mark_used(e4b, &ac->ac_b_ex);
   2017
   2018	/* preallocation can change ac_b_ex, thus we store actually
   2019	 * allocated blocks for history */
   2020	ac->ac_f_ex = ac->ac_b_ex;
   2021
   2022	ac->ac_status = AC_STATUS_FOUND;
   2023	ac->ac_tail = ret & 0xffff;
   2024	ac->ac_buddy = ret >> 16;
   2025
   2026	/*
   2027	 * take the page reference. We want the page to be pinned
    2028	 * so that we don't get an ext4_mb_init_cache() call for this
    2029	 * group until we update the bitmap. That would mean we
    2030	 * could double allocate blocks. The reference is dropped
    2031	 * in ext4_mb_release_context().
   2032	 */
   2033	ac->ac_bitmap_page = e4b->bd_bitmap_page;
   2034	get_page(ac->ac_bitmap_page);
   2035	ac->ac_buddy_page = e4b->bd_buddy_page;
   2036	get_page(ac->ac_buddy_page);
   2037	/* store last allocated for subsequent stream allocation */
   2038	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
   2039		spin_lock(&sbi->s_md_lock);
   2040		sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
   2041		sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
   2042		spin_unlock(&sbi->s_md_lock);
   2043	}
   2044	/*
    2045	 * As we've just preallocated more space than the
    2046	 * user originally requested, we store the allocated
    2047	 * space in a special descriptor.
   2048	 */
   2049	if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
   2050		ext4_mb_new_preallocation(ac);
   2051
   2052}
   2053
   2054static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
   2055					struct ext4_buddy *e4b,
   2056					int finish_group)
   2057{
   2058	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
   2059	struct ext4_free_extent *bex = &ac->ac_b_ex;
   2060	struct ext4_free_extent *gex = &ac->ac_g_ex;
   2061	struct ext4_free_extent ex;
   2062	int max;
   2063
   2064	if (ac->ac_status == AC_STATUS_FOUND)
   2065		return;
   2066	/*
   2067	 * We don't want to scan for a whole year
   2068	 */
   2069	if (ac->ac_found > sbi->s_mb_max_to_scan &&
   2070			!(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
   2071		ac->ac_status = AC_STATUS_BREAK;
   2072		return;
   2073	}
   2074
   2075	/*
   2076	 * Haven't found good chunk so far, let's continue
   2077	 */
   2078	if (bex->fe_len < gex->fe_len)
   2079		return;
   2080
   2081	if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
   2082			&& bex->fe_group == e4b->bd_group) {
   2083		/* recheck chunk's availability - we don't know
   2084		 * when it was found (within this lock-unlock
   2085		 * period or not) */
   2086		max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
   2087		if (max >= gex->fe_len) {
   2088			ext4_mb_use_best_found(ac, e4b);
   2089			return;
   2090		}
   2091	}
   2092}
   2093
   2094/*
    2095 * The routine checks whether the found extent is good enough. If it is,
    2096 * the extent is marked used and a flag is set in the context to stop
    2097 * scanning. Otherwise, the extent is compared with the previously found
    2098 * extent and, if the new one is better, it is stored in the context.
    2099 * Later, the best found extent will be used if mballoc can't find a
    2100 * good enough extent.
   2101 *
   2102 * FIXME: real allocation policy is to be designed yet!
   2103 */
   2104static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
   2105					struct ext4_free_extent *ex,
   2106					struct ext4_buddy *e4b)
   2107{
   2108	struct ext4_free_extent *bex = &ac->ac_b_ex;
   2109	struct ext4_free_extent *gex = &ac->ac_g_ex;
   2110
   2111	BUG_ON(ex->fe_len <= 0);
   2112	BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
   2113	BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
   2114	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
   2115
   2116	ac->ac_found++;
   2117
   2118	/*
   2119	 * The special case - take what you catch first
   2120	 */
   2121	if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
   2122		*bex = *ex;
   2123		ext4_mb_use_best_found(ac, e4b);
   2124		return;
   2125	}
   2126
   2127	/*
    2128	 * Let's check whether the chunk is good enough
   2129	 */
   2130	if (ex->fe_len == gex->fe_len) {
   2131		*bex = *ex;
   2132		ext4_mb_use_best_found(ac, e4b);
   2133		return;
   2134	}
   2135
   2136	/*
   2137	 * If this is first found extent, just store it in the context
   2138	 */
   2139	if (bex->fe_len == 0) {
   2140		*bex = *ex;
   2141		return;
   2142	}
   2143
   2144	/*
   2145	 * If new found extent is better, store it in the context
   2146	 */
   2147	if (bex->fe_len < gex->fe_len) {
   2148		/* if the request isn't satisfied, any found extent
   2149		 * larger than previous best one is better */
   2150		if (ex->fe_len > bex->fe_len)
   2151			*bex = *ex;
   2152	} else if (ex->fe_len > gex->fe_len) {
   2153		/* if the request is satisfied, then we try to find
    2154		 * an extent that still satisfies the request, but is
    2155		 * smaller than the previous one */
   2156		if (ex->fe_len < bex->fe_len)
   2157			*bex = *ex;
   2158	}
   2159
   2160	ext4_mb_check_limits(ac, e4b, 0);
   2161}
   2162
   2163static noinline_for_stack
   2164int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
   2165					struct ext4_buddy *e4b)
   2166{
   2167	struct ext4_free_extent ex = ac->ac_b_ex;
   2168	ext4_group_t group = ex.fe_group;
   2169	int max;
   2170	int err;
   2171
   2172	BUG_ON(ex.fe_len <= 0);
   2173	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
   2174	if (err)
   2175		return err;
   2176
   2177	ext4_lock_group(ac->ac_sb, group);
   2178	max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
   2179
   2180	if (max > 0) {
   2181		ac->ac_b_ex = ex;
   2182		ext4_mb_use_best_found(ac, e4b);
   2183	}
   2184
   2185	ext4_unlock_group(ac->ac_sb, group);
   2186	ext4_mb_unload_buddy(e4b);
   2187
   2188	return 0;
   2189}
   2190
   2191static noinline_for_stack
   2192int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
   2193				struct ext4_buddy *e4b)
   2194{
   2195	ext4_group_t group = ac->ac_g_ex.fe_group;
   2196	int max;
   2197	int err;
   2198	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
   2199	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
   2200	struct ext4_free_extent ex;
   2201
   2202	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
   2203		return 0;
   2204	if (grp->bb_free == 0)
   2205		return 0;
   2206
   2207	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
   2208	if (err)
   2209		return err;
   2210
   2211	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
   2212		ext4_mb_unload_buddy(e4b);
   2213		return 0;
   2214	}
   2215
   2216	ext4_lock_group(ac->ac_sb, group);
   2217	max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
   2218			     ac->ac_g_ex.fe_len, &ex);
   2219	ex.fe_logical = 0xDEADFA11; /* debug value */
   2220
   2221	if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
   2222		ext4_fsblk_t start;
   2223
   2224		start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
   2225			ex.fe_start;
   2226		/* use do_div to get remainder (would be 64-bit modulo) */
   2227		if (do_div(start, sbi->s_stripe) == 0) {
   2228			ac->ac_found++;
   2229			ac->ac_b_ex = ex;
   2230			ext4_mb_use_best_found(ac, e4b);
   2231		}
   2232	} else if (max >= ac->ac_g_ex.fe_len) {
   2233		BUG_ON(ex.fe_len <= 0);
   2234		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
   2235		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
   2236		ac->ac_found++;
   2237		ac->ac_b_ex = ex;
   2238		ext4_mb_use_best_found(ac, e4b);
   2239	} else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
    2240		/* Sometimes, the caller may want to merge even a small
    2241		 * number of blocks into an existing extent */
   2242		BUG_ON(ex.fe_len <= 0);
   2243		BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
   2244		BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
   2245		ac->ac_found++;
   2246		ac->ac_b_ex = ex;
   2247		ext4_mb_use_best_found(ac, e4b);
   2248	}
   2249	ext4_unlock_group(ac->ac_sb, group);
   2250	ext4_mb_unload_buddy(e4b);
   2251
   2252	return 0;
   2253}
   2254
   2255/*
    2256 * The routine scans the buddy structures (not the bitmap!) from the given
    2257 * order to the max order, trying to find a big enough chunk to satisfy the request
   2258 */
   2259static noinline_for_stack
   2260void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
   2261					struct ext4_buddy *e4b)
   2262{
   2263	struct super_block *sb = ac->ac_sb;
   2264	struct ext4_group_info *grp = e4b->bd_info;
   2265	void *buddy;
   2266	int i;
   2267	int k;
   2268	int max;
   2269
   2270	BUG_ON(ac->ac_2order <= 0);
   2271	for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) {
   2272		if (grp->bb_counters[i] == 0)
   2273			continue;
   2274
   2275		buddy = mb_find_buddy(e4b, i, &max);
   2276		BUG_ON(buddy == NULL);
   2277
   2278		k = mb_find_next_zero_bit(buddy, max, 0);
   2279		if (k >= max) {
   2280			ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
   2281				"%d free clusters of order %d. But found 0",
   2282				grp->bb_counters[i], i);
   2283			ext4_mark_group_bitmap_corrupted(ac->ac_sb,
   2284					 e4b->bd_group,
   2285					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
   2286			break;
   2287		}
   2288		ac->ac_found++;
   2289
   2290		ac->ac_b_ex.fe_len = 1 << i;
   2291		ac->ac_b_ex.fe_start = k << i;
   2292		ac->ac_b_ex.fe_group = e4b->bd_group;
   2293
   2294		ext4_mb_use_best_found(ac, e4b);
   2295
   2296		BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
   2297
   2298		if (EXT4_SB(sb)->s_mb_stats)
   2299			atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
   2300
   2301		break;
   2302	}
   2303}
   2304
   2305/*
   2306 * The routine scans the group and measures all found extents.
    2307 * In order to optimize scanning, the number of free clusters in the
    2308 * group (bb_free from the group info) gives the routine an upper limit.
   2309 */
   2310static noinline_for_stack
   2311void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
   2312					struct ext4_buddy *e4b)
   2313{
   2314	struct super_block *sb = ac->ac_sb;
   2315	void *bitmap = e4b->bd_bitmap;
   2316	struct ext4_free_extent ex;
   2317	int i;
   2318	int free;
   2319
   2320	free = e4b->bd_info->bb_free;
   2321	if (WARN_ON(free <= 0))
   2322		return;
   2323
   2324	i = e4b->bd_info->bb_first_free;
   2325
   2326	while (free && ac->ac_status == AC_STATUS_CONTINUE) {
   2327		i = mb_find_next_zero_bit(bitmap,
   2328						EXT4_CLUSTERS_PER_GROUP(sb), i);
   2329		if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
   2330			/*
    2331			 * If we have a corrupt bitmap, we won't find any
    2332			 * free blocks even though the group info says we
    2333			 * have free blocks
   2334			 */
   2335			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
   2336					"%d free clusters as per "
   2337					"group info. But bitmap says 0",
   2338					free);
   2339			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
   2340					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
   2341			break;
   2342		}
   2343
   2344		mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
   2345		if (WARN_ON(ex.fe_len <= 0))
   2346			break;
   2347		if (free < ex.fe_len) {
   2348			ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
   2349					"%d free clusters as per "
   2350					"group info. But got %d blocks",
   2351					free, ex.fe_len);
   2352			ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
   2353					EXT4_GROUP_INFO_BBITMAP_CORRUPT);
   2354			/*
    2355			 * The number of free blocks differs. This mostly
    2356			 * indicates that the bitmap is corrupt. So exit
   2357			 * without claiming the space.
   2358			 */
   2359			break;
   2360		}
   2361		ex.fe_logical = 0xDEADC0DE; /* debug value */
   2362		ext4_mb_measure_extent(ac, &ex, e4b);
   2363
   2364		i += ex.fe_len;
   2365		free -= ex.fe_len;
   2366	}
   2367
   2368	ext4_mb_check_limits(ac, e4b, 1);
   2369}
   2370
   2371/*
   2372 * This is a special case for storages like raid5
   2373 * we try to find stripe-aligned chunks for stripe-size-multiple requests
   2374 */
   2375static noinline_for_stack
   2376void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
   2377				 struct ext4_buddy *e4b)
   2378{
   2379	struct super_block *sb = ac->ac_sb;
   2380	struct ext4_sb_info *sbi = EXT4_SB(sb);
   2381	void *bitmap = e4b->bd_bitmap;
   2382	struct ext4_free_extent ex;
   2383	ext4_fsblk_t first_group_block;
   2384	ext4_fsblk_t a;
   2385	ext4_grpblk_t i;
   2386	int max;
   2387
   2388	BUG_ON(sbi->s_stripe == 0);
   2389
   2390	/* find first stripe-aligned block in group */
   2391	first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
   2392
   2393	a = first_group_block + sbi->s_stripe - 1;
   2394	do_div(a, sbi->s_stripe);
   2395	i = (a * sbi->s_stripe) - first_group_block;
   2396
   2397	while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
   2398		if (!mb_test_bit(i, bitmap)) {
   2399			max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
   2400			if (max >= sbi->s_stripe) {
   2401				ac->ac_found++;
   2402				ex.fe_logical = 0xDEADF00D; /* debug value */
   2403				ac->ac_b_ex = ex;
   2404				ext4_mb_use_best_found(ac, e4b);
   2405				break;
   2406			}
   2407		}
   2408		i += sbi->s_stripe;
   2409	}
   2410}
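
/*
 * Illustrative numbers for the stripe-alignment step in
 * ext4_mb_scan_aligned() (added for exposition, not part of the original
 * source). With s_stripe = 16 and first_group_block = 32770:
 *
 *   a = (32770 + 16 - 1) / 16 = 2049       (round up to a whole stripe)
 *   i = 2049 * 16 - 32770     = 14         (group-relative start offset)
 *
 * so the scan begins at cluster 14 of the group, the first cluster whose
 * absolute block number (32784) is a multiple of the stripe size, and
 * then advances in stripe-sized steps.
 */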
   2411
   2412/*
   2413 * This is also called BEFORE we load the buddy bitmap.
    2414 * Returns true if the group is suitable for the allocation,
    2415 * false otherwise.
   2416 */
   2417static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
   2418				ext4_group_t group, int cr)
   2419{
   2420	ext4_grpblk_t free, fragments;
   2421	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
   2422	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
   2423
   2424	BUG_ON(cr < 0 || cr >= 4);
   2425
   2426	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
   2427		return false;
   2428
   2429	free = grp->bb_free;
   2430	if (free == 0)
   2431		return false;
   2432
   2433	fragments = grp->bb_fragments;
   2434	if (fragments == 0)
   2435		return false;
   2436
   2437	switch (cr) {
   2438	case 0:
   2439		BUG_ON(ac->ac_2order == 0);
   2440
   2441		/* Avoid using the first bg of a flexgroup for data files */
   2442		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
   2443		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
   2444		    ((group % flex_size) == 0))
   2445			return false;
   2446
   2447		if (free < ac->ac_g_ex.fe_len)
   2448			return false;
   2449
   2450		if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb))
   2451			return true;
   2452
   2453		if (grp->bb_largest_free_order < ac->ac_2order)
   2454			return false;
   2455
   2456		return true;
   2457	case 1:
   2458		if ((free / fragments) >= ac->ac_g_ex.fe_len)
   2459			return true;
   2460		break;
   2461	case 2:
   2462		if (free >= ac->ac_g_ex.fe_len)
   2463			return true;
   2464		break;
   2465	case 3:
   2466		return true;
   2467	default:
   2468		BUG();
   2469	}
   2470
   2471	return false;
   2472}
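
/*
 * Summary of the cr checks above (added for exposition, not part of the
 * original source):
 *   cr 0 - the group must have a free buddy chunk of at least the
 *          requested power-of-two order (bb_largest_free_order);
 *   cr 1 - the average fragment size, free / fragments, must be at least
 *          the goal length, so a complex scan is likely to succeed;
 *   cr 2 - any group with enough free clusters in total will do;
 *   cr 3 - take whatever is available.
 */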
   2473
   2474/*
   2475 * This could return negative error code if something goes wrong
   2476 * during ext4_mb_init_group(). This should not be called with
   2477 * ext4_lock_group() held.
   2478 *
   2479 * Note: because we are conditionally operating with the group lock in
   2480 * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this
   2481 * function using __acquire and __release.  This means we need to be
   2482 * super careful before messing with the error path handling via "goto
   2483 * out"!
   2484 */
   2485static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
   2486				     ext4_group_t group, int cr)
   2487{
   2488	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
   2489	struct super_block *sb = ac->ac_sb;
   2490	struct ext4_sb_info *sbi = EXT4_SB(sb);
   2491	bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK;
   2492	ext4_grpblk_t free;
   2493	int ret = 0;
   2494
   2495	if (sbi->s_mb_stats)
   2496		atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
   2497	if (should_lock) {
   2498		ext4_lock_group(sb, group);
   2499		__release(ext4_group_lock_ptr(sb, group));
   2500	}
   2501	free = grp->bb_free;
   2502	if (free == 0)
   2503		goto out;
   2504	if (cr <= 2 && free < ac->ac_g_ex.fe_len)
   2505		goto out;
   2506	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
   2507		goto out;
   2508	if (should_lock) {
   2509		__acquire(ext4_group_lock_ptr(sb, group));
   2510		ext4_unlock_group(sb, group);
   2511	}
   2512
   2513	/* We only do this if the grp has never been initialized */
   2514	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
   2515		struct ext4_group_desc *gdp =
   2516			ext4_get_group_desc(sb, group, NULL);
   2517		int ret;
   2518
   2519		/* cr=0/1 is a very optimistic search to find large
   2520		 * good chunks almost for free.  If buddy data is not
   2521		 * ready, then this optimization makes no sense.  But
   2522		 * we never skip the first block group in a flex_bg,
   2523		 * since this gets used for metadata block allocation,
   2524		 * and we want to make sure we locate metadata blocks
   2525		 * in the first block group in the flex_bg if possible.
   2526		 */
   2527		if (cr < 2 &&
   2528		    (!sbi->s_log_groups_per_flex ||
   2529		     ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
   2530		    !(ext4_has_group_desc_csum(sb) &&
   2531		      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))))
   2532			return 0;
   2533		ret = ext4_mb_init_group(sb, group, GFP_NOFS);
   2534		if (ret)
   2535			return ret;
   2536	}
   2537
   2538	if (should_lock) {
   2539		ext4_lock_group(sb, group);
   2540		__release(ext4_group_lock_ptr(sb, group));
   2541	}
   2542	ret = ext4_mb_good_group(ac, group, cr);
   2543out:
   2544	if (should_lock) {
   2545		__acquire(ext4_group_lock_ptr(sb, group));
   2546		ext4_unlock_group(sb, group);
   2547	}
   2548	return ret;
   2549}
   2550
   2551/*
   2552 * Start prefetching @nr block bitmaps starting at @group.
   2553 * Return the next group which needs to be prefetched.
   2554 */
   2555ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
   2556			      unsigned int nr, int *cnt)
   2557{
   2558	ext4_group_t ngroups = ext4_get_groups_count(sb);
   2559	struct buffer_head *bh;
   2560	struct blk_plug plug;
   2561
   2562	blk_start_plug(&plug);
   2563	while (nr-- > 0) {
   2564		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
   2565								  NULL);
   2566		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
   2567
   2568		/*
    2569		 * Prefetch block groups with free blocks; but don't
    2570		 * bother if the group is marked uninitialized on disk,
    2571		 * since reading it won't require I/O.  Also only try to
    2572		 * prefetch once, so we avoid the getblk() call, which
    2573		 * can be expensive.
   2574		 */
   2575		if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
   2576		    EXT4_MB_GRP_NEED_INIT(grp) &&
   2577		    ext4_free_group_clusters(sb, gdp) > 0 &&
   2578		    !(ext4_has_group_desc_csum(sb) &&
   2579		      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
   2580			bh = ext4_read_block_bitmap_nowait(sb, group, true);
   2581			if (bh && !IS_ERR(bh)) {
   2582				if (!buffer_uptodate(bh) && cnt)
   2583					(*cnt)++;
   2584				brelse(bh);
   2585			}
   2586		}
   2587		if (++group >= ngroups)
   2588			group = 0;
   2589	}
   2590	blk_finish_plug(&plug);
   2591	return group;
   2592}
   2593
   2594/*
   2595 * Prefetching reads the block bitmap into the buffer cache; but we
   2596 * need to make sure that the buddy bitmap in the page cache has been
   2597 * initialized.  Note that ext4_mb_init_group() will block if the I/O
    2598 * is not yet completed, or indeed if the I/O was never initiated
    2599 * because ext4_mb_prefetch() did not start it.
   2600 *
   2601 * TODO: We should actually kick off the buddy bitmap setup in a work
   2602 * queue when the buffer I/O is completed, so that we don't block
   2603 * waiting for the block allocation bitmap read to finish when
   2604 * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator().
   2605 */
   2606void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
   2607			   unsigned int nr)
   2608{
    2609	while (nr-- > 0) {
    2610		struct ext4_group_desc *gdp;
    2611		struct ext4_group_info *grp;
    2612
    2613		if (!group)
    2614			group = ext4_get_groups_count(sb);
    2615		group--;
    2616		gdp = ext4_get_group_desc(sb, group, NULL);
    2617		grp = ext4_get_group_info(sb, group);
   2618
   2619		if (EXT4_MB_GRP_NEED_INIT(grp) &&
   2620		    ext4_free_group_clusters(sb, gdp) > 0 &&
   2621		    !(ext4_has_group_desc_csum(sb) &&
   2622		      (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
   2623			if (ext4_mb_init_group(sb, group, GFP_NOFS))
   2624				break;
   2625		}
   2626	}
   2627}
   2628
   2629static noinline_for_stack int
   2630ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
   2631{
   2632	ext4_group_t prefetch_grp = 0, ngroups, group, i;
   2633	int cr = -1;
   2634	int err = 0, first_err = 0;
   2635	unsigned int nr = 0, prefetch_ios = 0;
   2636	struct ext4_sb_info *sbi;
   2637	struct super_block *sb;
   2638	struct ext4_buddy e4b;
   2639	int lost;
   2640
   2641	sb = ac->ac_sb;
   2642	sbi = EXT4_SB(sb);
   2643	ngroups = ext4_get_groups_count(sb);
   2644	/* non-extent files are limited to low blocks/groups */
   2645	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
   2646		ngroups = sbi->s_blockfile_groups;
   2647
   2648	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
   2649
   2650	/* first, try the goal */
   2651	err = ext4_mb_find_by_goal(ac, &e4b);
   2652	if (err || ac->ac_status == AC_STATUS_FOUND)
   2653		goto out;
   2654
   2655	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
   2656		goto out;
   2657
   2658	/*
   2659	 * ac->ac_2order is set only if the fe_len is a power of 2
   2660	 * if ac->ac_2order is set we also set criteria to 0 so that we
   2661	 * try exact allocation using buddy.
   2662	 */
   2663	i = fls(ac->ac_g_ex.fe_len);
   2664	ac->ac_2order = 0;
   2665	/*
   2666	 * We search using buddy data only if the order of the request
    2667	 * is greater than or equal to sbi->s_mb_order2_reqs.
    2668	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req.
    2669	 * We also support searching for power-of-two requests only for
    2670	 * requests up to the maximum buddy size we have constructed.
   2671	 */
   2672	if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
   2673		/*
   2674		 * This should tell if fe_len is exactly power of 2
    2675		 * This should tell if fe_len is exactly a power of 2
   2676		if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
   2677			ac->ac_2order = array_index_nospec(i - 1,
   2678							   MB_NUM_ORDERS(sb));
   2679	}
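
	/*
	 * Worked example for the power-of-two test above (added for
	 * exposition, not part of the original source): for fe_len = 64,
	 * fls() returns i = 7 and 64 & ~(1 << 6) == 0, so ac_2order becomes
	 * 6; for fe_len = 96, fls() still returns 7 but 96 & ~(1 << 6) == 32,
	 * so ac_2order stays 0 and the cr=0 buddy-only scan is skipped.
	 */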
   2680
   2681	/* if stream allocation is enabled, use global goal */
   2682	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
   2683		/* TBD: may be hot point */
   2684		spin_lock(&sbi->s_md_lock);
   2685		ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
   2686		ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
   2687		spin_unlock(&sbi->s_md_lock);
   2688	}
   2689
    2690	/* Let's just scan groups to find more or less suitable blocks */
   2691	cr = ac->ac_2order ? 0 : 1;
   2692	/*
   2693	 * cr == 0 try to get exact allocation,
    2694	 * cr == 3 try to get anything
   2695	 */
   2696repeat:
   2697	for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
   2698		ac->ac_criteria = cr;
   2699		/*
   2700		 * searching for the right group start
   2701		 * from the goal value specified
   2702		 */
   2703		group = ac->ac_g_ex.fe_group;
   2704		ac->ac_last_optimal_group = group;
   2705		ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
   2706		prefetch_grp = group;
   2707
   2708		for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups),
   2709			     i++) {
   2710			int ret = 0, new_cr;
   2711
   2712			cond_resched();
   2713
   2714			ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups);
   2715			if (new_cr != cr) {
   2716				cr = new_cr;
   2717				goto repeat;
   2718			}
   2719
   2720			/*
   2721			 * Batch reads of the block allocation bitmaps
   2722			 * to get multiple READs in flight; limit
   2723			 * prefetching at cr=0/1, otherwise mballoc can
   2724			 * spend a lot of time loading imperfect groups
   2725			 */
   2726			if ((prefetch_grp == group) &&
   2727			    (cr > 1 ||
   2728			     prefetch_ios < sbi->s_mb_prefetch_limit)) {
   2729				unsigned int curr_ios = prefetch_ios;
   2730
   2731				nr = sbi->s_mb_prefetch;
   2732				if (ext4_has_feature_flex_bg(sb)) {
   2733					nr = 1 << sbi->s_log_groups_per_flex;
   2734					nr -= group & (nr - 1);
   2735					nr = min(nr, sbi->s_mb_prefetch);
   2736				}
   2737				prefetch_grp = ext4_mb_prefetch(sb, group,
   2738							nr, &prefetch_ios);
   2739				if (prefetch_ios == curr_ios)
   2740					nr = 0;
   2741			}
   2742
   2743			/* This now checks without needing the buddy page */
   2744			ret = ext4_mb_good_group_nolock(ac, group, cr);
   2745			if (ret <= 0) {
   2746				if (!first_err)
   2747					first_err = ret;
   2748				continue;
   2749			}
   2750
   2751			err = ext4_mb_load_buddy(sb, group, &e4b);
   2752			if (err)
   2753				goto out;
   2754
   2755			ext4_lock_group(sb, group);
   2756
   2757			/*
   2758			 * We need to check again after locking the
   2759			 * block group
   2760			 */
   2761			ret = ext4_mb_good_group(ac, group, cr);
   2762			if (ret == 0) {
   2763				ext4_unlock_group(sb, group);
   2764				ext4_mb_unload_buddy(&e4b);
   2765				continue;
   2766			}
   2767
   2768			ac->ac_groups_scanned++;
   2769			if (cr == 0)
   2770				ext4_mb_simple_scan_group(ac, &e4b);
   2771			else if (cr == 1 && sbi->s_stripe &&
   2772					!(ac->ac_g_ex.fe_len % sbi->s_stripe))
   2773				ext4_mb_scan_aligned(ac, &e4b);
   2774			else
   2775				ext4_mb_complex_scan_group(ac, &e4b);
   2776
   2777			ext4_unlock_group(sb, group);
   2778			ext4_mb_unload_buddy(&e4b);
   2779
   2780			if (ac->ac_status != AC_STATUS_CONTINUE)
   2781				break;
   2782		}
   2783		/* Processed all groups and haven't found blocks */
   2784		if (sbi->s_mb_stats && i == ngroups)
   2785			atomic64_inc(&sbi->s_bal_cX_failed[cr]);
   2786	}
   2787
   2788	if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
   2789	    !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
   2790		/*
   2791		 * We've been searching too long. Let's try to allocate
   2792		 * the best chunk we've found so far
   2793		 */
   2794		ext4_mb_try_best_found(ac, &e4b);
   2795		if (ac->ac_status != AC_STATUS_FOUND) {
   2796			/*
    2797			 * Someone luckier has already allocated it.
    2798			 * The only thing we can do is just take the
    2799			 * first found block(s)
   2800			 */
   2801			lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
   2802			mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
   2803				 ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
   2804				 ac->ac_b_ex.fe_len, lost);
   2805
   2806			ac->ac_b_ex.fe_group = 0;
   2807			ac->ac_b_ex.fe_start = 0;
   2808			ac->ac_b_ex.fe_len = 0;
   2809			ac->ac_status = AC_STATUS_CONTINUE;
   2810			ac->ac_flags |= EXT4_MB_HINT_FIRST;
   2811			cr = 3;
   2812			goto repeat;
   2813		}
   2814	}
   2815
   2816	if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
   2817		atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
   2818out:
   2819	if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
   2820		err = first_err;
   2821
   2822	mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
   2823		 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
   2824		 ac->ac_flags, cr, err);
   2825
   2826	if (nr)
   2827		ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
   2828
   2829	return err;
   2830}
   2831
   2832static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
   2833{
   2834	struct super_block *sb = pde_data(file_inode(seq->file));
   2835	ext4_group_t group;
   2836
   2837	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
   2838		return NULL;
   2839	group = *pos + 1;
   2840	return (void *) ((unsigned long) group);
   2841}
   2842
   2843static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
   2844{
   2845	struct super_block *sb = pde_data(file_inode(seq->file));
   2846	ext4_group_t group;
   2847
   2848	++*pos;
   2849	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
   2850		return NULL;
   2851	group = *pos + 1;
   2852	return (void *) ((unsigned long) group);
   2853}
   2854
   2855static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
   2856{
   2857	struct super_block *sb = pde_data(file_inode(seq->file));
   2858	ext4_group_t group = (ext4_group_t) ((unsigned long) v);
   2859	int i;
   2860	int err, buddy_loaded = 0;
   2861	struct ext4_buddy e4b;
   2862	struct ext4_group_info *grinfo;
   2863	unsigned char blocksize_bits = min_t(unsigned char,
   2864					     sb->s_blocksize_bits,
   2865					     EXT4_MAX_BLOCK_LOG_SIZE);
   2866	struct sg {
   2867		struct ext4_group_info info;
   2868		ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
   2869	} sg;
   2870
   2871	group--;
   2872	if (group == 0)
   2873		seq_puts(seq, "#group: free  frags first ["
   2874			      " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
   2875			      " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
   2876
   2877	i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
   2878		sizeof(struct ext4_group_info);
   2879
   2880	grinfo = ext4_get_group_info(sb, group);
   2881	/* Load the group info in memory only if not already loaded. */
   2882	if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
   2883		err = ext4_mb_load_buddy(sb, group, &e4b);
   2884		if (err) {
   2885			seq_printf(seq, "#%-5u: I/O error\n", group);
   2886			return 0;
   2887		}
   2888		buddy_loaded = 1;
   2889	}
   2890
   2891	memcpy(&sg, ext4_get_group_info(sb, group), i);
   2892
   2893	if (buddy_loaded)
   2894		ext4_mb_unload_buddy(&e4b);
   2895
   2896	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
   2897			sg.info.bb_fragments, sg.info.bb_first_free);
   2898	for (i = 0; i <= 13; i++)
   2899		seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
   2900				sg.info.bb_counters[i] : 0);
   2901	seq_puts(seq, " ]\n");
   2902
   2903	return 0;
   2904}
   2905
   2906static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
   2907{
   2908}
   2909
   2910const struct seq_operations ext4_mb_seq_groups_ops = {
   2911	.start  = ext4_mb_seq_groups_start,
   2912	.next   = ext4_mb_seq_groups_next,
   2913	.stop   = ext4_mb_seq_groups_stop,
   2914	.show   = ext4_mb_seq_groups_show,
   2915};
   2916
   2917int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
   2918{
   2919	struct super_block *sb = seq->private;
   2920	struct ext4_sb_info *sbi = EXT4_SB(sb);
   2921
   2922	seq_puts(seq, "mballoc:\n");
   2923	if (!sbi->s_mb_stats) {
   2924		seq_puts(seq, "\tmb stats collection turned off.\n");
   2925		seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
   2926		return 0;
   2927	}
   2928	seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
   2929	seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
   2930
   2931	seq_printf(seq, "\tgroups_scanned: %u\n",  atomic_read(&sbi->s_bal_groups_scanned));
   2932
   2933	seq_puts(seq, "\tcr0_stats:\n");
   2934	seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
   2935	seq_printf(seq, "\t\tgroups_considered: %llu\n",
   2936		   atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
   2937	seq_printf(seq, "\t\tuseless_loops: %llu\n",
   2938		   atomic64_read(&sbi->s_bal_cX_failed[0]));
   2939	seq_printf(seq, "\t\tbad_suggestions: %u\n",
   2940		   atomic_read(&sbi->s_bal_cr0_bad_suggestions));
   2941
   2942	seq_puts(seq, "\tcr1_stats:\n");
   2943	seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
   2944	seq_printf(seq, "\t\tgroups_considered: %llu\n",
   2945		   atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
   2946	seq_printf(seq, "\t\tuseless_loops: %llu\n",
   2947		   atomic64_read(&sbi->s_bal_cX_failed[1]));
   2948	seq_printf(seq, "\t\tbad_suggestions: %u\n",
   2949		   atomic_read(&sbi->s_bal_cr1_bad_suggestions));
   2950
   2951	seq_puts(seq, "\tcr2_stats:\n");
   2952	seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
   2953	seq_printf(seq, "\t\tgroups_considered: %llu\n",
   2954		   atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
   2955	seq_printf(seq, "\t\tuseless_loops: %llu\n",
   2956		   atomic64_read(&sbi->s_bal_cX_failed[2]));
   2957
   2958	seq_puts(seq, "\tcr3_stats:\n");
   2959	seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
   2960	seq_printf(seq, "\t\tgroups_considered: %llu\n",
   2961		   atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
   2962	seq_printf(seq, "\t\tuseless_loops: %llu\n",
   2963		   atomic64_read(&sbi->s_bal_cX_failed[3]));
   2964	seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
   2965	seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
   2966	seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
   2967	seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
   2968	seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
   2969
   2970	seq_printf(seq, "\tbuddies_generated: %u/%u\n",
   2971		   atomic_read(&sbi->s_mb_buddies_generated),
   2972		   ext4_get_groups_count(sb));
   2973	seq_printf(seq, "\tbuddies_time_used: %llu\n",
   2974		   atomic64_read(&sbi->s_mb_generation_time));
   2975	seq_printf(seq, "\tpreallocated: %u\n",
   2976		   atomic_read(&sbi->s_mb_preallocated));
   2977	seq_printf(seq, "\tdiscarded: %u\n",
   2978		   atomic_read(&sbi->s_mb_discarded));
   2979	return 0;
   2980}
   2981
   2982static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos)
   2983__acquires(&EXT4_SB(sb)->s_mb_rb_lock)
   2984{
   2985	struct super_block *sb = pde_data(file_inode(seq->file));
   2986	unsigned long position;
   2987
   2988	read_lock(&EXT4_SB(sb)->s_mb_rb_lock);
   2989
   2990	if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
   2991		return NULL;
   2992	position = *pos + 1;
   2993	return (void *) ((unsigned long) position);
   2994}
   2995
   2996static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos)
   2997{
   2998	struct super_block *sb = pde_data(file_inode(seq->file));
   2999	unsigned long position;
   3000
   3001	++*pos;
   3002	if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1)
   3003		return NULL;
   3004	position = *pos + 1;
   3005	return (void *) ((unsigned long) position);
   3006}
   3007
   3008static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
   3009{
   3010	struct super_block *sb = pde_data(file_inode(seq->file));
   3011	struct ext4_sb_info *sbi = EXT4_SB(sb);
   3012	unsigned long position = ((unsigned long) v);
   3013	struct ext4_group_info *grp;
   3014	struct rb_node *n;
   3015	unsigned int count, min, max;
   3016
   3017	position--;
   3018	if (position >= MB_NUM_ORDERS(sb)) {
   3019		seq_puts(seq, "fragment_size_tree:\n");
   3020		n = rb_first(&sbi->s_mb_avg_fragment_size_root);
   3021		if (!n) {
   3022			seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n");
   3023			return 0;
   3024		}
   3025		grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
   3026		min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
   3027		count = 1;
   3028		while (rb_next(n)) {
   3029			count++;
   3030			n = rb_next(n);
   3031		}
   3032		grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb);
   3033		max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0;
   3034
   3035		seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n",
   3036			   min, max, count);
   3037		return 0;
   3038	}
   3039
   3040	if (position == 0) {
   3041		seq_printf(seq, "optimize_scan: %d\n",
   3042			   test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0);
   3043		seq_puts(seq, "max_free_order_lists:\n");
   3044	}
   3045	count = 0;
   3046	list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
   3047			    bb_largest_free_order_node)
   3048		count++;
   3049	seq_printf(seq, "\tlist_order_%u_groups: %u\n",
   3050		   (unsigned int)position, count);
   3051
   3052	return 0;
   3053}
   3054
   3055static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v)
   3056__releases(&EXT4_SB(sb)->s_mb_rb_lock)
   3057{
   3058	struct super_block *sb = pde_data(file_inode(seq->file));
   3059
   3060	read_unlock(&EXT4_SB(sb)->s_mb_rb_lock);
   3061}
   3062
   3063const struct seq_operations ext4_mb_seq_structs_summary_ops = {
   3064	.start  = ext4_mb_seq_structs_summary_start,
   3065	.next   = ext4_mb_seq_structs_summary_next,
   3066	.stop   = ext4_mb_seq_structs_summary_stop,
   3067	.show   = ext4_mb_seq_structs_summary_show,
   3068};
   3069
   3070static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
   3071{
   3072	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
   3073	struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
   3074
   3075	BUG_ON(!cachep);
   3076	return cachep;
   3077}
   3078
   3079/*
   3080 * Allocate the top-level s_group_info array for the specified number
   3081 * of groups
   3082 */
   3083int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
   3084{
   3085	struct ext4_sb_info *sbi = EXT4_SB(sb);
   3086	unsigned size;
   3087	struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
   3088
   3089	size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
   3090		EXT4_DESC_PER_BLOCK_BITS(sb);
   3091	if (size <= sbi->s_group_info_size)
   3092		return 0;
   3093
   3094	size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
   3095	new_groupinfo = kvzalloc(size, GFP_KERNEL);
   3096	if (!new_groupinfo) {
   3097		ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
   3098		return -ENOMEM;
   3099	}
   3100	rcu_read_lock();
   3101	old_groupinfo = rcu_dereference(sbi->s_group_info);
   3102	if (old_groupinfo)
   3103		memcpy(new_groupinfo, old_groupinfo,
   3104		       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
   3105	rcu_read_unlock();
   3106	rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
   3107	sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
   3108	if (old_groupinfo)
   3109		ext4_kvfree_array_rcu(old_groupinfo);
   3110	ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
   3111		   sbi->s_group_info_size);
   3112	return 0;
   3113}
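
/*
 * Example of the sizing above (added for exposition, not part of the
 * original source; assumes a 4 KiB block size with 32-byte group
 * descriptors, i.e. EXT4_DESC_PER_BLOCK(sb) == 128): a filesystem with
 * 1000 block groups needs DIV_ROUND_UP(1000, 128) = 8 second-level
 * tables, so the top-level array holds 8 pointers, which
 * roundup_pow_of_two() turns into a 64-byte allocation on a 64-bit
 * kernel.
 */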
   3114
   3115/* Create and initialize ext4_group_info data for the given group. */
   3116int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
   3117			  struct ext4_group_desc *desc)
   3118{
   3119	int i;
   3120	int metalen = 0;
   3121	int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
   3122	struct ext4_sb_info *sbi = EXT4_SB(sb);
   3123	struct ext4_group_info **meta_group_info;
   3124	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
   3125
   3126	/*
    3127	 * First check if this group is the first one covered by a block of
    3128	 * group descriptors. If so, we have to allocate a new table of
    3129	 * pointers to ext4_group_info structures
   3130	 */
   3131	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
   3132		metalen = sizeof(*meta_group_info) <<
   3133			EXT4_DESC_PER_BLOCK_BITS(sb);
   3134		meta_group_info = kmalloc(metalen, GFP_NOFS);
   3135		if (meta_group_info == NULL) {
   3136			ext4_msg(sb, KERN_ERR, "can't allocate mem "
   3137				 "for a buddy group");
   3138			goto exit_meta_group_info;
   3139		}
   3140		rcu_read_lock();
   3141		rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
   3142		rcu_read_unlock();
   3143	}
   3144
   3145	meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
   3146	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
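        	/*
        	 * Two-level lookup illustration (assuming 64 descriptors per
        	 * block): group 130 lives at s_group_info[130 >> 6][130 & 63],
        	 * i.e. meta-group 2, slot 2.
        	 */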
   3147
   3148	meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
   3149	if (meta_group_info[i] == NULL) {
   3150		ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
   3151		goto exit_group_info;
   3152	}
   3153	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
   3154		&(meta_group_info[i]->bb_state));
   3155
   3156	/*
   3157	 * initialize bb_free to be able to skip
   3158	 * empty groups without initialization
   3159	 */
   3160	if (ext4_has_group_desc_csum(sb) &&
   3161	    (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
   3162		meta_group_info[i]->bb_free =
   3163			ext4_free_clusters_after_init(sb, group, desc);
   3164	} else {
   3165		meta_group_info[i]->bb_free =
   3166			ext4_free_group_clusters(sb, desc);
   3167	}
   3168
   3169	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
   3170	init_rwsem(&meta_group_info[i]->alloc_sem);
   3171	meta_group_info[i]->bb_free_root = RB_ROOT;
   3172	INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
   3173	RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb);
   3174	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
   3175	meta_group_info[i]->bb_group = group;
   3176
   3177	mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group);
   3178	return 0;
   3179
   3180exit_group_info:
   3181	/* If a meta_group_info table has been allocated, release it now */
   3182	if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
   3183		struct ext4_group_info ***group_info;
   3184
   3185		rcu_read_lock();
   3186		group_info = rcu_dereference(sbi->s_group_info);
   3187		kfree(group_info[idx]);
   3188		group_info[idx] = NULL;
   3189		rcu_read_unlock();
   3190	}
   3191exit_meta_group_info:
   3192	return -ENOMEM;
   3193} /* ext4_mb_add_groupinfo */
   3194
   3195static int ext4_mb_init_backend(struct super_block *sb)
   3196{
   3197	ext4_group_t ngroups = ext4_get_groups_count(sb);
   3198	ext4_group_t i;
   3199	struct ext4_sb_info *sbi = EXT4_SB(sb);
   3200	int err;
   3201	struct ext4_group_desc *desc;
   3202	struct ext4_group_info ***group_info;
   3203	struct kmem_cache *cachep;
   3204
   3205	err = ext4_mb_alloc_groupinfo(sb, ngroups);
   3206	if (err)
   3207		return err;
   3208
   3209	sbi->s_buddy_cache = new_inode(sb);
   3210	if (sbi->s_buddy_cache == NULL) {
   3211		ext4_msg(sb, KERN_ERR, "can't get new inode");
   3212		goto err_freesgi;
   3213	}
    3214	/* To avoid potentially colliding with a valid on-disk inode number,
   3215	 * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
   3216	 * not in the inode hash, so it should never be found by iget(), but
   3217	 * this will avoid confusion if it ever shows up during debugging. */
   3218	sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
   3219	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
   3220	for (i = 0; i < ngroups; i++) {
   3221		cond_resched();
   3222		desc = ext4_get_group_desc(sb, i, NULL);
   3223		if (desc == NULL) {
   3224			ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
   3225			goto err_freebuddy;
   3226		}
   3227		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
   3228			goto err_freebuddy;
   3229	}
   3230
   3231	if (ext4_has_feature_flex_bg(sb)) {
   3232		/* a single flex group is supposed to be read by a single IO.
   3233		 * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
    3234		 * an unsigned integer, so the maximum shift is 32.
   3235		 */
   3236		if (sbi->s_es->s_log_groups_per_flex >= 32) {
   3237			ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
   3238			goto err_freebuddy;
   3239		}
   3240		sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
   3241			BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
   3242		sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
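        		/*
        		 * Example (assuming BLK_MAX_SEGMENT_SIZE is 64 KiB and
        		 * 4 KiB blocks): a flex group of 16 gives
        		 * min(16, 65536 >> 3) = 16 groups per IO, so
        		 * s_mb_prefetch becomes 16 * 8 = 128 groups.
        		 */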
   3243	} else {
   3244		sbi->s_mb_prefetch = 32;
   3245	}
   3246	if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
   3247		sbi->s_mb_prefetch = ext4_get_groups_count(sb);
    3248	/* how many real IOs to prefetch within a single allocation at cr=0.
    3249	 * given that cr=0 is a CPU-related optimization we shouldn't try to
    3250	 * load too many groups; at some point we should start to use what
    3251	 * we've got in memory.
    3252	 * with an average random access time of 5ms, it'd take a second to get
   3253	 * 200 groups (* N with flex_bg), so let's make this limit 4
   3254	 */
   3255	sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4;
   3256	if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb))
   3257		sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb);
   3258
   3259	return 0;
   3260
   3261err_freebuddy:
   3262	cachep = get_groupinfo_cache(sb->s_blocksize_bits);
   3263	while (i-- > 0)
   3264		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
   3265	i = sbi->s_group_info_size;
   3266	rcu_read_lock();
   3267	group_info = rcu_dereference(sbi->s_group_info);
   3268	while (i-- > 0)
   3269		kfree(group_info[i]);
   3270	rcu_read_unlock();
   3271	iput(sbi->s_buddy_cache);
   3272err_freesgi:
   3273	rcu_read_lock();
   3274	kvfree(rcu_dereference(sbi->s_group_info));
   3275	rcu_read_unlock();
   3276	return -ENOMEM;
   3277}
   3278
   3279static void ext4_groupinfo_destroy_slabs(void)
   3280{
   3281	int i;
   3282
   3283	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
   3284		kmem_cache_destroy(ext4_groupinfo_caches[i]);
   3285		ext4_groupinfo_caches[i] = NULL;
   3286	}
   3287}
   3288
   3289static int ext4_groupinfo_create_slab(size_t size)
   3290{
   3291	static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
   3292	int slab_size;
   3293	int blocksize_bits = order_base_2(size);
   3294	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
   3295	struct kmem_cache *cachep;
   3296
   3297	if (cache_index >= NR_GRPINFO_CACHES)
   3298		return -EINVAL;
   3299
   3300	if (unlikely(cache_index < 0))
   3301		cache_index = 0;
   3302
   3303	mutex_lock(&ext4_grpinfo_slab_create_mutex);
   3304	if (ext4_groupinfo_caches[cache_index]) {
   3305		mutex_unlock(&ext4_grpinfo_slab_create_mutex);
   3306		return 0;	/* Already created */
   3307	}
   3308
   3309	slab_size = offsetof(struct ext4_group_info,
   3310				bb_counters[blocksize_bits + 2]);
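        	/*
        	 * For illustration: with 4 KiB blocks (blocksize_bits == 12) the
        	 * object ends after bb_counters[13], i.e. it has room for
        	 * MB_NUM_ORDERS == blocksize_bits + 2 == 14 per-order counters.
        	 */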
   3311
   3312	cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
   3313					slab_size, 0, SLAB_RECLAIM_ACCOUNT,
   3314					NULL);
   3315
   3316	ext4_groupinfo_caches[cache_index] = cachep;
   3317
   3318	mutex_unlock(&ext4_grpinfo_slab_create_mutex);
   3319	if (!cachep) {
   3320		printk(KERN_EMERG
   3321		       "EXT4-fs: no memory for groupinfo slab cache\n");
   3322		return -ENOMEM;
   3323	}
   3324
   3325	return 0;
   3326}
   3327
   3328static void ext4_discard_work(struct work_struct *work)
   3329{
   3330	struct ext4_sb_info *sbi = container_of(work,
   3331			struct ext4_sb_info, s_discard_work);
   3332	struct super_block *sb = sbi->s_sb;
   3333	struct ext4_free_data *fd, *nfd;
   3334	struct ext4_buddy e4b;
   3335	struct list_head discard_list;
   3336	ext4_group_t grp, load_grp;
   3337	int err = 0;
   3338
   3339	INIT_LIST_HEAD(&discard_list);
   3340	spin_lock(&sbi->s_md_lock);
   3341	list_splice_init(&sbi->s_discard_list, &discard_list);
   3342	spin_unlock(&sbi->s_md_lock);
   3343
   3344	load_grp = UINT_MAX;
   3345	list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) {
   3346		/*
   3347		 * If filesystem is umounting or no memory or suffering
   3348		 * from no space, give up the discard
   3349		 */
   3350		if ((sb->s_flags & SB_ACTIVE) && !err &&
   3351		    !atomic_read(&sbi->s_retry_alloc_pending)) {
   3352			grp = fd->efd_group;
   3353			if (grp != load_grp) {
   3354				if (load_grp != UINT_MAX)
   3355					ext4_mb_unload_buddy(&e4b);
   3356
   3357				err = ext4_mb_load_buddy(sb, grp, &e4b);
   3358				if (err) {
   3359					kmem_cache_free(ext4_free_data_cachep, fd);
   3360					load_grp = UINT_MAX;
   3361					continue;
   3362				} else {
   3363					load_grp = grp;
   3364				}
   3365			}
   3366
   3367			ext4_lock_group(sb, grp);
   3368			ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster,
   3369						fd->efd_start_cluster + fd->efd_count - 1, 1);
   3370			ext4_unlock_group(sb, grp);
   3371		}
   3372		kmem_cache_free(ext4_free_data_cachep, fd);
   3373	}
   3374
   3375	if (load_grp != UINT_MAX)
   3376		ext4_mb_unload_buddy(&e4b);
   3377}
   3378
   3379int ext4_mb_init(struct super_block *sb)
   3380{
   3381	struct ext4_sb_info *sbi = EXT4_SB(sb);
   3382	unsigned i, j;
   3383	unsigned offset, offset_incr;
   3384	unsigned max;
   3385	int ret;
   3386
   3387	i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets);
   3388
   3389	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
   3390	if (sbi->s_mb_offsets == NULL) {
   3391		ret = -ENOMEM;
   3392		goto out;
   3393	}
   3394
   3395	i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs);
   3396	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
   3397	if (sbi->s_mb_maxs == NULL) {
   3398		ret = -ENOMEM;
   3399		goto out;
   3400	}
   3401
   3402	ret = ext4_groupinfo_create_slab(sb->s_blocksize);
   3403	if (ret < 0)
   3404		goto out;
   3405
   3406	/* order 0 is regular bitmap */
   3407	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
   3408	sbi->s_mb_offsets[0] = 0;
   3409
   3410	i = 1;
   3411	offset = 0;
   3412	offset_incr = 1 << (sb->s_blocksize_bits - 1);
   3413	max = sb->s_blocksize << 2;
   3414	do {
   3415		sbi->s_mb_offsets[i] = offset;
   3416		sbi->s_mb_maxs[i] = max;
   3417		offset += offset_incr;
   3418		offset_incr = offset_incr >> 1;
   3419		max = max >> 1;
   3420		i++;
   3421	} while (i < MB_NUM_ORDERS(sb));
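        	/*
        	 * Illustrative result for a 4 KiB block size:
        	 *   s_mb_offsets[] = { 0, 0, 2048, 3072, 3584, ... }
        	 *   s_mb_maxs[]    = { 32768, 16384, 8192, 4096, 2048, ... }
        	 * i.e. each higher order tracks half as many bits as the
        	 * previous one.
        	 */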
   3422
   3423	sbi->s_mb_avg_fragment_size_root = RB_ROOT;
   3424	sbi->s_mb_largest_free_orders =
   3425		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
   3426			GFP_KERNEL);
   3427	if (!sbi->s_mb_largest_free_orders) {
   3428		ret = -ENOMEM;
   3429		goto out;
   3430	}
   3431	sbi->s_mb_largest_free_orders_locks =
   3432		kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
   3433			GFP_KERNEL);
   3434	if (!sbi->s_mb_largest_free_orders_locks) {
   3435		ret = -ENOMEM;
   3436		goto out;
   3437	}
   3438	for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
   3439		INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
   3440		rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
   3441	}
   3442	rwlock_init(&sbi->s_mb_rb_lock);
   3443
   3444	spin_lock_init(&sbi->s_md_lock);
   3445	sbi->s_mb_free_pending = 0;
   3446	INIT_LIST_HEAD(&sbi->s_freed_data_list);
   3447	INIT_LIST_HEAD(&sbi->s_discard_list);
   3448	INIT_WORK(&sbi->s_discard_work, ext4_discard_work);
   3449	atomic_set(&sbi->s_retry_alloc_pending, 0);
   3450
   3451	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
   3452	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
   3453	sbi->s_mb_stats = MB_DEFAULT_STATS;
   3454	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
   3455	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
   3456	sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
   3457	/*
   3458	 * The default group preallocation is 512, which for 4k block
   3459	 * sizes translates to 2 megabytes.  However for bigalloc file
    3460	 * systems, this is probably too big (i.e., if the cluster size
   3461	 * is 1 megabyte, then group preallocation size becomes half a
   3462	 * gigabyte!).  As a default, we will keep a two megabyte
    3463	 * group prealloc size for cluster sizes up to 64k, and after
   3464	 * that, we will force a minimum group preallocation size of
   3465	 * 32 clusters.  This translates to 8 megs when the cluster
   3466	 * size is 256k, and 32 megs when the cluster size is 1 meg,
   3467	 * which seems reasonable as a default.
   3468	 */
   3469	sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
   3470				       sbi->s_cluster_bits, 32);
   3471	/*
   3472	 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
   3473	 * to the lowest multiple of s_stripe which is bigger than
   3474	 * the s_mb_group_prealloc as determined above. We want
   3475	 * the preallocation size to be an exact multiple of the
   3476	 * RAID stripe size so that preallocations don't fragment
   3477	 * the stripes.
   3478	 */
   3479	if (sbi->s_stripe > 1) {
   3480		sbi->s_mb_group_prealloc = roundup(
   3481			sbi->s_mb_group_prealloc, sbi->s_stripe);
   3482	}
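        	/*
        	 * Worked examples of the two rules above: with 1 MiB bigalloc
        	 * clusters on 4 KiB blocks (s_cluster_bits == 8), 512 >> 8 = 2
        	 * is below the floor, so 32 clusters (32 MiB) are used; with the
        	 * default 512 and a RAID stripe of 48 blocks,
        	 * roundup(512, 48) = 528.
        	 */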
   3483
   3484	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
   3485	if (sbi->s_locality_groups == NULL) {
   3486		ret = -ENOMEM;
   3487		goto out;
   3488	}
   3489	for_each_possible_cpu(i) {
   3490		struct ext4_locality_group *lg;
   3491		lg = per_cpu_ptr(sbi->s_locality_groups, i);
   3492		mutex_init(&lg->lg_mutex);
   3493		for (j = 0; j < PREALLOC_TB_SIZE; j++)
   3494			INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
   3495		spin_lock_init(&lg->lg_prealloc_lock);
   3496	}
   3497
   3498	if (bdev_nonrot(sb->s_bdev))
   3499		sbi->s_mb_max_linear_groups = 0;
   3500	else
   3501		sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;
   3502	/* init file for buddy data */
   3503	ret = ext4_mb_init_backend(sb);
   3504	if (ret != 0)
   3505		goto out_free_locality_groups;
   3506
   3507	return 0;
   3508
   3509out_free_locality_groups:
   3510	free_percpu(sbi->s_locality_groups);
   3511	sbi->s_locality_groups = NULL;
   3512out:
   3513	kfree(sbi->s_mb_largest_free_orders);
   3514	kfree(sbi->s_mb_largest_free_orders_locks);
   3515	kfree(sbi->s_mb_offsets);
   3516	sbi->s_mb_offsets = NULL;
   3517	kfree(sbi->s_mb_maxs);
   3518	sbi->s_mb_maxs = NULL;
   3519	return ret;
   3520}
   3521
    3522	/* needs to be called with the ext4 group lock held */
   3523static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
   3524{
   3525	struct ext4_prealloc_space *pa;
   3526	struct list_head *cur, *tmp;
   3527	int count = 0;
   3528
   3529	list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
   3530		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
   3531		list_del(&pa->pa_group_list);
   3532		count++;
   3533		kmem_cache_free(ext4_pspace_cachep, pa);
   3534	}
   3535	return count;
   3536}
   3537
   3538int ext4_mb_release(struct super_block *sb)
   3539{
   3540	ext4_group_t ngroups = ext4_get_groups_count(sb);
   3541	ext4_group_t i;
   3542	int num_meta_group_infos;
   3543	struct ext4_group_info *grinfo, ***group_info;
   3544	struct ext4_sb_info *sbi = EXT4_SB(sb);
   3545	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
   3546	int count;
   3547
   3548	if (test_opt(sb, DISCARD)) {
   3549		/*
    3550		 * wait for the discard work to drain all of the ext4_free_data entries
   3551		 */
   3552		flush_work(&sbi->s_discard_work);
   3553		WARN_ON_ONCE(!list_empty(&sbi->s_discard_list));
   3554	}
   3555
   3556	if (sbi->s_group_info) {
   3557		for (i = 0; i < ngroups; i++) {
   3558			cond_resched();
   3559			grinfo = ext4_get_group_info(sb, i);
   3560			mb_group_bb_bitmap_free(grinfo);
   3561			ext4_lock_group(sb, i);
   3562			count = ext4_mb_cleanup_pa(grinfo);
   3563			if (count)
   3564				mb_debug(sb, "mballoc: %d PAs left\n",
   3565					 count);
   3566			ext4_unlock_group(sb, i);
   3567			kmem_cache_free(cachep, grinfo);
   3568		}
   3569		num_meta_group_infos = (ngroups +
   3570				EXT4_DESC_PER_BLOCK(sb) - 1) >>
   3571			EXT4_DESC_PER_BLOCK_BITS(sb);
   3572		rcu_read_lock();
   3573		group_info = rcu_dereference(sbi->s_group_info);
   3574		for (i = 0; i < num_meta_group_infos; i++)
   3575			kfree(group_info[i]);
   3576		kvfree(group_info);
   3577		rcu_read_unlock();
   3578	}
   3579	kfree(sbi->s_mb_largest_free_orders);
   3580	kfree(sbi->s_mb_largest_free_orders_locks);
   3581	kfree(sbi->s_mb_offsets);
   3582	kfree(sbi->s_mb_maxs);
   3583	iput(sbi->s_buddy_cache);
   3584	if (sbi->s_mb_stats) {
   3585		ext4_msg(sb, KERN_INFO,
   3586		       "mballoc: %u blocks %u reqs (%u success)",
   3587				atomic_read(&sbi->s_bal_allocated),
   3588				atomic_read(&sbi->s_bal_reqs),
   3589				atomic_read(&sbi->s_bal_success));
   3590		ext4_msg(sb, KERN_INFO,
   3591		      "mballoc: %u extents scanned, %u groups scanned, %u goal hits, "
   3592				"%u 2^N hits, %u breaks, %u lost",
   3593				atomic_read(&sbi->s_bal_ex_scanned),
   3594				atomic_read(&sbi->s_bal_groups_scanned),
   3595				atomic_read(&sbi->s_bal_goals),
   3596				atomic_read(&sbi->s_bal_2orders),
   3597				atomic_read(&sbi->s_bal_breaks),
   3598				atomic_read(&sbi->s_mb_lost_chunks));
   3599		ext4_msg(sb, KERN_INFO,
   3600		       "mballoc: %u generated and it took %llu",
   3601				atomic_read(&sbi->s_mb_buddies_generated),
   3602				atomic64_read(&sbi->s_mb_generation_time));
   3603		ext4_msg(sb, KERN_INFO,
   3604		       "mballoc: %u preallocated, %u discarded",
   3605				atomic_read(&sbi->s_mb_preallocated),
   3606				atomic_read(&sbi->s_mb_discarded));
   3607	}
   3608
   3609	free_percpu(sbi->s_locality_groups);
   3610
   3611	return 0;
   3612}
   3613
   3614static inline int ext4_issue_discard(struct super_block *sb,
   3615		ext4_group_t block_group, ext4_grpblk_t cluster, int count,
   3616		struct bio **biop)
   3617{
   3618	ext4_fsblk_t discard_block;
   3619
   3620	discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
   3621			 ext4_group_first_block_no(sb, block_group));
   3622	count = EXT4_C2B(EXT4_SB(sb), count);
   3623	trace_ext4_discard_blocks(sb,
   3624			(unsigned long long) discard_block, count);
   3625	if (biop) {
   3626		return __blkdev_issue_discard(sb->s_bdev,
   3627			(sector_t)discard_block << (sb->s_blocksize_bits - 9),
   3628			(sector_t)count << (sb->s_blocksize_bits - 9),
   3629			GFP_NOFS, biop);
   3630	} else
   3631		return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
   3632}
   3633
   3634static void ext4_free_data_in_buddy(struct super_block *sb,
   3635				    struct ext4_free_data *entry)
   3636{
   3637	struct ext4_buddy e4b;
   3638	struct ext4_group_info *db;
   3639	int err, count = 0, count2 = 0;
   3640
   3641	mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
   3642		 entry->efd_count, entry->efd_group, entry);
   3643
   3644	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
    3645	/* we expect to find an existing buddy because it's pinned */
   3646	BUG_ON(err != 0);
   3647
   3648	spin_lock(&EXT4_SB(sb)->s_md_lock);
   3649	EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
   3650	spin_unlock(&EXT4_SB(sb)->s_md_lock);
   3651
   3652	db = e4b.bd_info;
   3653	/* there are blocks to put in buddy to make them really free */
   3654	count += entry->efd_count;
   3655	count2++;
   3656	ext4_lock_group(sb, entry->efd_group);
   3657	/* Take it out of per group rb tree */
   3658	rb_erase(&entry->efd_node, &(db->bb_free_root));
   3659	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
   3660
   3661	/*
   3662	 * Clear the trimmed flag for the group so that the next
   3663	 * ext4_trim_fs can trim it.
   3664	 * If the volume is mounted with -o discard, online discard
   3665	 * is supported and the free blocks will be trimmed online.
   3666	 */
   3667	if (!test_opt(sb, DISCARD))
   3668		EXT4_MB_GRP_CLEAR_TRIMMED(db);
   3669
   3670	if (!db->bb_free_root.rb_node) {
   3671		/* No more items in the per group rb tree
   3672		 * balance refcounts from ext4_mb_free_metadata()
   3673		 */
   3674		put_page(e4b.bd_buddy_page);
   3675		put_page(e4b.bd_bitmap_page);
   3676	}
   3677	ext4_unlock_group(sb, entry->efd_group);
   3678	ext4_mb_unload_buddy(&e4b);
   3679
   3680	mb_debug(sb, "freed %d blocks in %d structures\n", count,
   3681		 count2);
   3682}
   3683
   3684/*
   3685 * This function is called by the jbd2 layer once the commit has finished,
   3686 * so we know we can free the blocks that were released with that commit.
   3687 */
   3688void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
   3689{
   3690	struct ext4_sb_info *sbi = EXT4_SB(sb);
   3691	struct ext4_free_data *entry, *tmp;
   3692	struct list_head freed_data_list;
   3693	struct list_head *cut_pos = NULL;
   3694	bool wake;
   3695
   3696	INIT_LIST_HEAD(&freed_data_list);
   3697
   3698	spin_lock(&sbi->s_md_lock);
   3699	list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
   3700		if (entry->efd_tid != commit_tid)
   3701			break;
   3702		cut_pos = &entry->efd_list;
   3703	}
   3704	if (cut_pos)
   3705		list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
   3706				  cut_pos);
   3707	spin_unlock(&sbi->s_md_lock);
   3708
   3709	list_for_each_entry(entry, &freed_data_list, efd_list)
   3710		ext4_free_data_in_buddy(sb, entry);
   3711
   3712	if (test_opt(sb, DISCARD)) {
   3713		spin_lock(&sbi->s_md_lock);
   3714		wake = list_empty(&sbi->s_discard_list);
   3715		list_splice_tail(&freed_data_list, &sbi->s_discard_list);
   3716		spin_unlock(&sbi->s_md_lock);
   3717		if (wake)
   3718			queue_work(system_unbound_wq, &sbi->s_discard_work);
   3719	} else {
   3720		list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
   3721			kmem_cache_free(ext4_free_data_cachep, entry);
   3722	}
   3723}
   3724
   3725int __init ext4_init_mballoc(void)
   3726{
   3727	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
   3728					SLAB_RECLAIM_ACCOUNT);
   3729	if (ext4_pspace_cachep == NULL)
   3730		goto out;
   3731
   3732	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
   3733				    SLAB_RECLAIM_ACCOUNT);
   3734	if (ext4_ac_cachep == NULL)
   3735		goto out_pa_free;
   3736
   3737	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
   3738					   SLAB_RECLAIM_ACCOUNT);
   3739	if (ext4_free_data_cachep == NULL)
   3740		goto out_ac_free;
   3741
   3742	return 0;
   3743
   3744out_ac_free:
   3745	kmem_cache_destroy(ext4_ac_cachep);
   3746out_pa_free:
   3747	kmem_cache_destroy(ext4_pspace_cachep);
   3748out:
   3749	return -ENOMEM;
   3750}
   3751
   3752void ext4_exit_mballoc(void)
   3753{
   3754	/*
   3755	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
   3756	 * before destroying the slab cache.
   3757	 */
   3758	rcu_barrier();
   3759	kmem_cache_destroy(ext4_pspace_cachep);
   3760	kmem_cache_destroy(ext4_ac_cachep);
   3761	kmem_cache_destroy(ext4_free_data_cachep);
   3762	ext4_groupinfo_destroy_slabs();
   3763}
   3764
   3765
   3766/*
   3767 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
   3768 * Returns 0 if success or error code
   3769 */
   3770static noinline_for_stack int
   3771ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
   3772				handle_t *handle, unsigned int reserv_clstrs)
   3773{
   3774	struct buffer_head *bitmap_bh = NULL;
   3775	struct ext4_group_desc *gdp;
   3776	struct buffer_head *gdp_bh;
   3777	struct ext4_sb_info *sbi;
   3778	struct super_block *sb;
   3779	ext4_fsblk_t block;
   3780	int err, len;
   3781
   3782	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
   3783	BUG_ON(ac->ac_b_ex.fe_len <= 0);
   3784
   3785	sb = ac->ac_sb;
   3786	sbi = EXT4_SB(sb);
   3787
   3788	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
   3789	if (IS_ERR(bitmap_bh)) {
   3790		err = PTR_ERR(bitmap_bh);
   3791		bitmap_bh = NULL;
   3792		goto out_err;
   3793	}
   3794
   3795	BUFFER_TRACE(bitmap_bh, "getting write access");
   3796	err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
   3797					    EXT4_JTR_NONE);
   3798	if (err)
   3799		goto out_err;
   3800
   3801	err = -EIO;
   3802	gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
   3803	if (!gdp)
   3804		goto out_err;
   3805
   3806	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
   3807			ext4_free_group_clusters(sb, gdp));
   3808
   3809	BUFFER_TRACE(gdp_bh, "get_write_access");
   3810	err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE);
   3811	if (err)
   3812		goto out_err;
   3813
   3814	block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
   3815
   3816	len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
   3817	if (!ext4_inode_block_valid(ac->ac_inode, block, len)) {
   3818		ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
   3819			   "fs metadata", block, block+len);
    3820		/* File system is mounted not to panic on error, so
    3821		 * fix the bitmap and return EFSCORRUPTED.
   3822		 * We leak some of the blocks here.
   3823		 */
   3824		ext4_lock_group(sb, ac->ac_b_ex.fe_group);
   3825		mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
   3826			      ac->ac_b_ex.fe_len);
   3827		ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
   3828		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
   3829		if (!err)
   3830			err = -EFSCORRUPTED;
   3831		goto out_err;
   3832	}
   3833
   3834	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
   3835#ifdef AGGRESSIVE_CHECK
   3836	{
   3837		int i;
   3838		for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
   3839			BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
   3840						bitmap_bh->b_data));
   3841		}
   3842	}
   3843#endif
   3844	mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
   3845		      ac->ac_b_ex.fe_len);
   3846	if (ext4_has_group_desc_csum(sb) &&
   3847	    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
   3848		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
   3849		ext4_free_group_clusters_set(sb, gdp,
   3850					     ext4_free_clusters_after_init(sb,
   3851						ac->ac_b_ex.fe_group, gdp));
   3852	}
   3853	len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
   3854	ext4_free_group_clusters_set(sb, gdp, len);
   3855	ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
   3856	ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
   3857
   3858	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
   3859	percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
   3860	/*
   3861	 * Now reduce the dirty block count also. Should not go negative
   3862	 */
   3863	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
   3864		/* release all the reserved blocks if non delalloc */
   3865		percpu_counter_sub(&sbi->s_dirtyclusters_counter,
   3866				   reserv_clstrs);
   3867
   3868	if (sbi->s_log_groups_per_flex) {
   3869		ext4_group_t flex_group = ext4_flex_group(sbi,
   3870							  ac->ac_b_ex.fe_group);
   3871		atomic64_sub(ac->ac_b_ex.fe_len,
   3872			     &sbi_array_rcu_deref(sbi, s_flex_groups,
   3873						  flex_group)->free_clusters);
   3874	}
   3875
   3876	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
   3877	if (err)
   3878		goto out_err;
   3879	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
   3880
   3881out_err:
   3882	brelse(bitmap_bh);
   3883	return err;
   3884}
   3885
   3886/*
   3887 * Idempotent helper for Ext4 fast commit replay path to set the state of
   3888 * blocks in bitmaps and update counters.
   3889 */
   3890void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
   3891			int len, int state)
   3892{
   3893	struct buffer_head *bitmap_bh = NULL;
   3894	struct ext4_group_desc *gdp;
   3895	struct buffer_head *gdp_bh;
   3896	struct ext4_sb_info *sbi = EXT4_SB(sb);
   3897	ext4_group_t group;
   3898	ext4_grpblk_t blkoff;
    3899	int i, err = 0;
   3900	int already;
   3901	unsigned int clen, clen_changed, thisgrp_len;
   3902
   3903	while (len > 0) {
   3904		ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
   3905
   3906		/*
   3907		 * Check to see if we are freeing blocks across a group
   3908		 * boundary.
    3909		 * In case of flex_bg, it can happen that (block, len)
    3910		 * spans across more than one group. In that case we need to
    3911		 * get the corresponding group metadata to work with.
    3912		 * For this we loop over the affected groups.
   3913		 */
   3914		thisgrp_len = min_t(unsigned int, (unsigned int)len,
   3915			EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
   3916		clen = EXT4_NUM_B2C(sbi, thisgrp_len);
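        		/*
        		 * Example (no bigalloc, 4 KiB blocks, 32768 blocks per
        		 * group): marking block 32760 with len 20 handles 8
        		 * blocks in this pass and loops again for the
        		 * remaining 12 in the next group.
        		 */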
   3917
   3918		if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
   3919			ext4_error(sb, "Marking blocks in system zone - "
   3920				   "Block = %llu, len = %u",
   3921				   block, thisgrp_len);
   3922			bitmap_bh = NULL;
   3923			break;
   3924		}
   3925
   3926		bitmap_bh = ext4_read_block_bitmap(sb, group);
   3927		if (IS_ERR(bitmap_bh)) {
   3928			err = PTR_ERR(bitmap_bh);
   3929			bitmap_bh = NULL;
   3930			break;
   3931		}
   3932
   3933		err = -EIO;
   3934		gdp = ext4_get_group_desc(sb, group, &gdp_bh);
   3935		if (!gdp)
   3936			break;
   3937
   3938		ext4_lock_group(sb, group);
   3939		already = 0;
   3940		for (i = 0; i < clen; i++)
   3941			if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
   3942					 !state)
   3943				already++;
   3944
   3945		clen_changed = clen - already;
   3946		if (state)
   3947			mb_set_bits(bitmap_bh->b_data, blkoff, clen);
   3948		else
   3949			mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
   3950		if (ext4_has_group_desc_csum(sb) &&
   3951		    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
   3952			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
   3953			ext4_free_group_clusters_set(sb, gdp,
   3954			     ext4_free_clusters_after_init(sb, group, gdp));
   3955		}
   3956		if (state)
   3957			clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
   3958		else
   3959			clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
   3960
   3961		ext4_free_group_clusters_set(sb, gdp, clen);
   3962		ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
   3963		ext4_group_desc_csum_set(sb, group, gdp);
   3964
   3965		ext4_unlock_group(sb, group);
   3966
   3967		if (sbi->s_log_groups_per_flex) {
   3968			ext4_group_t flex_group = ext4_flex_group(sbi, group);
   3969			struct flex_groups *fg = sbi_array_rcu_deref(sbi,
   3970						   s_flex_groups, flex_group);
   3971
   3972			if (state)
   3973				atomic64_sub(clen_changed, &fg->free_clusters);
   3974			else
   3975				atomic64_add(clen_changed, &fg->free_clusters);
   3976
   3977		}
   3978
   3979		err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
   3980		if (err)
   3981			break;
   3982		sync_dirty_buffer(bitmap_bh);
   3983		err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
   3984		sync_dirty_buffer(gdp_bh);
   3985		if (err)
   3986			break;
   3987
   3988		block += thisgrp_len;
   3989		len -= thisgrp_len;
   3990		brelse(bitmap_bh);
   3991		BUG_ON(len < 0);
   3992	}
   3993
   3994	if (err)
   3995		brelse(bitmap_bh);
   3996}
   3997
   3998/*
    3999 * Here we normalize the request for a locality group.
    4000 * Group requests are normalized to s_mb_group_prealloc, which is
    4001 * aligned to s_stripe if a stripe size was set via the mount option.
   4002 * s_mb_group_prealloc can be configured via
   4003 * /sys/fs/ext4/<partition>/mb_group_prealloc
   4004 *
   4005 * XXX: should we try to preallocate more than the group has now?
   4006 */
   4007static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
   4008{
   4009	struct super_block *sb = ac->ac_sb;
   4010	struct ext4_locality_group *lg = ac->ac_lg;
   4011
   4012	BUG_ON(lg == NULL);
   4013	ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
   4014	mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
   4015}
   4016
   4017/*
   4018 * Normalization means making request better in terms of
   4019 * size and alignment
   4020 */
   4021static noinline_for_stack void
   4022ext4_mb_normalize_request(struct ext4_allocation_context *ac,
   4023				struct ext4_allocation_request *ar)
   4024{
   4025	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
   4026	int bsbits, max;
   4027	ext4_lblk_t end;
   4028	loff_t size, start_off;
   4029	loff_t orig_size __maybe_unused;
   4030	ext4_lblk_t start;
   4031	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
   4032	struct ext4_prealloc_space *pa;
   4033
    4034	/* only normalize data requests; metadata requests
    4035	   do not need preallocation */
   4036	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
   4037		return;
   4038
    4039	/* sometimes the caller may want exact blocks */
   4040	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
   4041		return;
   4042
   4043	/* caller may indicate that preallocation isn't
   4044	 * required (it's a tail, for example) */
   4045	if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
   4046		return;
   4047
   4048	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
   4049		ext4_mb_normalize_group_request(ac);
   4050		return ;
   4051	}
   4052
   4053	bsbits = ac->ac_sb->s_blocksize_bits;
   4054
    4055	/* first, let's learn the actual file size
    4056	 * assuming the current request is allocated */
   4057	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
   4058	size = size << bsbits;
   4059	if (size < i_size_read(ac->ac_inode))
   4060		size = i_size_read(ac->ac_inode);
   4061	orig_size = size;
   4062
   4063	/* max size of free chunks */
   4064	max = 2 << bsbits;
   4065
   4066#define NRL_CHECK_SIZE(req, size, max, chunk_size)	\
   4067		(req <= (size) || max <= (chunk_size))
   4068
   4069	/* first, try to predict filesize */
   4070	/* XXX: should this table be tunable? */
   4071	start_off = 0;
   4072	if (size <= 16 * 1024) {
   4073		size = 16 * 1024;
   4074	} else if (size <= 32 * 1024) {
   4075		size = 32 * 1024;
   4076	} else if (size <= 64 * 1024) {
   4077		size = 64 * 1024;
   4078	} else if (size <= 128 * 1024) {
   4079		size = 128 * 1024;
   4080	} else if (size <= 256 * 1024) {
   4081		size = 256 * 1024;
   4082	} else if (size <= 512 * 1024) {
   4083		size = 512 * 1024;
   4084	} else if (size <= 1024 * 1024) {
   4085		size = 1024 * 1024;
   4086	} else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
   4087		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
   4088						(21 - bsbits)) << 21;
   4089		size = 2 * 1024 * 1024;
   4090	} else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
   4091		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
   4092							(22 - bsbits)) << 22;
   4093		size = 4 * 1024 * 1024;
   4094	} else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
   4095					(8<<20)>>bsbits, max, 8 * 1024)) {
   4096		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
   4097							(23 - bsbits)) << 23;
   4098		size = 8 * 1024 * 1024;
   4099	} else {
   4100		start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
   4101		size	  = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
   4102					      ac->ac_o_ex.fe_len) << bsbits;
   4103	}
   4104	size = size >> bsbits;
   4105	start = start_off >> bsbits;
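        	/*
        	 * Example with 4 KiB blocks (bsbits == 12): a predicted size of
        	 * 3 MiB falls into the 2 MiB bucket above, so size becomes 512
        	 * blocks and start is fe_logical rounded down to a 512-block
        	 * (2 MiB) boundary.
        	 */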
   4106
   4107	/*
   4108	 * For tiny groups (smaller than 8MB) the chosen allocation
   4109	 * alignment may be larger than group size. Make sure the
   4110	 * alignment does not move allocation to a different group which
   4111	 * makes mballoc fail assertions later.
   4112	 */
   4113	start = max(start, rounddown(ac->ac_o_ex.fe_logical,
   4114			(ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
   4115
   4116	/* don't cover already allocated blocks in selected range */
   4117	if (ar->pleft && start <= ar->lleft) {
   4118		size -= ar->lleft + 1 - start;
   4119		start = ar->lleft + 1;
   4120	}
   4121	if (ar->pright && start + size - 1 >= ar->lright)
   4122		size -= start + size - ar->lright;
   4123
   4124	/*
   4125	 * Trim allocation request for filesystems with artificially small
   4126	 * groups.
   4127	 */
   4128	if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
   4129		size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
   4130
   4131	end = start + size;
   4132
   4133	/* check we don't cross already preallocated blocks */
   4134	rcu_read_lock();
   4135	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
   4136		ext4_lblk_t pa_end;
   4137
   4138		if (pa->pa_deleted)
   4139			continue;
   4140		spin_lock(&pa->pa_lock);
   4141		if (pa->pa_deleted) {
   4142			spin_unlock(&pa->pa_lock);
   4143			continue;
   4144		}
   4145
   4146		pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
   4147						  pa->pa_len);
   4148
   4149		/* PA must not overlap original request */
   4150		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
   4151			ac->ac_o_ex.fe_logical < pa->pa_lstart));
   4152
   4153		/* skip PAs this normalized request doesn't overlap with */
   4154		if (pa->pa_lstart >= end || pa_end <= start) {
   4155			spin_unlock(&pa->pa_lock);
   4156			continue;
   4157		}
   4158		BUG_ON(pa->pa_lstart <= start && pa_end >= end);
   4159
   4160		/* adjust start or end to be adjacent to this pa */
   4161		if (pa_end <= ac->ac_o_ex.fe_logical) {
   4162			BUG_ON(pa_end < start);
   4163			start = pa_end;
   4164		} else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
   4165			BUG_ON(pa->pa_lstart > end);
   4166			end = pa->pa_lstart;
   4167		}
   4168		spin_unlock(&pa->pa_lock);
   4169	}
   4170	rcu_read_unlock();
   4171	size = end - start;
   4172
   4173	/* XXX: extra loop to check we really don't overlap preallocations */
   4174	rcu_read_lock();
   4175	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
   4176		ext4_lblk_t pa_end;
   4177
   4178		spin_lock(&pa->pa_lock);
   4179		if (pa->pa_deleted == 0) {
   4180			pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
   4181							  pa->pa_len);
   4182			BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
   4183		}
   4184		spin_unlock(&pa->pa_lock);
   4185	}
   4186	rcu_read_unlock();
   4187
   4188	/*
   4189	 * In this function "start" and "size" are normalized for better
   4190	 * alignment and length such that we could preallocate more blocks.
   4191	 * This normalization is done such that original request of
   4192	 * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and
   4193	 * "size" boundaries.
    4194	 * (Note fe_len can be relaxed since the FS block allocation API does not
    4195	 * guarantee the number of contiguous blocks allocated, since that
    4196	 * depends upon the free space left, etc).
   4197	 * In case of inode pa, later we use the allocated blocks
   4198	 * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated
   4199	 * range of goal/best blocks [start, size] to put it at the
   4200	 * ac_o_ex.fe_logical extent of this inode.
   4201	 * (See ext4_mb_use_inode_pa() for more details)
   4202	 */
   4203	if (start + size <= ac->ac_o_ex.fe_logical ||
   4204			start > ac->ac_o_ex.fe_logical) {
   4205		ext4_msg(ac->ac_sb, KERN_ERR,
   4206			 "start %lu, size %lu, fe_logical %lu",
   4207			 (unsigned long) start, (unsigned long) size,
   4208			 (unsigned long) ac->ac_o_ex.fe_logical);
   4209		BUG();
   4210	}
   4211	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
   4212
   4213	/* now prepare goal request */
   4214
    4215	/* XXX: is it better to align blocks with respect to logical
    4216	 * placement or to satisfy a big request as is */
   4217	ac->ac_g_ex.fe_logical = start;
   4218	ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
   4219
   4220	/* define goal start in order to merge */
   4221	if (ar->pright && (ar->lright == (start + size))) {
   4222		/* merge to the right */
   4223		ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
   4224						&ac->ac_f_ex.fe_group,
   4225						&ac->ac_f_ex.fe_start);
   4226		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
   4227	}
   4228	if (ar->pleft && (ar->lleft + 1 == start)) {
   4229		/* merge to the left */
   4230		ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
   4231						&ac->ac_f_ex.fe_group,
   4232						&ac->ac_f_ex.fe_start);
   4233		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
   4234	}
   4235
   4236	mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size,
   4237		 orig_size, start);
   4238}
   4239
   4240static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
   4241{
   4242	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
   4243
   4244	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
   4245		atomic_inc(&sbi->s_bal_reqs);
   4246		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
   4247		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
   4248			atomic_inc(&sbi->s_bal_success);
   4249		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
   4250		atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
   4251		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
   4252				ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
   4253			atomic_inc(&sbi->s_bal_goals);
   4254		if (ac->ac_found > sbi->s_mb_max_to_scan)
   4255			atomic_inc(&sbi->s_bal_breaks);
   4256	}
   4257
   4258	if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
   4259		trace_ext4_mballoc_alloc(ac);
   4260	else
   4261		trace_ext4_mballoc_prealloc(ac);
   4262}
   4263
   4264/*
   4265 * Called on failure; free up any blocks from the inode PA for this
   4266 * context.  We don't need this for MB_GROUP_PA because we only change
   4267 * pa_free in ext4_mb_release_context(), but on failure, we've already
   4268 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
   4269 */
   4270static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
   4271{
   4272	struct ext4_prealloc_space *pa = ac->ac_pa;
   4273	struct ext4_buddy e4b;
   4274	int err;
   4275
   4276	if (pa == NULL) {
   4277		if (ac->ac_f_ex.fe_len == 0)
   4278			return;
   4279		err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
   4280		if (err) {
   4281			/*
   4282			 * This should never happen since we pin the
   4283			 * pages in the ext4_allocation_context so
   4284			 * ext4_mb_load_buddy() should never fail.
   4285			 */
   4286			WARN(1, "mb_load_buddy failed (%d)", err);
   4287			return;
   4288		}
   4289		ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
   4290		mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
   4291			       ac->ac_f_ex.fe_len);
   4292		ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
   4293		ext4_mb_unload_buddy(&e4b);
   4294		return;
   4295	}
   4296	if (pa->pa_type == MB_INODE_PA)
   4297		pa->pa_free += ac->ac_b_ex.fe_len;
   4298}
   4299
   4300/*
   4301 * use blocks preallocated to inode
   4302 */
   4303static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
   4304				struct ext4_prealloc_space *pa)
   4305{
   4306	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
   4307	ext4_fsblk_t start;
   4308	ext4_fsblk_t end;
   4309	int len;
   4310
   4311	/* found preallocated blocks, use them */
   4312	start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
   4313	end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
   4314		  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
   4315	len = EXT4_NUM_B2C(sbi, end - start);
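        	/*
        	 * Illustration (assuming cluster size == block size):
        	 * pa_lstart 100, pa_pstart 5000, pa_len 64; a request for 8
        	 * blocks at logical block 110 maps to physical start 5010
        	 * with len 8.
        	 */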
   4316	ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
   4317					&ac->ac_b_ex.fe_start);
   4318	ac->ac_b_ex.fe_len = len;
   4319	ac->ac_status = AC_STATUS_FOUND;
   4320	ac->ac_pa = pa;
   4321
   4322	BUG_ON(start < pa->pa_pstart);
   4323	BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
   4324	BUG_ON(pa->pa_free < len);
   4325	pa->pa_free -= len;
   4326
   4327	mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
   4328}
   4329
   4330/*
   4331 * use blocks preallocated to locality group
   4332 */
   4333static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
   4334				struct ext4_prealloc_space *pa)
   4335{
   4336	unsigned int len = ac->ac_o_ex.fe_len;
   4337
   4338	ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
   4339					&ac->ac_b_ex.fe_group,
   4340					&ac->ac_b_ex.fe_start);
   4341	ac->ac_b_ex.fe_len = len;
   4342	ac->ac_status = AC_STATUS_FOUND;
   4343	ac->ac_pa = pa;
   4344
    4345	/* we don't correct pa_pstart or pa_len here to avoid
    4346	 * possible race when the group is being loaded concurrently;
    4347	 * instead we correct the pa later, after blocks are marked
   4348	 * in on-disk bitmap -- see ext4_mb_release_context()
   4349	 * Other CPUs are prevented from allocating from this pa by lg_mutex
   4350	 */
   4351	mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
   4352		 pa->pa_lstart-len, len, pa);
   4353}
   4354
   4355/*
    4356 * Return the prealloc space that has the minimal distance
    4357 * from the goal block. @cpa is the prealloc
    4358 * space with the currently known minimal distance
   4359 * from the goal block.
   4360 */
   4361static struct ext4_prealloc_space *
   4362ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
   4363			struct ext4_prealloc_space *pa,
   4364			struct ext4_prealloc_space *cpa)
   4365{
   4366	ext4_fsblk_t cur_distance, new_distance;
   4367
   4368	if (cpa == NULL) {
   4369		atomic_inc(&pa->pa_count);
   4370		return pa;
   4371	}
   4372	cur_distance = abs(goal_block - cpa->pa_pstart);
   4373	new_distance = abs(goal_block - pa->pa_pstart);
   4374
   4375	if (cur_distance <= new_distance)
   4376		return cpa;
   4377
   4378	/* drop the previous reference */
   4379	atomic_dec(&cpa->pa_count);
   4380	atomic_inc(&pa->pa_count);
   4381	return pa;
   4382}
   4383
   4384/*
   4385 * search goal blocks in preallocated space
   4386 */
   4387static noinline_for_stack bool
   4388ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
   4389{
   4390	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
   4391	int order, i;
   4392	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
   4393	struct ext4_locality_group *lg;
   4394	struct ext4_prealloc_space *pa, *cpa = NULL;
   4395	ext4_fsblk_t goal_block;
   4396
   4397	/* only data can be preallocated */
   4398	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
   4399		return false;
   4400
   4401	/* first, try per-file preallocation */
   4402	rcu_read_lock();
   4403	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
   4404
    4405		/* none of the fields in this condition change,
    4406		 * so we can skip locking for them */
   4407		if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
   4408		    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
   4409					       EXT4_C2B(sbi, pa->pa_len)))
   4410			continue;
   4411
   4412		/* non-extent files can't have physical blocks past 2^32 */
   4413		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
   4414		    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
   4415		     EXT4_MAX_BLOCK_FILE_PHYS))
   4416			continue;
   4417
   4418		/* found preallocated blocks, use them */
   4419		spin_lock(&pa->pa_lock);
   4420		if (pa->pa_deleted == 0 && pa->pa_free) {
   4421			atomic_inc(&pa->pa_count);
   4422			ext4_mb_use_inode_pa(ac, pa);
   4423			spin_unlock(&pa->pa_lock);
   4424			ac->ac_criteria = 10;
   4425			rcu_read_unlock();
   4426			return true;
   4427		}
   4428		spin_unlock(&pa->pa_lock);
   4429	}
   4430	rcu_read_unlock();
   4431
   4432	/* can we use group allocation? */
   4433	if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
   4434		return false;
   4435
   4436	/* inode may have no locality group for some reason */
   4437	lg = ac->ac_lg;
   4438	if (lg == NULL)
   4439		return false;
   4440	order  = fls(ac->ac_o_ex.fe_len) - 1;
   4441	if (order > PREALLOC_TB_SIZE - 1)
   4442		/* The max size of hash table is PREALLOC_TB_SIZE */
   4443		order = PREALLOC_TB_SIZE - 1;
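        	/*
        	 * e.g. a request for 13 blocks has order fls(13) - 1 == 3, so
        	 * the locality group lists for orders 3..PREALLOC_TB_SIZE-1 are
        	 * scanned below.
        	 */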
   4444
   4445	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
   4446	/*
    4447	 * search for the prealloc space that has the minimal
    4448	 * distance from the goal block.
   4449	 */
   4450	for (i = order; i < PREALLOC_TB_SIZE; i++) {
   4451		rcu_read_lock();
   4452		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
   4453					pa_inode_list) {
   4454			spin_lock(&pa->pa_lock);
   4455			if (pa->pa_deleted == 0 &&
   4456					pa->pa_free >= ac->ac_o_ex.fe_len) {
   4457
   4458				cpa = ext4_mb_check_group_pa(goal_block,
   4459								pa, cpa);
   4460			}
   4461			spin_unlock(&pa->pa_lock);
   4462		}
   4463		rcu_read_unlock();
   4464	}
   4465	if (cpa) {
   4466		ext4_mb_use_group_pa(ac, cpa);
   4467		ac->ac_criteria = 20;
   4468		return true;
   4469	}
   4470	return false;
   4471}
   4472
   4473/*
    4474 * the function goes through all blocks freed in the group
    4475 * but not yet committed and marks them used in the in-core bitmap.
    4476 * the buddy must be generated from this bitmap.
    4477 * Needs to be called with the ext4 group lock held
   4478 */
   4479static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
   4480						ext4_group_t group)
   4481{
   4482	struct rb_node *n;
   4483	struct ext4_group_info *grp;
   4484	struct ext4_free_data *entry;
   4485
   4486	grp = ext4_get_group_info(sb, group);
   4487	n = rb_first(&(grp->bb_free_root));
   4488
   4489	while (n) {
   4490		entry = rb_entry(n, struct ext4_free_data, efd_node);
   4491		mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
   4492		n = rb_next(n);
   4493	}
   4494	return;
   4495}
   4496
   4497/*
    4498 * the function goes through all preallocations in this group and marks them
    4499 * used in the in-core bitmap. the buddy must be generated from this bitmap.
    4500 * Needs to be called with the ext4 group lock held
   4501 */
   4502static noinline_for_stack
   4503void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
   4504					ext4_group_t group)
   4505{
   4506	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
   4507	struct ext4_prealloc_space *pa;
   4508	struct list_head *cur;
   4509	ext4_group_t groupnr;
   4510	ext4_grpblk_t start;
   4511	int preallocated = 0;
   4512	int len;
   4513
    4514	/* all forms of preallocation discard first load the group,
    4515	 * so the only competing code is preallocation use.
    4516	 * we don't need any locking here.
    4517	 * notice we do NOT ignore preallocations with pa_deleted set,
    4518	 * otherwise we could leave used blocks available for
    4519	 * allocation in the buddy when a concurrent ext4_mb_put_pa()
    4520	 * is dropping the preallocation
   4521	 */
   4522	list_for_each(cur, &grp->bb_prealloc_list) {
   4523		pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
   4524		spin_lock(&pa->pa_lock);
   4525		ext4_get_group_no_and_offset(sb, pa->pa_pstart,
   4526					     &groupnr, &start);
   4527		len = pa->pa_len;
   4528		spin_unlock(&pa->pa_lock);
   4529		if (unlikely(len == 0))
   4530			continue;
   4531		BUG_ON(groupnr != group);
   4532		mb_set_bits(bitmap, start, len);
   4533		preallocated += len;
   4534	}
   4535	mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
   4536}
   4537
   4538static void ext4_mb_mark_pa_deleted(struct super_block *sb,
   4539				    struct ext4_prealloc_space *pa)
   4540{
   4541	struct ext4_inode_info *ei;
   4542
   4543	if (pa->pa_deleted) {
   4544		ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n",
   4545			     pa->pa_type, pa->pa_pstart, pa->pa_lstart,
   4546			     pa->pa_len);
   4547		return;
   4548	}
   4549
   4550	pa->pa_deleted = 1;
   4551
   4552	if (pa->pa_type == MB_INODE_PA) {
   4553		ei = EXT4_I(pa->pa_inode);
   4554		atomic_dec(&ei->i_prealloc_active);
   4555	}
   4556}
   4557
   4558static void ext4_mb_pa_callback(struct rcu_head *head)
   4559{
   4560	struct ext4_prealloc_space *pa;
   4561	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
   4562
   4563	BUG_ON(atomic_read(&pa->pa_count));
   4564	BUG_ON(pa->pa_deleted == 0);
   4565	kmem_cache_free(ext4_pspace_cachep, pa);
   4566}
   4567
   4568/*
   4569 * drops a reference to preallocated space descriptor
   4570 * if this was the last reference and the space is consumed
   4571 */
   4572static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
   4573			struct super_block *sb, struct ext4_prealloc_space *pa)
   4574{
   4575	ext4_group_t grp;
   4576	ext4_fsblk_t grp_blk;
   4577
   4578	/* in this short window concurrent discard can set pa_deleted */
   4579	spin_lock(&pa->pa_lock);
   4580	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
   4581		spin_unlock(&pa->pa_lock);
   4582		return;
   4583	}
   4584
   4585	if (pa->pa_deleted == 1) {
   4586		spin_unlock(&pa->pa_lock);
   4587		return;
   4588	}
   4589
   4590	ext4_mb_mark_pa_deleted(sb, pa);
   4591	spin_unlock(&pa->pa_lock);
   4592
   4593	grp_blk = pa->pa_pstart;
   4594	/*
   4595	 * If doing group-based preallocation, pa_pstart may be in the
   4596	 * next group when pa is used up
   4597	 */
   4598	if (pa->pa_type == MB_GROUP_PA)
   4599		grp_blk--;
   4600
   4601	grp = ext4_get_group_number(sb, grp_blk);
   4602
   4603	/*
   4604	 * possible race:
   4605	 *
   4606	 *  P1 (buddy init)			P2 (regular allocation)
   4607	 *					find block B in PA
   4608	 *  copy on-disk bitmap to buddy
   4609	 *  					mark B in on-disk bitmap
   4610	 *					drop PA from group
   4611	 *  mark all PAs in buddy
   4612	 *
   4613	 * thus, P1 initializes buddy with B available. to prevent this
   4614	 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
   4615	 * against that pair
   4616	 */
   4617	ext4_lock_group(sb, grp);
   4618	list_del(&pa->pa_group_list);
   4619	ext4_unlock_group(sb, grp);
   4620
   4621	spin_lock(pa->pa_obj_lock);
   4622	list_del_rcu(&pa->pa_inode_list);
   4623	spin_unlock(pa->pa_obj_lock);
   4624
   4625	call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
   4626}
   4627
   4628/*
   4629 * creates new preallocated space for given inode
   4630 */
   4631static noinline_for_stack void
   4632ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
   4633{
   4634	struct super_block *sb = ac->ac_sb;
   4635	struct ext4_sb_info *sbi = EXT4_SB(sb);
   4636	struct ext4_prealloc_space *pa;
   4637	struct ext4_group_info *grp;
   4638	struct ext4_inode_info *ei;
   4639
    4640	/* preallocate only when found space is larger than requested */
   4641	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
   4642	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
   4643	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
   4644	BUG_ON(ac->ac_pa == NULL);
   4645
   4646	pa = ac->ac_pa;
   4647
   4648	if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
   4649		int winl;
   4650		int wins;
   4651		int win;
   4652		int offs;
   4653
   4654		/* we can't allocate as much as normalizer wants.
    4655		 * so, the found space must get a proper lstart
    4656		 * to cover the original request */
   4657		BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
   4658		BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
   4659
    4660		/* we're limited by the original request in that
    4661		 * the logical block must be covered anyway;
    4662		 * winl is the window we can move our chunk within */
   4663		winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
   4664
   4665		/* also, we should cover whole original request */
   4666		wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
   4667
   4668		/* the smallest one defines real window */
   4669		win = min(winl, wins);
   4670
   4671		offs = ac->ac_o_ex.fe_logical %
   4672			EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
   4673		if (offs && offs < win)
   4674			win = offs;
   4675
   4676		ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
   4677			EXT4_NUM_B2C(sbi, win);
   4678		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
   4679		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
   4680	}
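        	/*
        	 * Illustration (blocks == clusters): goal [64, +64), original
        	 * [100, +8), best length 16: winl = 36, wins = 8, offs = 4,
        	 * so win = 4 and the pa's logical start becomes 96, still
        	 * covering the original request.
        	 */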
   4681
   4682	/* preallocation can change ac_b_ex, thus we store actually
   4683	 * allocated blocks for history */
   4684	ac->ac_f_ex = ac->ac_b_ex;
   4685
   4686	pa->pa_lstart = ac->ac_b_ex.fe_logical;
   4687	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
   4688	pa->pa_len = ac->ac_b_ex.fe_len;
   4689	pa->pa_free = pa->pa_len;
   4690	spin_lock_init(&pa->pa_lock);
   4691	INIT_LIST_HEAD(&pa->pa_inode_list);
   4692	INIT_LIST_HEAD(&pa->pa_group_list);
   4693	pa->pa_deleted = 0;
   4694	pa->pa_type = MB_INODE_PA;
   4695
   4696	mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
   4697		 pa->pa_len, pa->pa_lstart);
   4698	trace_ext4_mb_new_inode_pa(ac, pa);
   4699
   4700	ext4_mb_use_inode_pa(ac, pa);
   4701	atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
   4702
   4703	ei = EXT4_I(ac->ac_inode);
   4704	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
   4705
   4706	pa->pa_obj_lock = &ei->i_prealloc_lock;
   4707	pa->pa_inode = ac->ac_inode;
   4708
   4709	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
   4710
   4711	spin_lock(pa->pa_obj_lock);
   4712	list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
   4713	spin_unlock(pa->pa_obj_lock);
   4714	atomic_inc(&ei->i_prealloc_active);
   4715}
   4716
   4717/*
   4718 * creates new preallocated space for locality group inodes belongs to
   4719 */
   4720static noinline_for_stack void
   4721ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
   4722{
   4723	struct super_block *sb = ac->ac_sb;
   4724	struct ext4_locality_group *lg;
   4725	struct ext4_prealloc_space *pa;
   4726	struct ext4_group_info *grp;
   4727
    4728	/* preallocate only when found space is larger than requested */
   4729	BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
   4730	BUG_ON(ac->ac_status != AC_STATUS_FOUND);
   4731	BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
   4732	BUG_ON(ac->ac_pa == NULL);
   4733
   4734	pa = ac->ac_pa;
   4735
   4736	/* preallocation can change ac_b_ex, thus we store actually
   4737	 * allocated blocks for history */
   4738	ac->ac_f_ex = ac->ac_b_ex;
   4739
   4740	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
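        	/* a group PA is not tied to any file offset, so reuse pa_pstart
        	 * as the logical start as well */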
   4741	pa->pa_lstart = pa->pa_pstart;
   4742	pa->pa_len = ac->ac_b_ex.fe_len;
   4743	pa->pa_free = pa->pa_len;
   4744	spin_lock_init(&pa->pa_lock);
   4745	INIT_LIST_HEAD(&pa->pa_inode_list);
   4746	INIT_LIST_HEAD(&pa->pa_group_list);
   4747	pa->pa_deleted = 0;
   4748	pa->pa_type = MB_GROUP_PA;
   4749
   4750	mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart,
   4751		 pa->pa_len, pa->pa_lstart);
   4752	trace_ext4_mb_new_group_pa(ac, pa);
   4753
   4754	ext4_mb_use_group_pa(ac, pa);
   4755	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
   4756
   4757	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
   4758	lg = ac->ac_lg;
   4759	BUG_ON(lg == NULL);
   4760
   4761	pa->pa_obj_lock = &lg->lg_prealloc_lock;
   4762	pa->pa_inode = NULL;
   4763
   4764	list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
   4765
   4766	/*
   4767	 * We will later add the new pa to the right bucket
   4768	 * after updating the pa_free in ext4_mb_release_context
   4769	 */
   4770}
   4771
   4772static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
   4773{
   4774	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
   4775		ext4_mb_new_group_pa(ac);
   4776	else
   4777		ext4_mb_new_inode_pa(ac);
   4778}
   4779
   4780/*
   4781 * finds all unused blocks in on-disk bitmap, frees them in
   4782 * in-core bitmap and buddy.
   4783 * @pa must be unlinked from inode and group lists, so that
   4784 * nobody else can find/use it.
   4785 * the caller MUST hold group/inode locks.
   4786 * TODO: optimize the case when there are no in-core structures yet
   4787 */
   4788static noinline_for_stack int
   4789ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
   4790			struct ext4_prealloc_space *pa)
   4791{
   4792	struct super_block *sb = e4b->bd_sb;
   4793	struct ext4_sb_info *sbi = EXT4_SB(sb);
   4794	unsigned int end;
   4795	unsigned int next;
   4796	ext4_group_t group;
   4797	ext4_grpblk_t bit;
   4798	unsigned long long grp_blk_start;
   4799	int free = 0;
   4800
   4801	BUG_ON(pa->pa_deleted == 0);
   4802	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
   4803	grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
   4804	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
   4805	end = bit + pa->pa_len;
   4806
   4807	while (bit < end) {
   4808		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
   4809		if (bit >= end)
   4810			break;
   4811		next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
   4812		mb_debug(sb, "free preallocated %u/%u in group %u\n",
   4813			 (unsigned) ext4_group_first_block_no(sb, group) + bit,
   4814			 (unsigned) next - bit, (unsigned) group);
   4815		free += next - bit;
   4816
   4817		trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
   4818		trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
   4819						    EXT4_C2B(sbi, bit)),
   4820					       next - bit);
   4821		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
   4822		bit = next + 1;
   4823	}
   4824	if (free != pa->pa_free) {
   4825		ext4_msg(e4b->bd_sb, KERN_CRIT,
   4826			 "pa %p: logic %lu, phys. %lu, len %d",
   4827			 pa, (unsigned long) pa->pa_lstart,
   4828			 (unsigned long) pa->pa_pstart,
   4829			 pa->pa_len);
   4830		ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
   4831					free, pa->pa_free);
   4832		/*
   4833		 * pa is already deleted so we use the value obtained
   4834		 * from the bitmap and continue.
   4835		 */
   4836	}
   4837	atomic_add(free, &sbi->s_mb_discarded);
   4838
   4839	return 0;
   4840}
   4841
   4842static noinline_for_stack int
   4843ext4_mb_release_group_pa(struct ext4_buddy *e4b,
   4844				struct ext4_prealloc_space *pa)
   4845{
   4846	struct super_block *sb = e4b->bd_sb;
   4847	ext4_group_t group;
   4848	ext4_grpblk_t bit;
   4849
   4850	trace_ext4_mb_release_group_pa(sb, pa);
   4851	BUG_ON(pa->pa_deleted == 0);
   4852	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
   4853	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
   4854	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
   4855	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
   4856	trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
   4857
   4858	return 0;
   4859}
   4860
   4861/*
   4862 * releases all preallocations in given group
   4863 *
   4864 * first, we need to decide discard policy:
   4865 * - when do we discard
   4866 *   1) ENOSPC
   4867 * - how many do we discard
   4868 *   1) how many requested
   4869 */
   4870static noinline_for_stack int
   4871ext4_mb_discard_group_preallocations(struct super_block *sb,
   4872				     ext4_group_t group, int *busy)
   4873{
   4874	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
   4875	struct buffer_head *bitmap_bh = NULL;
   4876	struct ext4_prealloc_space *pa, *tmp;
   4877	struct list_head list;
   4878	struct ext4_buddy e4b;
   4879	int err;
   4880	int free = 0;
   4881
   4882	mb_debug(sb, "discard preallocation for group %u\n", group);
   4883	if (list_empty(&grp->bb_prealloc_list))
   4884		goto out_dbg;
   4885
   4886	bitmap_bh = ext4_read_block_bitmap(sb, group);
   4887	if (IS_ERR(bitmap_bh)) {
   4888		err = PTR_ERR(bitmap_bh);
   4889		ext4_error_err(sb, -err,
   4890			       "Error %d reading block bitmap for %u",
   4891			       err, group);
   4892		goto out_dbg;
   4893	}
   4894
   4895	err = ext4_mb_load_buddy(sb, group, &e4b);
   4896	if (err) {
   4897		ext4_warning(sb, "Error %d loading buddy information for %u",
   4898			     err, group);
   4899		put_bh(bitmap_bh);
   4900		goto out_dbg;
   4901	}
   4902
   4903	INIT_LIST_HEAD(&list);
   4904	ext4_lock_group(sb, group);
   4905	list_for_each_entry_safe(pa, tmp,
   4906				&grp->bb_prealloc_list, pa_group_list) {
   4907		spin_lock(&pa->pa_lock);
   4908		if (atomic_read(&pa->pa_count)) {
   4909			spin_unlock(&pa->pa_lock);
   4910			*busy = 1;
   4911			continue;
   4912		}
   4913		if (pa->pa_deleted) {
   4914			spin_unlock(&pa->pa_lock);
   4915			continue;
   4916		}
   4917
   4918		/* seems this one can be freed ... */
   4919		ext4_mb_mark_pa_deleted(sb, pa);
   4920
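        		/* bump the per-cpu discard sequence only once per group scan;
        		 * ext4_mb_discard_preallocations_should_retry() compares the
        		 * summed sequence to decide whether a failed allocation should
        		 * retry after this discard */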
   4921		if (!free)
   4922			this_cpu_inc(discard_pa_seq);
   4923
   4924		/* we can trust pa_free ... */
   4925		free += pa->pa_free;
   4926
   4927		spin_unlock(&pa->pa_lock);
   4928
   4929		list_del(&pa->pa_group_list);
   4930		list_add(&pa->u.pa_tmp_list, &list);
   4931	}
   4932
   4933	/* now free all selected PAs */
   4934	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
   4935
   4936		/* remove from object (inode or locality group) */
   4937		spin_lock(pa->pa_obj_lock);
   4938		list_del_rcu(&pa->pa_inode_list);
   4939		spin_unlock(pa->pa_obj_lock);
   4940
   4941		if (pa->pa_type == MB_GROUP_PA)
   4942			ext4_mb_release_group_pa(&e4b, pa);
   4943		else
   4944			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
   4945
   4946		list_del(&pa->u.pa_tmp_list);
   4947		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
   4948	}
   4949
   4950	ext4_unlock_group(sb, group);
   4951	ext4_mb_unload_buddy(&e4b);
   4952	put_bh(bitmap_bh);
   4953out_dbg:
   4954	mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n",
   4955		 free, group, grp->bb_free);
   4956	return free;
   4957}
   4958
   4959/*
    4960 * releases all unused preallocated blocks for the given inode
   4961 *
   4962 * It's important to discard preallocations under i_data_sem
   4963 * We don't want another block to be served from the prealloc
   4964 * space when we are discarding the inode prealloc space.
   4965 *
   4966 * FIXME!! Make sure it is valid at all the call sites
   4967 */
   4968void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
   4969{
   4970	struct ext4_inode_info *ei = EXT4_I(inode);
   4971	struct super_block *sb = inode->i_sb;
   4972	struct buffer_head *bitmap_bh = NULL;
   4973	struct ext4_prealloc_space *pa, *tmp;
   4974	ext4_group_t group = 0;
   4975	struct list_head list;
   4976	struct ext4_buddy e4b;
   4977	int err;
   4978
   4979	if (!S_ISREG(inode->i_mode)) {
   4980		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
   4981		return;
   4982	}
   4983
   4984	if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
   4985		return;
   4986
   4987	mb_debug(sb, "discard preallocation for inode %lu\n",
   4988		 inode->i_ino);
   4989	trace_ext4_discard_preallocations(inode,
   4990			atomic_read(&ei->i_prealloc_active), needed);
   4991
   4992	INIT_LIST_HEAD(&list);
   4993
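        	/* needed == 0 means "discard every preallocation of this inode" */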
   4994	if (needed == 0)
   4995		needed = UINT_MAX;
   4996
   4997repeat:
   4998	/* first, collect all pa's in the inode */
   4999	spin_lock(&ei->i_prealloc_lock);
   5000	while (!list_empty(&ei->i_prealloc_list) && needed) {
   5001		pa = list_entry(ei->i_prealloc_list.prev,
   5002				struct ext4_prealloc_space, pa_inode_list);
   5003		BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
   5004		spin_lock(&pa->pa_lock);
   5005		if (atomic_read(&pa->pa_count)) {
   5006			/* this shouldn't happen often - nobody should
   5007			 * use preallocation while we're discarding it */
   5008			spin_unlock(&pa->pa_lock);
   5009			spin_unlock(&ei->i_prealloc_lock);
   5010			ext4_msg(sb, KERN_ERR,
   5011				 "uh-oh! used pa while discarding");
   5012			WARN_ON(1);
   5013			schedule_timeout_uninterruptible(HZ);
   5014			goto repeat;
   5015
   5016		}
   5017		if (pa->pa_deleted == 0) {
   5018			ext4_mb_mark_pa_deleted(sb, pa);
   5019			spin_unlock(&pa->pa_lock);
   5020			list_del_rcu(&pa->pa_inode_list);
   5021			list_add(&pa->u.pa_tmp_list, &list);
   5022			needed--;
   5023			continue;
   5024		}
   5025
   5026		/* someone is deleting pa right now */
   5027		spin_unlock(&pa->pa_lock);
   5028		spin_unlock(&ei->i_prealloc_lock);
   5029
    5030		/* we have to wait here because pa_deleted
    5031		 * doesn't mean pa is already unlinked from
    5032		 * the list. As we might be called from
    5033		 * ->clear_inode(), the inode will get freed
    5034		 * and a concurrent thread which is unlinking
    5035		 * pa from the inode's list may access already
    5036		 * freed memory, bad-bad-bad */
   5037
   5038		/* XXX: if this happens too often, we can
   5039		 * add a flag to force wait only in case
   5040		 * of ->clear_inode(), but not in case of
   5041		 * regular truncate */
   5042		schedule_timeout_uninterruptible(HZ);
   5043		goto repeat;
   5044	}
   5045	spin_unlock(&ei->i_prealloc_lock);
   5046
   5047	list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
   5048		BUG_ON(pa->pa_type != MB_INODE_PA);
   5049		group = ext4_get_group_number(sb, pa->pa_pstart);
   5050
   5051		err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
   5052					     GFP_NOFS|__GFP_NOFAIL);
   5053		if (err) {
   5054			ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
   5055				       err, group);
   5056			continue;
   5057		}
   5058
   5059		bitmap_bh = ext4_read_block_bitmap(sb, group);
   5060		if (IS_ERR(bitmap_bh)) {
   5061			err = PTR_ERR(bitmap_bh);
   5062			ext4_error_err(sb, -err, "Error %d reading block bitmap for %u",
   5063				       err, group);
   5064			ext4_mb_unload_buddy(&e4b);
   5065			continue;
   5066		}
   5067
   5068		ext4_lock_group(sb, group);
   5069		list_del(&pa->pa_group_list);
   5070		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
   5071		ext4_unlock_group(sb, group);
   5072
   5073		ext4_mb_unload_buddy(&e4b);
   5074		put_bh(bitmap_bh);
   5075
   5076		list_del(&pa->u.pa_tmp_list);
   5077		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
   5078	}
   5079}
   5080
   5081static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
   5082{
   5083	struct ext4_prealloc_space *pa;
   5084
   5085	BUG_ON(ext4_pspace_cachep == NULL);
   5086	pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
   5087	if (!pa)
   5088		return -ENOMEM;
   5089	atomic_set(&pa->pa_count, 1);
   5090	ac->ac_pa = pa;
   5091	return 0;
   5092}
   5093
   5094static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
   5095{
   5096	struct ext4_prealloc_space *pa = ac->ac_pa;
   5097
   5098	BUG_ON(!pa);
   5099	ac->ac_pa = NULL;
   5100	WARN_ON(!atomic_dec_and_test(&pa->pa_count));
   5101	kmem_cache_free(ext4_pspace_cachep, pa);
   5102}
   5103
   5104#ifdef CONFIG_EXT4_DEBUG
   5105static inline void ext4_mb_show_pa(struct super_block *sb)
   5106{
   5107	ext4_group_t i, ngroups;
   5108
   5109	if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
   5110		return;
   5111
   5112	ngroups = ext4_get_groups_count(sb);
   5113	mb_debug(sb, "groups: ");
   5114	for (i = 0; i < ngroups; i++) {
   5115		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
   5116		struct ext4_prealloc_space *pa;
   5117		ext4_grpblk_t start;
   5118		struct list_head *cur;
   5119		ext4_lock_group(sb, i);
   5120		list_for_each(cur, &grp->bb_prealloc_list) {
   5121			pa = list_entry(cur, struct ext4_prealloc_space,
   5122					pa_group_list);
   5123			spin_lock(&pa->pa_lock);
   5124			ext4_get_group_no_and_offset(sb, pa->pa_pstart,
   5125						     NULL, &start);
   5126			spin_unlock(&pa->pa_lock);
   5127			mb_debug(sb, "PA:%u:%d:%d\n", i, start,
   5128				 pa->pa_len);
   5129		}
   5130		ext4_unlock_group(sb, i);
   5131		mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free,
   5132			 grp->bb_fragments);
   5133	}
   5134}
   5135
   5136static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
   5137{
   5138	struct super_block *sb = ac->ac_sb;
   5139
   5140	if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
   5141		return;
   5142
   5143	mb_debug(sb, "Can't allocate:"
   5144			" Allocation context details:");
   5145	mb_debug(sb, "status %u flags 0x%x",
   5146			ac->ac_status, ac->ac_flags);
   5147	mb_debug(sb, "orig %lu/%lu/%lu@%lu, "
   5148			"goal %lu/%lu/%lu@%lu, "
   5149			"best %lu/%lu/%lu@%lu cr %d",
   5150			(unsigned long)ac->ac_o_ex.fe_group,
   5151			(unsigned long)ac->ac_o_ex.fe_start,
   5152			(unsigned long)ac->ac_o_ex.fe_len,
   5153			(unsigned long)ac->ac_o_ex.fe_logical,
   5154			(unsigned long)ac->ac_g_ex.fe_group,
   5155			(unsigned long)ac->ac_g_ex.fe_start,
   5156			(unsigned long)ac->ac_g_ex.fe_len,
   5157			(unsigned long)ac->ac_g_ex.fe_logical,
   5158			(unsigned long)ac->ac_b_ex.fe_group,
   5159			(unsigned long)ac->ac_b_ex.fe_start,
   5160			(unsigned long)ac->ac_b_ex.fe_len,
   5161			(unsigned long)ac->ac_b_ex.fe_logical,
   5162			(int)ac->ac_criteria);
   5163	mb_debug(sb, "%u found", ac->ac_found);
   5164	ext4_mb_show_pa(sb);
   5165}
   5166#else
   5167static inline void ext4_mb_show_pa(struct super_block *sb)
   5168{
   5169	return;
   5170}
   5171static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
   5172{
   5173	ext4_mb_show_pa(ac->ac_sb);
   5174	return;
   5175}
   5176#endif
   5177
   5178/*
    5179 * We use locality group preallocation for small files. The size of the
    5180 * file is determined by the current size or the resulting size after
    5181 * allocation, whichever is larger.
   5182 *
   5183 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
   5184 */
   5185static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
   5186{
   5187	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
   5188	int bsbits = ac->ac_sb->s_blocksize_bits;
   5189	loff_t size, isize;
   5190
   5191	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
   5192		return;
   5193
   5194	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
   5195		return;
   5196
   5197	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
   5198	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
   5199		>> bsbits;
   5200
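        	/* nobody has the file open for write, the fs is not busy and this
        	 * request does not extend the file: skip preallocation entirely */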
   5201	if ((size == isize) && !ext4_fs_is_busy(sbi) &&
   5202	    !inode_is_open_for_write(ac->ac_inode)) {
   5203		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
   5204		return;
   5205	}
   5206
   5207	if (sbi->s_mb_group_prealloc <= 0) {
   5208		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
   5209		return;
   5210	}
   5211
   5212	/* don't use group allocation for large files */
   5213	size = max(size, isize);
   5214	if (size > sbi->s_mb_stream_request) {
   5215		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
   5216		return;
   5217	}
   5218
   5219	BUG_ON(ac->ac_lg != NULL);
   5220	/*
    5221	 * locality group prealloc space is per cpu. The reason for having
    5222	 * per-cpu locality groups is to reduce the contention between block
    5223	 * requests from multiple CPUs.
   5224	 */
   5225	ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
   5226
   5227	/* we're going to use group allocation */
   5228	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
   5229
   5230	/* serialize all allocations in the group */
   5231	mutex_lock(&ac->ac_lg->lg_mutex);
   5232}
   5233
   5234static noinline_for_stack int
   5235ext4_mb_initialize_context(struct ext4_allocation_context *ac,
   5236				struct ext4_allocation_request *ar)
   5237{
   5238	struct super_block *sb = ar->inode->i_sb;
   5239	struct ext4_sb_info *sbi = EXT4_SB(sb);
   5240	struct ext4_super_block *es = sbi->s_es;
   5241	ext4_group_t group;
   5242	unsigned int len;
   5243	ext4_fsblk_t goal;
   5244	ext4_grpblk_t block;
   5245
   5246	/* we can't allocate > group size */
   5247	len = ar->len;
   5248
   5249	/* just a dirty hack to filter too big requests  */
   5250	if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
   5251		len = EXT4_CLUSTERS_PER_GROUP(sb);
   5252
   5253	/* start searching from the goal */
   5254	goal = ar->goal;
   5255	if (goal < le32_to_cpu(es->s_first_data_block) ||
   5256			goal >= ext4_blocks_count(es))
   5257		goal = le32_to_cpu(es->s_first_data_block);
   5258	ext4_get_group_no_and_offset(sb, goal, &group, &block);
   5259
   5260	/* set up allocation goals */
   5261	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
   5262	ac->ac_status = AC_STATUS_CONTINUE;
   5263	ac->ac_sb = sb;
   5264	ac->ac_inode = ar->inode;
   5265	ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
   5266	ac->ac_o_ex.fe_group = group;
   5267	ac->ac_o_ex.fe_start = block;
   5268	ac->ac_o_ex.fe_len = len;
   5269	ac->ac_g_ex = ac->ac_o_ex;
   5270	ac->ac_flags = ar->flags;
   5271
   5272	/* we have to define context: we'll work with a file or
   5273	 * locality group. this is a policy, actually */
   5274	ext4_mb_group_or_file(ac);
   5275
   5276	mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, "
   5277			"left: %u/%u, right %u/%u to %swritable\n",
   5278			(unsigned) ar->len, (unsigned) ar->logical,
   5279			(unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
   5280			(unsigned) ar->lleft, (unsigned) ar->pleft,
   5281			(unsigned) ar->lright, (unsigned) ar->pright,
   5282			inode_is_open_for_write(ar->inode) ? "" : "non-");
   5283	return 0;
   5284
   5285}
   5286
   5287static noinline_for_stack void
   5288ext4_mb_discard_lg_preallocations(struct super_block *sb,
   5289					struct ext4_locality_group *lg,
   5290					int order, int total_entries)
   5291{
   5292	ext4_group_t group = 0;
   5293	struct ext4_buddy e4b;
   5294	struct list_head discard_list;
   5295	struct ext4_prealloc_space *pa, *tmp;
   5296
   5297	mb_debug(sb, "discard locality group preallocation\n");
   5298
   5299	INIT_LIST_HEAD(&discard_list);
   5300
   5301	spin_lock(&lg->lg_prealloc_lock);
   5302	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
   5303				pa_inode_list,
   5304				lockdep_is_held(&lg->lg_prealloc_lock)) {
   5305		spin_lock(&pa->pa_lock);
   5306		if (atomic_read(&pa->pa_count)) {
   5307			/*
   5308			 * This is the pa that we just used
   5309			 * for block allocation. So don't
   5310			 * free that
   5311			 */
   5312			spin_unlock(&pa->pa_lock);
   5313			continue;
   5314		}
   5315		if (pa->pa_deleted) {
   5316			spin_unlock(&pa->pa_lock);
   5317			continue;
   5318		}
   5319		/* only lg prealloc space */
   5320		BUG_ON(pa->pa_type != MB_GROUP_PA);
   5321
   5322		/* seems this one can be freed ... */
   5323		ext4_mb_mark_pa_deleted(sb, pa);
   5324		spin_unlock(&pa->pa_lock);
   5325
   5326		list_del_rcu(&pa->pa_inode_list);
   5327		list_add(&pa->u.pa_tmp_list, &discard_list);
   5328
   5329		total_entries--;
   5330		if (total_entries <= 5) {
   5331			/*
   5332			 * we want to keep only 5 entries
   5333			 * allowing it to grow to 8. This
    5334			 * makes sure we don't call discard
   5335			 * soon for this list.
   5336			 */
   5337			break;
   5338		}
   5339	}
   5340	spin_unlock(&lg->lg_prealloc_lock);
   5341
   5342	list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
   5343		int err;
   5344
   5345		group = ext4_get_group_number(sb, pa->pa_pstart);
   5346		err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
   5347					     GFP_NOFS|__GFP_NOFAIL);
   5348		if (err) {
   5349			ext4_error_err(sb, -err, "Error %d loading buddy information for %u",
   5350				       err, group);
   5351			continue;
   5352		}
   5353		ext4_lock_group(sb, group);
   5354		list_del(&pa->pa_group_list);
   5355		ext4_mb_release_group_pa(&e4b, pa);
   5356		ext4_unlock_group(sb, group);
   5357
   5358		ext4_mb_unload_buddy(&e4b);
   5359		list_del(&pa->u.pa_tmp_list);
   5360		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
   5361	}
   5362}
   5363
   5364/*
   5365 * We have incremented pa_count. So it cannot be freed at this
   5366 * point. Also we hold lg_mutex. So no parallel allocation is
   5367 * possible from this lg. That means pa_free cannot be updated.
   5368 *
    5369 * A parallel ext4_mb_discard_group_preallocations is possible,
    5370 * which can cause the lg_prealloc_list to be updated.
   5371 */
   5372
   5373static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
   5374{
   5375	int order, added = 0, lg_prealloc_count = 1;
   5376	struct super_block *sb = ac->ac_sb;
   5377	struct ext4_locality_group *lg = ac->ac_lg;
   5378	struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
   5379
   5380	order = fls(pa->pa_free) - 1;
   5381	if (order > PREALLOC_TB_SIZE - 1)
   5382		/* The max size of hash table is PREALLOC_TB_SIZE */
   5383		order = PREALLOC_TB_SIZE - 1;
   5384	/* Add the prealloc space to lg */
   5385	spin_lock(&lg->lg_prealloc_lock);
   5386	list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
   5387				pa_inode_list,
   5388				lockdep_is_held(&lg->lg_prealloc_lock)) {
   5389		spin_lock(&tmp_pa->pa_lock);
   5390		if (tmp_pa->pa_deleted) {
   5391			spin_unlock(&tmp_pa->pa_lock);
   5392			continue;
   5393		}
   5394		if (!added && pa->pa_free < tmp_pa->pa_free) {
   5395			/* Add to the tail of the previous entry */
   5396			list_add_tail_rcu(&pa->pa_inode_list,
   5397						&tmp_pa->pa_inode_list);
   5398			added = 1;
   5399			/*
   5400			 * we want to count the total
   5401			 * number of entries in the list
   5402			 */
   5403		}
   5404		spin_unlock(&tmp_pa->pa_lock);
   5405		lg_prealloc_count++;
   5406	}
   5407	if (!added)
   5408		list_add_tail_rcu(&pa->pa_inode_list,
   5409					&lg->lg_prealloc_list[order]);
   5410	spin_unlock(&lg->lg_prealloc_lock);
   5411
   5412	/* Now trim the list to be not more than 8 elements */
   5413	if (lg_prealloc_count > 8) {
   5414		ext4_mb_discard_lg_preallocations(sb, lg,
   5415						  order, lg_prealloc_count);
   5416		return;
   5417	}
    5418	return;
   5419}
   5420
   5421/*
   5422 * if per-inode prealloc list is too long, trim some PA
   5423 */
   5424static void ext4_mb_trim_inode_pa(struct inode *inode)
   5425{
   5426	struct ext4_inode_info *ei = EXT4_I(inode);
   5427	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
   5428	int count, delta;
   5429
   5430	count = atomic_read(&ei->i_prealloc_active);
   5431	delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
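        	/* allow the list to overshoot s_mb_max_inode_prealloc by roughly 25%
        	 * before trimming, then discard the excess above the limit */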
   5432	if (count > sbi->s_mb_max_inode_prealloc + delta) {
   5433		count -= sbi->s_mb_max_inode_prealloc;
   5434		ext4_discard_preallocations(inode, count);
   5435	}
   5436}
   5437
   5438/*
    5439 * release all resources used in the allocation
   5440 */
   5441static int ext4_mb_release_context(struct ext4_allocation_context *ac)
   5442{
   5443	struct inode *inode = ac->ac_inode;
   5444	struct ext4_inode_info *ei = EXT4_I(inode);
   5445	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
   5446	struct ext4_prealloc_space *pa = ac->ac_pa;
   5447	if (pa) {
   5448		if (pa->pa_type == MB_GROUP_PA) {
   5449			/* see comment in ext4_mb_use_group_pa() */
   5450			spin_lock(&pa->pa_lock);
   5451			pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
   5452			pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
   5453			pa->pa_free -= ac->ac_b_ex.fe_len;
   5454			pa->pa_len -= ac->ac_b_ex.fe_len;
   5455			spin_unlock(&pa->pa_lock);
   5456
   5457			/*
   5458			 * We want to add the pa to the right bucket.
   5459			 * Remove it from the list and while adding
   5460			 * make sure the list to which we are adding
   5461			 * doesn't grow big.
   5462			 */
   5463			if (likely(pa->pa_free)) {
   5464				spin_lock(pa->pa_obj_lock);
   5465				list_del_rcu(&pa->pa_inode_list);
   5466				spin_unlock(pa->pa_obj_lock);
   5467				ext4_mb_add_n_trim(ac);
   5468			}
   5469		}
   5470
   5471		if (pa->pa_type == MB_INODE_PA) {
   5472			/*
    5473			 * treat the per-inode prealloc list as an LRU list, then try
   5474			 * to trim the least recently used PA.
   5475			 */
   5476			spin_lock(pa->pa_obj_lock);
   5477			list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
   5478			spin_unlock(pa->pa_obj_lock);
   5479		}
   5480
   5481		ext4_mb_put_pa(ac, ac->ac_sb, pa);
   5482	}
   5483	if (ac->ac_bitmap_page)
   5484		put_page(ac->ac_bitmap_page);
   5485	if (ac->ac_buddy_page)
   5486		put_page(ac->ac_buddy_page);
   5487	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
   5488		mutex_unlock(&ac->ac_lg->lg_mutex);
   5489	ext4_mb_collect_stats(ac);
   5490	ext4_mb_trim_inode_pa(inode);
   5491	return 0;
   5492}
   5493
   5494static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
   5495{
   5496	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
   5497	int ret;
   5498	int freed = 0, busy = 0;
   5499	int retry = 0;
   5500
   5501	trace_ext4_mb_discard_preallocations(sb, needed);
   5502
   5503	if (needed == 0)
   5504		needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
   5505 repeat:
   5506	for (i = 0; i < ngroups && needed > 0; i++) {
   5507		ret = ext4_mb_discard_group_preallocations(sb, i, &busy);
   5508		freed += ret;
   5509		needed -= ret;
   5510		cond_resched();
   5511	}
   5512
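        	/* some PAs were skipped because they were still in use; give their
        	 * owners a chance to drop them and rescan, at most twice more */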
   5513	if (needed > 0 && busy && ++retry < 3) {
   5514		busy = 0;
   5515		goto repeat;
   5516	}
   5517
   5518	return freed;
   5519}
   5520
   5521static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
   5522			struct ext4_allocation_context *ac, u64 *seq)
   5523{
   5524	int freed;
   5525	u64 seq_retry = 0;
   5526	bool ret = false;
   5527
   5528	freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
   5529	if (freed) {
   5530		ret = true;
   5531		goto out_dbg;
   5532	}
   5533	seq_retry = ext4_get_discard_pa_seq_sum();
   5534	if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) {
   5535		ac->ac_flags |= EXT4_MB_STRICT_CHECK;
   5536		*seq = seq_retry;
   5537		ret = true;
   5538	}
   5539
   5540out_dbg:
   5541	mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
   5542	return ret;
   5543}
   5544
   5545static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
   5546				struct ext4_allocation_request *ar, int *errp);
   5547
   5548/*
    5549 * Main entry point into mballoc to allocate blocks.
    5550 * It tries to use preallocation first, then falls back
    5551 * to the usual allocation.
   5552 */
   5553ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
   5554				struct ext4_allocation_request *ar, int *errp)
   5555{
   5556	struct ext4_allocation_context *ac = NULL;
   5557	struct ext4_sb_info *sbi;
   5558	struct super_block *sb;
   5559	ext4_fsblk_t block = 0;
   5560	unsigned int inquota = 0;
   5561	unsigned int reserv_clstrs = 0;
   5562	u64 seq;
   5563
   5564	might_sleep();
   5565	sb = ar->inode->i_sb;
   5566	sbi = EXT4_SB(sb);
   5567
   5568	trace_ext4_request_blocks(ar);
   5569	if (sbi->s_mount_state & EXT4_FC_REPLAY)
   5570		return ext4_mb_new_blocks_simple(handle, ar, errp);
   5571
   5572	/* Allow to use superuser reservation for quota file */
   5573	if (ext4_is_quota_file(ar->inode))
   5574		ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
   5575
   5576	if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
   5577		/* Without delayed allocation we need to verify
    5578		 * there are enough free blocks to do block allocation
   5579		 * and verify allocation doesn't exceed the quota limits.
   5580		 */
   5581		while (ar->len &&
   5582			ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
   5583
    5584			/* let others free the space */
   5585			cond_resched();
   5586			ar->len = ar->len >> 1;
   5587		}
   5588		if (!ar->len) {
   5589			ext4_mb_show_pa(sb);
   5590			*errp = -ENOSPC;
   5591			return 0;
   5592		}
   5593		reserv_clstrs = ar->len;
   5594		if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
   5595			dquot_alloc_block_nofail(ar->inode,
   5596						 EXT4_C2B(sbi, ar->len));
   5597		} else {
   5598			while (ar->len &&
   5599				dquot_alloc_block(ar->inode,
   5600						  EXT4_C2B(sbi, ar->len))) {
   5601
   5602				ar->flags |= EXT4_MB_HINT_NOPREALLOC;
   5603				ar->len--;
   5604			}
   5605		}
   5606		inquota = ar->len;
   5607		if (ar->len == 0) {
   5608			*errp = -EDQUOT;
   5609			goto out;
   5610		}
   5611	}
   5612
   5613	ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
   5614	if (!ac) {
   5615		ar->len = 0;
   5616		*errp = -ENOMEM;
   5617		goto out;
   5618	}
   5619
   5620	*errp = ext4_mb_initialize_context(ac, ar);
   5621	if (*errp) {
   5622		ar->len = 0;
   5623		goto out;
   5624	}
   5625
   5626	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
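        	/* snapshot the discard sequence so that, on failure,
        	 * ext4_mb_discard_preallocations_should_retry() can tell whether
        	 * preallocations were discarded concurrently and retry */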
   5627	seq = this_cpu_read(discard_pa_seq);
   5628	if (!ext4_mb_use_preallocated(ac)) {
   5629		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
   5630		ext4_mb_normalize_request(ac, ar);
   5631
   5632		*errp = ext4_mb_pa_alloc(ac);
   5633		if (*errp)
   5634			goto errout;
   5635repeat:
   5636		/* allocate space in core */
   5637		*errp = ext4_mb_regular_allocator(ac);
   5638		/*
   5639		 * pa allocated above is added to grp->bb_prealloc_list only
   5640		 * when we were able to allocate some block i.e. when
   5641		 * ac->ac_status == AC_STATUS_FOUND.
    5642		 * An error from above means ac->ac_status != AC_STATUS_FOUND,
    5643		 * so we have to free this pa here itself.
   5644		 */
   5645		if (*errp) {
   5646			ext4_mb_pa_free(ac);
   5647			ext4_discard_allocated_blocks(ac);
   5648			goto errout;
   5649		}
   5650		if (ac->ac_status == AC_STATUS_FOUND &&
   5651			ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
   5652			ext4_mb_pa_free(ac);
   5653	}
   5654	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
   5655		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
   5656		if (*errp) {
   5657			ext4_discard_allocated_blocks(ac);
   5658			goto errout;
   5659		} else {
   5660			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
   5661			ar->len = ac->ac_b_ex.fe_len;
   5662		}
   5663	} else {
   5664		if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
   5665			goto repeat;
   5666		/*
   5667		 * If block allocation fails then the pa allocated above
   5668		 * needs to be freed here itself.
   5669		 */
   5670		ext4_mb_pa_free(ac);
   5671		*errp = -ENOSPC;
   5672	}
   5673
   5674errout:
   5675	if (*errp) {
   5676		ac->ac_b_ex.fe_len = 0;
   5677		ar->len = 0;
   5678		ext4_mb_show_ac(ac);
   5679	}
   5680	ext4_mb_release_context(ac);
   5681out:
   5682	if (ac)
   5683		kmem_cache_free(ext4_ac_cachep, ac);
   5684	if (inquota && ar->len < inquota)
   5685		dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
   5686	if (!ar->len) {
   5687		if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
   5688			/* release all the reserved blocks if non delalloc */
   5689			percpu_counter_sub(&sbi->s_dirtyclusters_counter,
   5690						reserv_clstrs);
   5691	}
   5692
   5693	trace_ext4_allocate_blocks(ar, (unsigned long long)block);
   5694
   5695	return block;
   5696}
   5697
   5698/*
   5699 * We can merge two free data extents only if the physical blocks
   5700 * are contiguous, AND the extents were freed by the same transaction,
   5701 * AND the blocks are associated with the same group.
   5702 */
   5703static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
   5704					struct ext4_free_data *entry,
   5705					struct ext4_free_data *new_entry,
   5706					struct rb_root *entry_rb_root)
   5707{
   5708	if ((entry->efd_tid != new_entry->efd_tid) ||
   5709	    (entry->efd_group != new_entry->efd_group))
   5710		return;
   5711	if (entry->efd_start_cluster + entry->efd_count ==
   5712	    new_entry->efd_start_cluster) {
   5713		new_entry->efd_start_cluster = entry->efd_start_cluster;
   5714		new_entry->efd_count += entry->efd_count;
   5715	} else if (new_entry->efd_start_cluster + new_entry->efd_count ==
   5716		   entry->efd_start_cluster) {
   5717		new_entry->efd_count += entry->efd_count;
   5718	} else
   5719		return;
   5720	spin_lock(&sbi->s_md_lock);
   5721	list_del(&entry->efd_list);
   5722	spin_unlock(&sbi->s_md_lock);
   5723	rb_erase(&entry->efd_node, entry_rb_root);
   5724	kmem_cache_free(ext4_free_data_cachep, entry);
   5725}
   5726
   5727static noinline_for_stack int
   5728ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
   5729		      struct ext4_free_data *new_entry)
   5730{
   5731	ext4_group_t group = e4b->bd_group;
   5732	ext4_grpblk_t cluster;
   5733	ext4_grpblk_t clusters = new_entry->efd_count;
   5734	struct ext4_free_data *entry;
   5735	struct ext4_group_info *db = e4b->bd_info;
   5736	struct super_block *sb = e4b->bd_sb;
   5737	struct ext4_sb_info *sbi = EXT4_SB(sb);
   5738	struct rb_node **n = &db->bb_free_root.rb_node, *node;
   5739	struct rb_node *parent = NULL, *new_node;
   5740
   5741	BUG_ON(!ext4_handle_valid(handle));
   5742	BUG_ON(e4b->bd_bitmap_page == NULL);
   5743	BUG_ON(e4b->bd_buddy_page == NULL);
   5744
   5745	new_node = &new_entry->efd_node;
   5746	cluster = new_entry->efd_start_cluster;
   5747
   5748	if (!*n) {
    5749		/* first free block extent. We need to
    5750		 * protect the buddy cache from being freed,
   5751		 * otherwise we'll refresh it from
   5752		 * on-disk bitmap and lose not-yet-available
   5753		 * blocks */
   5754		get_page(e4b->bd_buddy_page);
   5755		get_page(e4b->bd_bitmap_page);
   5756	}
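        	/* plain rbtree insertion keyed by the starting cluster; an overlap
        	 * with an existing entry means a double free and is reported below */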
   5757	while (*n) {
   5758		parent = *n;
   5759		entry = rb_entry(parent, struct ext4_free_data, efd_node);
   5760		if (cluster < entry->efd_start_cluster)
   5761			n = &(*n)->rb_left;
   5762		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
   5763			n = &(*n)->rb_right;
   5764		else {
   5765			ext4_grp_locked_error(sb, group, 0,
   5766				ext4_group_first_block_no(sb, group) +
   5767				EXT4_C2B(sbi, cluster),
   5768				"Block already on to-be-freed list");
   5769			kmem_cache_free(ext4_free_data_cachep, new_entry);
   5770			return 0;
   5771		}
   5772	}
   5773
   5774	rb_link_node(new_node, parent, n);
   5775	rb_insert_color(new_node, &db->bb_free_root);
   5776
    5777	/* Now try to see if the extent can be merged to the left and right */
   5778	node = rb_prev(new_node);
   5779	if (node) {
   5780		entry = rb_entry(node, struct ext4_free_data, efd_node);
   5781		ext4_try_merge_freed_extent(sbi, entry, new_entry,
   5782					    &(db->bb_free_root));
   5783	}
   5784
   5785	node = rb_next(new_node);
   5786	if (node) {
   5787		entry = rb_entry(node, struct ext4_free_data, efd_node);
   5788		ext4_try_merge_freed_extent(sbi, entry, new_entry,
   5789					    &(db->bb_free_root));
   5790	}
   5791
   5792	spin_lock(&sbi->s_md_lock);
   5793	list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
   5794	sbi->s_mb_free_pending += clusters;
   5795	spin_unlock(&sbi->s_md_lock);
   5796	return 0;
   5797}
   5798
   5799/*
   5800 * Simple allocator for Ext4 fast commit replay path. It searches for blocks
   5801 * linearly starting at the goal block and also excludes the blocks which
   5802 * are going to be in use after fast commit replay.
   5803 */
   5804static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
   5805				struct ext4_allocation_request *ar, int *errp)
   5806{
   5807	struct buffer_head *bitmap_bh;
   5808	struct super_block *sb = ar->inode->i_sb;
   5809	ext4_group_t group;
   5810	ext4_grpblk_t blkoff;
   5811	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
   5812	ext4_grpblk_t i = 0;
   5813	ext4_fsblk_t goal, block;
   5814	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
   5815
   5816	goal = ar->goal;
   5817	if (goal < le32_to_cpu(es->s_first_data_block) ||
   5818			goal >= ext4_blocks_count(es))
   5819		goal = le32_to_cpu(es->s_first_data_block);
   5820
   5821	ar->len = 0;
   5822	ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
   5823	for (; group < ext4_get_groups_count(sb); group++) {
   5824		bitmap_bh = ext4_read_block_bitmap(sb, group);
   5825		if (IS_ERR(bitmap_bh)) {
   5826			*errp = PTR_ERR(bitmap_bh);
   5827			pr_warn("Failed to read block bitmap\n");
   5828			return 0;
   5829		}
   5830
   5831		ext4_get_group_no_and_offset(sb,
   5832			max(ext4_group_first_block_no(sb, group), goal),
   5833			NULL, &blkoff);
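        		/* find the first free bit that is not excluded by fast commit
        		 * replay */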
   5834		while (1) {
   5835			i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
   5836						blkoff);
   5837			if (i >= max)
   5838				break;
   5839			if (ext4_fc_replay_check_excluded(sb,
   5840				ext4_group_first_block_no(sb, group) + i)) {
   5841				blkoff = i + 1;
   5842			} else
   5843				break;
   5844		}
   5845		brelse(bitmap_bh);
   5846		if (i < max)
   5847			break;
   5848	}
   5849
   5850	if (group >= ext4_get_groups_count(sb) || i >= max) {
   5851		*errp = -ENOSPC;
   5852		return 0;
   5853	}
   5854
   5855	block = ext4_group_first_block_no(sb, group) + i;
   5856	ext4_mb_mark_bb(sb, block, 1, 1);
   5857	ar->len = 1;
   5858
   5859	return block;
   5860}
   5861
   5862static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
   5863					unsigned long count)
   5864{
   5865	struct buffer_head *bitmap_bh;
   5866	struct super_block *sb = inode->i_sb;
   5867	struct ext4_group_desc *gdp;
   5868	struct buffer_head *gdp_bh;
   5869	ext4_group_t group;
   5870	ext4_grpblk_t blkoff;
   5871	int already_freed = 0, err, i;
   5872
   5873	ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
   5874	bitmap_bh = ext4_read_block_bitmap(sb, group);
   5875	if (IS_ERR(bitmap_bh)) {
   5876		err = PTR_ERR(bitmap_bh);
   5877		pr_warn("Failed to read block bitmap\n");
   5878		return;
   5879	}
   5880	gdp = ext4_get_group_desc(sb, group, &gdp_bh);
   5881	if (!gdp)
   5882		return;
   5883
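        	/* count bits that are already clear so the group's free cluster
        	 * count below is only bumped for blocks that were actually in use */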
   5884	for (i = 0; i < count; i++) {
   5885		if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
   5886			already_freed++;
   5887	}
   5888	mb_clear_bits(bitmap_bh->b_data, blkoff, count);
   5889	err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
   5890	if (err)
   5891		return;
   5892	ext4_free_group_clusters_set(
   5893		sb, gdp, ext4_free_group_clusters(sb, gdp) +
   5894		count - already_freed);
   5895	ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
   5896	ext4_group_desc_csum_set(sb, group, gdp);
   5897	ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
   5898	sync_dirty_buffer(bitmap_bh);
   5899	sync_dirty_buffer(gdp_bh);
   5900	brelse(bitmap_bh);
   5901}
   5902
   5903/**
   5904 * ext4_mb_clear_bb() -- helper function for freeing blocks.
   5905 *			Used by ext4_free_blocks()
   5906 * @handle:		handle for this transaction
   5907 * @inode:		inode
   5908 * @block:		starting physical block to be freed
   5909 * @count:		number of blocks to be freed
   5910 * @flags:		flags used by ext4_free_blocks
   5911 */
   5912static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
   5913			       ext4_fsblk_t block, unsigned long count,
   5914			       int flags)
   5915{
   5916	struct buffer_head *bitmap_bh = NULL;
   5917	struct super_block *sb = inode->i_sb;
   5918	struct ext4_group_desc *gdp;
   5919	unsigned int overflow;
   5920	ext4_grpblk_t bit;
   5921	struct buffer_head *gd_bh;
   5922	ext4_group_t block_group;
   5923	struct ext4_sb_info *sbi;
   5924	struct ext4_buddy e4b;
   5925	unsigned int count_clusters;
   5926	int err = 0;
   5927	int ret;
   5928
   5929	sbi = EXT4_SB(sb);
   5930
   5931do_more:
   5932	overflow = 0;
   5933	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
   5934
   5935	if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
   5936			ext4_get_group_info(sb, block_group))))
   5937		return;
   5938
   5939	/*
   5940	 * Check to see if we are freeing blocks across a group
   5941	 * boundary.
   5942	 */
   5943	if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
   5944		overflow = EXT4_C2B(sbi, bit) + count -
   5945			EXT4_BLOCKS_PER_GROUP(sb);
   5946		count -= overflow;
   5947	}
   5948	count_clusters = EXT4_NUM_B2C(sbi, count);
   5949	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
   5950	if (IS_ERR(bitmap_bh)) {
   5951		err = PTR_ERR(bitmap_bh);
   5952		bitmap_bh = NULL;
   5953		goto error_return;
   5954	}
   5955	gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
   5956	if (!gdp) {
   5957		err = -EIO;
   5958		goto error_return;
   5959	}
   5960
   5961	if (!ext4_inode_block_valid(inode, block, count)) {
   5962		ext4_error(sb, "Freeing blocks in system zone - "
   5963			   "Block = %llu, count = %lu", block, count);
   5964		/* err = 0. ext4_std_error should be a no op */
   5965		goto error_return;
   5966	}
   5967
   5968	BUFFER_TRACE(bitmap_bh, "getting write access");
   5969	err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
   5970					    EXT4_JTR_NONE);
   5971	if (err)
   5972		goto error_return;
   5973
   5974	/*
   5975	 * We are about to modify some metadata.  Call the journal APIs
   5976	 * to unshare ->b_data if a currently-committing transaction is
   5977	 * using it
   5978	 */
   5979	BUFFER_TRACE(gd_bh, "get_write_access");
   5980	err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
   5981	if (err)
   5982		goto error_return;
   5983#ifdef AGGRESSIVE_CHECK
   5984	{
   5985		int i;
   5986		for (i = 0; i < count_clusters; i++)
   5987			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
   5988	}
   5989#endif
   5990	trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
   5991
   5992	/* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
   5993	err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
   5994				     GFP_NOFS|__GFP_NOFAIL);
   5995	if (err)
   5996		goto error_return;
   5997
   5998	/*
   5999	 * We need to make sure we don't reuse the freed block until after the
   6000	 * transaction is committed. We make an exception if the inode is to be
   6001	 * written in writeback mode since writeback mode has weak data
   6002	 * consistency guarantees.
   6003	 */
   6004	if (ext4_handle_valid(handle) &&
   6005	    ((flags & EXT4_FREE_BLOCKS_METADATA) ||
   6006	     !ext4_should_writeback_data(inode))) {
   6007		struct ext4_free_data *new_entry;
   6008		/*
   6009		 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
   6010		 * to fail.
   6011		 */
   6012		new_entry = kmem_cache_alloc(ext4_free_data_cachep,
   6013				GFP_NOFS|__GFP_NOFAIL);
   6014		new_entry->efd_start_cluster = bit;
   6015		new_entry->efd_group = block_group;
   6016		new_entry->efd_count = count_clusters;
   6017		new_entry->efd_tid = handle->h_transaction->t_tid;
   6018
   6019		ext4_lock_group(sb, block_group);
   6020		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
   6021		ext4_mb_free_metadata(handle, &e4b, new_entry);
   6022	} else {
   6023		/* need to update group_info->bb_free and bitmap
    6024		 * with the group lock held. generate_buddy looks at
    6025		 * them with the group lock held
   6026		 */
   6027		if (test_opt(sb, DISCARD)) {
   6028			err = ext4_issue_discard(sb, block_group, bit, count,
   6029						 NULL);
   6030			if (err && err != -EOPNOTSUPP)
   6031				ext4_msg(sb, KERN_WARNING, "discard request in"
   6032					 " group:%u block:%d count:%lu failed"
   6033					 " with %d", block_group, bit, count,
   6034					 err);
   6035		} else
   6036			EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
   6037
   6038		ext4_lock_group(sb, block_group);
   6039		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
   6040		mb_free_blocks(inode, &e4b, bit, count_clusters);
   6041	}
   6042
   6043	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
   6044	ext4_free_group_clusters_set(sb, gdp, ret);
   6045	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
   6046	ext4_group_desc_csum_set(sb, block_group, gdp);
   6047	ext4_unlock_group(sb, block_group);
   6048
   6049	if (sbi->s_log_groups_per_flex) {
   6050		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
   6051		atomic64_add(count_clusters,
   6052			     &sbi_array_rcu_deref(sbi, s_flex_groups,
   6053						  flex_group)->free_clusters);
   6054	}
   6055
   6056	/*
   6057	 * on a bigalloc file system, defer the s_freeclusters_counter
   6058	 * update to the caller (ext4_remove_space and friends) so they
   6059	 * can determine if a cluster freed here should be rereserved
   6060	 */
   6061	if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) {
   6062		if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
   6063			dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
   6064		percpu_counter_add(&sbi->s_freeclusters_counter,
   6065				   count_clusters);
   6066	}
   6067
   6068	ext4_mb_unload_buddy(&e4b);
   6069
   6070	/* We dirtied the bitmap block */
   6071	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
   6072	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
   6073
   6074	/* And the group descriptor block */
   6075	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
   6076	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
   6077	if (!err)
   6078		err = ret;
   6079
   6080	if (overflow && !err) {
   6081		block += count;
   6082		count = overflow;
   6083		put_bh(bitmap_bh);
   6084		goto do_more;
   6085	}
   6086error_return:
   6087	brelse(bitmap_bh);
   6088	ext4_std_error(sb, err);
   6089	return;
   6090}
   6091
   6092/**
   6093 * ext4_free_blocks() -- Free given blocks and update quota
   6094 * @handle:		handle for this transaction
   6095 * @inode:		inode
   6096 * @bh:			optional buffer of the block to be freed
   6097 * @block:		starting physical block to be freed
   6098 * @count:		number of blocks to be freed
   6099 * @flags:		flags used by ext4_free_blocks
   6100 */
   6101void ext4_free_blocks(handle_t *handle, struct inode *inode,
   6102		      struct buffer_head *bh, ext4_fsblk_t block,
   6103		      unsigned long count, int flags)
   6104{
   6105	struct super_block *sb = inode->i_sb;
   6106	unsigned int overflow;
   6107	struct ext4_sb_info *sbi;
   6108
   6109	sbi = EXT4_SB(sb);
   6110
   6111	if (sbi->s_mount_state & EXT4_FC_REPLAY) {
   6112		ext4_free_blocks_simple(inode, block, count);
   6113		return;
   6114	}
   6115
   6116	might_sleep();
   6117	if (bh) {
   6118		if (block)
   6119			BUG_ON(block != bh->b_blocknr);
   6120		else
   6121			block = bh->b_blocknr;
   6122	}
   6123
   6124	if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
   6125	    !ext4_inode_block_valid(inode, block, count)) {
   6126		ext4_error(sb, "Freeing blocks not in datazone - "
   6127			   "block = %llu, count = %lu", block, count);
   6128		return;
   6129	}
   6130
   6131	ext4_debug("freeing block %llu\n", block);
   6132	trace_ext4_free_blocks(inode, block, count, flags);
   6133
   6134	if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
   6135		BUG_ON(count > 1);
   6136
   6137		ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
   6138			    inode, bh, block);
   6139	}
   6140
   6141	/*
   6142	 * If the extent to be freed does not begin on a cluster
   6143	 * boundary, we need to deal with partial clusters at the
   6144	 * beginning and end of the extent.  Normally we will free
   6145	 * blocks at the beginning or the end unless we are explicitly
   6146	 * requested to avoid doing so.
   6147	 */
   6148	overflow = EXT4_PBLK_COFF(sbi, block);
   6149	if (overflow) {
   6150		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
   6151			overflow = sbi->s_cluster_ratio - overflow;
   6152			block += overflow;
   6153			if (count > overflow)
   6154				count -= overflow;
   6155			else
   6156				return;
   6157		} else {
   6158			block -= overflow;
   6159			count += overflow;
   6160		}
   6161	}
   6162	overflow = EXT4_LBLK_COFF(sbi, count);
   6163	if (overflow) {
   6164		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
   6165			if (count > overflow)
   6166				count -= overflow;
   6167			else
   6168				return;
   6169		} else
   6170			count += sbi->s_cluster_ratio - overflow;
   6171	}
   6172
   6173	if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
   6174		int i;
   6175		int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
   6176
   6177		for (i = 0; i < count; i++) {
   6178			cond_resched();
   6179			if (is_metadata)
   6180				bh = sb_find_get_block(inode->i_sb, block + i);
   6181			ext4_forget(handle, is_metadata, inode, bh, block + i);
   6182		}
   6183	}
   6184
   6185	ext4_mb_clear_bb(handle, inode, block, count, flags);
   6186	return;
   6187}
   6188
   6189/**
   6190 * ext4_group_add_blocks() -- Add given blocks to an existing group
   6191 * @handle:			handle to this transaction
   6192 * @sb:				super block
   6193 * @block:			start physical block to add to the block group
   6194 * @count:			number of blocks to free
   6195 *
   6196 * This marks the blocks as free in the bitmap and buddy.
   6197 */
   6198int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
   6199			 ext4_fsblk_t block, unsigned long count)
   6200{
   6201	struct buffer_head *bitmap_bh = NULL;
   6202	struct buffer_head *gd_bh;
   6203	ext4_group_t block_group;
   6204	ext4_grpblk_t bit;
   6205	unsigned int i;
   6206	struct ext4_group_desc *desc;
   6207	struct ext4_sb_info *sbi = EXT4_SB(sb);
   6208	struct ext4_buddy e4b;
   6209	int err = 0, ret, free_clusters_count;
   6210	ext4_grpblk_t clusters_freed;
   6211	ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block);
   6212	ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1);
   6213	unsigned long cluster_count = last_cluster - first_cluster + 1;
   6214
   6215	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
   6216
   6217	if (count == 0)
   6218		return 0;
   6219
   6220	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
   6221	/*
   6222	 * Check to see if we are freeing blocks across a group
   6223	 * boundary.
   6224	 */
   6225	if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) {
   6226		ext4_warning(sb, "too many blocks added to group %u",
   6227			     block_group);
   6228		err = -EINVAL;
   6229		goto error_return;
   6230	}
   6231
   6232	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
   6233	if (IS_ERR(bitmap_bh)) {
   6234		err = PTR_ERR(bitmap_bh);
   6235		bitmap_bh = NULL;
   6236		goto error_return;
   6237	}
   6238
   6239	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
   6240	if (!desc) {
   6241		err = -EIO;
   6242		goto error_return;
   6243	}
   6244
   6245	if (!ext4_sb_block_valid(sb, NULL, block, count)) {
   6246		ext4_error(sb, "Adding blocks in system zones - "
   6247			   "Block = %llu, count = %lu",
   6248			   block, count);
   6249		err = -EINVAL;
   6250		goto error_return;
   6251	}
   6252
   6253	BUFFER_TRACE(bitmap_bh, "getting write access");
   6254	err = ext4_journal_get_write_access(handle, sb, bitmap_bh,
   6255					    EXT4_JTR_NONE);
   6256	if (err)
   6257		goto error_return;
   6258
   6259	/*
   6260	 * We are about to modify some metadata.  Call the journal APIs
   6261	 * to unshare ->b_data if a currently-committing transaction is
   6262	 * using it
   6263	 */
   6264	BUFFER_TRACE(gd_bh, "get_write_access");
   6265	err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE);
   6266	if (err)
   6267		goto error_return;
   6268
   6269	for (i = 0, clusters_freed = 0; i < cluster_count; i++) {
   6270		BUFFER_TRACE(bitmap_bh, "clear bit");
   6271		if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
   6272			ext4_error(sb, "bit already cleared for block %llu",
   6273				   (ext4_fsblk_t)(block + i));
   6274			BUFFER_TRACE(bitmap_bh, "bit already cleared");
   6275		} else {
   6276			clusters_freed++;
   6277		}
   6278	}
   6279
   6280	err = ext4_mb_load_buddy(sb, block_group, &e4b);
   6281	if (err)
   6282		goto error_return;
   6283
   6284	/*
   6285	 * need to update group_info->bb_free and bitmap
    6286	 * with the group lock held. generate_buddy looks at
    6287	 * them with the group lock held
   6288	 */
   6289	ext4_lock_group(sb, block_group);
   6290	mb_clear_bits(bitmap_bh->b_data, bit, cluster_count);
   6291	mb_free_blocks(NULL, &e4b, bit, cluster_count);
   6292	free_clusters_count = clusters_freed +
   6293		ext4_free_group_clusters(sb, desc);
   6294	ext4_free_group_clusters_set(sb, desc, free_clusters_count);
   6295	ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
   6296	ext4_group_desc_csum_set(sb, block_group, desc);
   6297	ext4_unlock_group(sb, block_group);
   6298	percpu_counter_add(&sbi->s_freeclusters_counter,
   6299			   clusters_freed);
   6300
   6301	if (sbi->s_log_groups_per_flex) {
   6302		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
   6303		atomic64_add(clusters_freed,
   6304			     &sbi_array_rcu_deref(sbi, s_flex_groups,
   6305						  flex_group)->free_clusters);
   6306	}
   6307
   6308	ext4_mb_unload_buddy(&e4b);
   6309
   6310	/* We dirtied the bitmap block */
   6311	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
   6312	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
   6313
   6314	/* And the group descriptor block */
   6315	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
   6316	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
   6317	if (!err)
   6318		err = ret;
   6319
   6320error_return:
   6321	brelse(bitmap_bh);
   6322	ext4_std_error(sb, err);
   6323	return err;
   6324}
   6325
   6326/**
   6327 * ext4_trim_extent -- function to TRIM one single free extent in the group
   6328 * @sb:		super block for the file system
   6329 * @start:	starting block of the free extent in the alloc. group
   6330 * @count:	number of blocks to TRIM
   6331 * @e4b:	ext4 buddy for the group
   6332 *
    6333 * Trim "count" blocks starting at "start" in the "group". To ensure that no
    6334 * one will allocate those blocks, mark them as used in the buddy bitmap. This
    6335 * must be called under the group lock.
   6336 */
   6337static int ext4_trim_extent(struct super_block *sb,
   6338		int start, int count, struct ext4_buddy *e4b)
   6339__releases(bitlock)
   6340__acquires(bitlock)
   6341{
   6342	struct ext4_free_extent ex;
   6343	ext4_group_t group = e4b->bd_group;
   6344	int ret = 0;
   6345
   6346	trace_ext4_trim_extent(sb, group, start, count);
   6347
   6348	assert_spin_locked(ext4_group_lock_ptr(sb, group));
   6349
   6350	ex.fe_start = start;
   6351	ex.fe_group = group;
   6352	ex.fe_len = count;
   6353
   6354	/*
   6355	 * Mark blocks used, so no one can reuse them while
   6356	 * being trimmed.
   6357	 */
   6358	mb_mark_used(e4b, &ex);
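        	/* drop the group lock while issuing the (possibly slow) discard */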
   6359	ext4_unlock_group(sb, group);
   6360	ret = ext4_issue_discard(sb, group, start, count, NULL);
   6361	ext4_lock_group(sb, group);
   6362	mb_free_blocks(NULL, e4b, start, ex.fe_len);
   6363	return ret;
   6364}
   6365
   6366static int ext4_try_to_trim_range(struct super_block *sb,
   6367		struct ext4_buddy *e4b, ext4_grpblk_t start,
   6368		ext4_grpblk_t max, ext4_grpblk_t minblocks)
   6369__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
   6370__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
   6371{
   6372	ext4_grpblk_t next, count, free_count;
   6373	void *bitmap;
   6374
   6375	bitmap = e4b->bd_bitmap;
   6376	start = (e4b->bd_info->bb_first_free > start) ?
   6377		e4b->bd_info->bb_first_free : start;
   6378	count = 0;
   6379	free_count = 0;
   6380
   6381	while (start <= max) {
   6382		start = mb_find_next_zero_bit(bitmap, max + 1, start);
   6383		if (start > max)
   6384			break;
   6385		next = mb_find_next_bit(bitmap, max + 1, start);
   6386
   6387		if ((next - start) >= minblocks) {
   6388			int ret = ext4_trim_extent(sb, start, next - start, e4b);
   6389
   6390			if (ret && ret != -EOPNOTSUPP)
   6391				break;
   6392			count += next - start;
   6393		}
   6394		free_count += next - start;
   6395		start = next + 1;
   6396
   6397		if (fatal_signal_pending(current)) {
   6398			count = -ERESTARTSYS;
   6399			break;
   6400		}
   6401
   6402		if (need_resched()) {
   6403			ext4_unlock_group(sb, e4b->bd_group);
   6404			cond_resched();
   6405			ext4_lock_group(sb, e4b->bd_group);
   6406		}
   6407
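		/*
		 * If the free space left in this group is already smaller
		 * than the minimum extent size, no further trimmable extent
		 * can be found here, so stop scanning early.
		 */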
   6408		if ((e4b->bd_info->bb_free - free_count) < minblocks)
   6409			break;
   6410	}
   6411
   6412	return count;
   6413}
   6414
   6415/**
   6416 * ext4_trim_all_free -- function to trim all free space in alloc. group
   6417 * @sb:			super block for file system
   6418 * @group:		group to be trimmed
   6419 * @start:		first group block to examine
   6420 * @max:		last group block to examine
   6421 * @minblocks:		minimum extent block count
   6422 * @set_trimmed:	set the trimmed flag if at least one block is trimmed
   6423 *
   6424 * ext4_trim_all_free walks through the group's block bitmap searching for free
   6425 * extents. When a free extent is found, it is marked as used in the group buddy
   6426 * bitmap, a TRIM command is issued on that extent, and the extent is then freed
   6427 * again in the group buddy bitmap.
   6428 */
   6429static ext4_grpblk_t
   6430ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
   6431		   ext4_grpblk_t start, ext4_grpblk_t max,
   6432		   ext4_grpblk_t minblocks, bool set_trimmed)
   6433{
   6434	struct ext4_buddy e4b;
   6435	int ret;
   6436
   6437	trace_ext4_trim_all_free(sb, group, start, max);
   6438
   6439	ret = ext4_mb_load_buddy(sb, group, &e4b);
   6440	if (ret) {
   6441		ext4_warning(sb, "Error %d loading buddy information for %u",
   6442			     ret, group);
   6443		return ret;
   6444	}
   6445
   6446	ext4_lock_group(sb, group);
   6447
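	/*
	 * Skip the group if it has already been trimmed and the previous
	 * trim pass used a minimum extent size no larger than this request;
	 * that pass already discarded everything this one could find.
	 */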
   6448	if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
   6449	    minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
   6450		ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
   6451		if (ret >= 0 && set_trimmed)
   6452			EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
   6453	} else {
   6454		ret = 0;
   6455	}
   6456
   6457	ext4_unlock_group(sb, group);
   6458	ext4_mb_unload_buddy(&e4b);
   6459
   6460	ext4_debug("trimmed %d blocks in the group %d\n",
   6461		ret, group);
   6462
   6463	return ret;
   6464}
   6465
   6466/**
   6467 * ext4_trim_fs() -- trim ioctl handler function
   6468 * @sb:			superblock for filesystem
   6469 * @range:		fstrim_range structure
   6470 *
   6471 * start:	first byte to trim
   6472 * len:		number of bytes to trim from start
   6473 * minlen:	minimum extent length in bytes
   6474 * ext4_trim_fs goes through all allocation groups containing bytes from
   6475 * start to start+len. For each such group the ext4_trim_all_free function
   6476 * is invoked to trim all free space.
   6477 */
   6478int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
   6479{
   6480	unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev);
   6481	struct ext4_group_info *grp;
   6482	ext4_group_t group, first_group, last_group;
   6483	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
   6484	uint64_t start, end, minlen, trimmed = 0;
   6485	ext4_fsblk_t first_data_blk =
   6486			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
   6487	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
   6488	bool whole_group, eof = false;
   6489	int ret = 0;
   6490
   6491	start = range->start >> sb->s_blocksize_bits;
   6492	end = start + (range->len >> sb->s_blocksize_bits) - 1;
   6493	minlen = EXT4_NUM_B2C(EXT4_SB(sb),
   6494			      range->minlen >> sb->s_blocksize_bits);
   6495
   6496	if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
   6497	    start >= max_blks ||
   6498	    range->len < sb->s_blocksize)
   6499		return -EINVAL;
   6500	/* No point in trying to trim less than the discard granularity */
   6501	if (range->minlen < discard_granularity) {
   6502		minlen = EXT4_NUM_B2C(EXT4_SB(sb),
   6503				discard_granularity >> sb->s_blocksize_bits);
   6504		if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
   6505			goto out;
   6506	}
   6507	if (end >= max_blks - 1) {
   6508		end = max_blks - 1;
   6509		eof = true;
   6510	}
   6511	if (end <= first_data_blk)
   6512		goto out;
   6513	if (start < first_data_blk)
   6514		start = first_data_blk;
   6515
   6516	/* Determine first and last group to examine based on start and end */
   6517	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
   6518				     &first_group, &first_cluster);
   6519	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
   6520				     &last_group, &last_cluster);
   6521
   6522	/* end now represents the last cluster to discard in this group */
   6523	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
   6524	whole_group = true;
   6525
   6526	for (group = first_group; group <= last_group; group++) {
   6527		grp = ext4_get_group_info(sb, group);
   6528		/* We only do this if the grp has never been initialized */
   6529		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
   6530			ret = ext4_mb_init_group(sb, group, GFP_NOFS);
   6531			if (ret)
   6532				break;
   6533		}
   6534
   6535	/*
   6536	 * For all groups except the last one, the last cluster will
   6537	 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
   6538	 * change it for the last group. Note that last_cluster was
   6539	 * already computed earlier by ext4_get_group_no_and_offset().
   6540	 */
   6541		if (group == last_group) {
   6542			end = last_cluster;
   6543			whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1;
   6544		}
   6545		if (grp->bb_free >= minlen) {
   6546			cnt = ext4_trim_all_free(sb, group, first_cluster,
   6547						 end, minlen, whole_group);
   6548			if (cnt < 0) {
   6549				ret = cnt;
   6550				break;
   6551			}
   6552			trimmed += cnt;
   6553		}
   6554
   6555		/*
   6556		 * For every group except the first one, we are sure
   6557		 * that the first cluster to discard will be cluster #0.
   6558		 */
   6559		first_cluster = 0;
   6560	}
   6561
   6562	if (!ret)
   6563		EXT4_SB(sb)->s_last_trim_minblks = minlen;
   6564
   6565out:
   6566	range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
   6567	return ret;
   6568}
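
/*
 * Usage sketch (illustrative only, not part of this file): ext4_trim_fs()
 * backs the FITRIM ioctl, so a trim of a mounted ext4 filesystem can be
 * requested from userspace roughly as below. struct fstrim_range and FITRIM
 * come from <linux/fs.h>; the mount point path is a placeholder.
 *
 *	#include <fcntl.h>
 *	#include <limits.h>
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	int main(void)
 *	{
 *		struct fstrim_range range = {
 *			.start = 0,
 *			.len = ULLONG_MAX,	// examine the whole filesystem
 *			.minlen = 0,		// raised to the discard
 *						// granularity by the kernel
 *		};
 *		int fd = open("/mnt", O_RDONLY);
 *
 *		if (fd < 0 || ioctl(fd, FITRIM, &range) < 0)
 *			return 1;
 *		// on return, range.len holds the number of bytes trimmed
 *		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
 *		return 0;
 *	}
 */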
   6569
   6570/* Iterate all the free extents in the group. */
   6571int
   6572ext4_mballoc_query_range(
   6573	struct super_block		*sb,
   6574	ext4_group_t			group,
   6575	ext4_grpblk_t			start,
   6576	ext4_grpblk_t			end,
   6577	ext4_mballoc_query_range_fn	formatter,
   6578	void				*priv)
   6579{
   6580	void				*bitmap;
   6581	ext4_grpblk_t			next;
   6582	struct ext4_buddy		e4b;
   6583	int				error;
   6584
   6585	error = ext4_mb_load_buddy(sb, group, &e4b);
   6586	if (error)
   6587		return error;
   6588	bitmap = e4b.bd_bitmap;
   6589
   6590	ext4_lock_group(sb, group);
   6591
   6592	start = (e4b.bd_info->bb_first_free > start) ?
   6593		e4b.bd_info->bb_first_free : start;
   6594	if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
   6595		end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
   6596
   6597	while (start <= end) {
   6598		start = mb_find_next_zero_bit(bitmap, end + 1, start);
   6599		if (start > end)
   6600			break;
   6601		next = mb_find_next_bit(bitmap, end + 1, start);
   6602
   6603		ext4_unlock_group(sb, group);
   6604		error = formatter(sb, group, start, next - start, priv);
   6605		if (error)
   6606			goto out_unload;
   6607		ext4_lock_group(sb, group);
   6608
   6609		start = next + 1;
   6610	}
   6611
   6612	ext4_unlock_group(sb, group);
   6613out_unload:
   6614	ext4_mb_unload_buddy(&e4b);
   6615
   6616	return error;
   6617}
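
/*
 * Caller sketch (illustrative only, not part of this file): the formatter
 * callback is invoked once per free extent with the group lock dropped, in
 * the argument order used by the call site above. The callback and variable
 * names below are hypothetical; only ext4_mballoc_query_range() itself comes
 * from this file.
 *
 *	static int count_free_fn(struct super_block *sb, ext4_group_t group,
 *				 ext4_grpblk_t start, ext4_grpblk_t len,
 *				 void *priv)
 *	{
 *		ext4_grpblk_t *total = priv;
 *
 *		*total += len;		// accumulate free clusters seen
 *		return 0;		// nonzero stops the iteration
 *	}
 *
 *	// count all free clusters of one group:
 *	ext4_grpblk_t total = 0;
 *	int err = ext4_mballoc_query_range(sb, group, 0,
 *					   EXT4_CLUSTERS_PER_GROUP(sb) - 1,
 *					   count_free_fn, &total);
 */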