cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

rgrp.c (74926B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
      4 * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
      5 */
      6
      7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
      8
      9#include <linux/slab.h>
     10#include <linux/spinlock.h>
     11#include <linux/completion.h>
     12#include <linux/buffer_head.h>
     13#include <linux/fs.h>
     14#include <linux/gfs2_ondisk.h>
     15#include <linux/prefetch.h>
     16#include <linux/blkdev.h>
     17#include <linux/rbtree.h>
     18#include <linux/random.h>
     19
     20#include "gfs2.h"
     21#include "incore.h"
     22#include "glock.h"
     23#include "glops.h"
     24#include "lops.h"
     25#include "meta_io.h"
     26#include "quota.h"
     27#include "rgrp.h"
     28#include "super.h"
     29#include "trans.h"
     30#include "util.h"
     31#include "log.h"
     32#include "inode.h"
     33#include "trace_gfs2.h"
     34#include "dir.h"
     35
     36#define BFITNOENT ((u32)~0)
     37#define NO_BLOCK ((u64)~0)
     38
     39struct gfs2_rbm {
     40	struct gfs2_rgrpd *rgd;
     41	u32 offset;		/* The offset is bitmap relative */
     42	int bii;		/* Bitmap index */
     43};
     44
     45static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
     46{
     47	return rbm->rgd->rd_bits + rbm->bii;
     48}
     49
     50static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
     51{
     52	BUG_ON(rbm->offset >= rbm->rgd->rd_data);
     53	return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
     54		rbm->offset;
     55}
     56
     57/*
     58 * These routines are used by the resource group routines (rgrp.c)
     59 * to keep track of block allocation.  Each block is represented by two
     60 * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
     61 *
     62 * 0 = Free
     63 * 1 = Used (not metadata)
     64 * 2 = Unlinked (still in use) inode
     65 * 3 = Used (metadata)
     66 */
     67
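/*
 * Illustrative sketch (not part of the GFS2 sources): reading one block's
 * state out of a bitmap byte, assuming the on-disk constants GFS2_NBBY == 4,
 * GFS2_BIT_SIZE == 2 and GFS2_BIT_MASK == 3 from gfs2_ondisk.h.
 */
static inline unsigned char example_block_state(const unsigned char *bitmap,
						unsigned int block)
{
	unsigned int bit = (block % 4) * 2;	/* two bits per block */
	return (bitmap[block / 4] >> bit) & 3;	/* one of the four states above */
}
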
     68struct gfs2_extent {
     69	struct gfs2_rbm rbm;
     70	u32 len;
     71};
     72
     73static const char valid_change[16] = {
     74	        /* current */
     75	/* n */ 0, 1, 1, 1,
     76	/* e */ 1, 0, 0, 0,
     77	/* w */ 0, 0, 0, 1,
     78	        1, 0, 0, 0
     79};
     80
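/*
 * The table above is indexed as valid_change[new_state * 4 + cur_state]
 * (see gfs2_setbit below): rows are the new state, columns the current
 * state.  For example, FREE (0) -> USED (1) reads valid_change[4] == 1
 * and is allowed, while USED (1) -> USED (1) reads valid_change[5] == 0
 * and is rejected as filesystem corruption.
 */
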
     81static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
     82			 struct gfs2_blkreserv *rs, bool nowrap);
     83
     84
     85/**
     86 * gfs2_setbit - Set a bit in the bitmaps
     87 * @rbm: The position of the bit to set
     88 * @do_clone: Also set the clone bitmap, if it exists
     89 * @new_state: the new state of the block
     90 *
     91 */
     92
     93static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
     94			       unsigned char new_state)
     95{
     96	unsigned char *byte1, *byte2, *end, cur_state;
     97	struct gfs2_bitmap *bi = rbm_bi(rbm);
     98	unsigned int buflen = bi->bi_bytes;
     99	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
    100
    101	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
    102	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
    103
    104	BUG_ON(byte1 >= end);
    105
    106	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
    107
    108	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
    109		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
    110
    111		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
    112			rbm->offset, cur_state, new_state);
    113		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
    114			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
    115			(unsigned long long)bi->bi_bh->b_blocknr);
    116		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
    117			bi->bi_offset, bi->bi_bytes,
    118			(unsigned long long)gfs2_rbm_to_block(rbm));
    119		dump_stack();
    120		gfs2_consist_rgrpd(rbm->rgd);
    121		return;
    122	}
    123	*byte1 ^= (cur_state ^ new_state) << bit;
    124
    125	if (do_clone && bi->bi_clone) {
    126		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
    127		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
    128		*byte2 ^= (cur_state ^ new_state) << bit;
    129	}
    130}
    131
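/*
 * Note on gfs2_setbit above: XOR-ing with (cur_state ^ new_state) << bit
 * rewrites only the two bits belonging to this block.  For instance, going
 * from state 01 to 11 at bit position 4 XORs the byte with 0x20, flipping
 * bit 5 while the other three entries sharing the byte stay untouched.
 */
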
    132/**
    133 * gfs2_testbit - test a bit in the bitmaps
    134 * @rbm: The bit to test
    135 * @use_clone: If true, test the clone bitmap, not the official bitmap.
    136 *
    137 * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
    138 * not the "real" bitmaps, to avoid allocating recently freed blocks.
    139 *
    140 * Returns: The two bit block state of the requested bit
    141 */
    142
    143static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
    144{
    145	struct gfs2_bitmap *bi = rbm_bi(rbm);
    146	const u8 *buffer;
    147	const u8 *byte;
    148	unsigned int bit;
    149
    150	if (use_clone && bi->bi_clone)
    151		buffer = bi->bi_clone;
    152	else
    153		buffer = bi->bi_bh->b_data;
    154	buffer += bi->bi_offset;
    155	byte = buffer + (rbm->offset / GFS2_NBBY);
    156	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
    157
    158	return (*byte >> bit) & GFS2_BIT_MASK;
    159}
    160
    161/**
    162 * gfs2_bit_search
    163 * @ptr: Pointer to bitmap data
    164 * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
    165 * @state: The state we are searching for
    166 *
     167 * We xor the bitmap data with a pattern which is the bitwise opposite
     168 * of what we are looking for; this gives rise to a pattern of ones
    169 * wherever there is a match. Since we have two bits per entry, we
     170 * take this pattern, shift it down by one place and then AND it with
    171 * the original. All the even bit positions (0,2,4, etc) then represent
    172 * successful matches, so we mask with 0x55555..... to remove the unwanted
    173 * odd bit positions.
    174 *
    175 * This allows searching of a whole u64 at once (32 blocks) with a
    176 * single test (on 64 bit arches).
    177 */
    178
    179static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
    180{
    181	u64 tmp;
    182	static const u64 search[] = {
    183		[0] = 0xffffffffffffffffULL,
    184		[1] = 0xaaaaaaaaaaaaaaaaULL,
    185		[2] = 0x5555555555555555ULL,
    186		[3] = 0x0000000000000000ULL,
    187	};
    188	tmp = le64_to_cpu(*ptr) ^ search[state];
    189	tmp &= (tmp >> 1);
    190	tmp &= mask;
    191	return tmp;
    192}
    193
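/*
 * Worked example of the search above, one byte wide for brevity: take the
 * bitmap byte 0x94 = 10 01 01 00 (entries 3..0), i.e. block 0 free, blocks
 * 1 and 2 used, block 3 unlinked.  Searching for GFS2_BLKST_FREE (state 0,
 * search pattern 0xff..ff):
 *
 *	tmp  = 0x94 ^ 0xff;	tmp == 0x6b  (a 1 wherever a 0-bit matched)
 *	tmp &= tmp >> 1;	tmp == 0x21  (even bit set iff both bits matched)
 *	tmp &= 0x55;		tmp == 0x01  (keep even positions only)
 *
 * Bit 0 survives, so entry 0 (block 0) is the first free block.
 */
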
    194/**
    195 * rs_cmp - multi-block reservation range compare
    196 * @start: start of the new reservation
    197 * @len: number of blocks in the new reservation
    198 * @rs: existing reservation to compare against
    199 *
     200 * returns: 1 if the block range starts beyond the end of the reservation
     201 *         -1 if the block range ends before the start of the reservation
    202 *          0 if the block range overlaps with the reservation
    203 */
    204static inline int rs_cmp(u64 start, u32 len, struct gfs2_blkreserv *rs)
    205{
    206	if (start >= rs->rs_start + rs->rs_requested)
    207		return 1;
    208	if (rs->rs_start >= start + len)
    209		return -1;
    210	return 0;
    211}
    212
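/*
 * Example: a reservation with rs_start == 100 and rs_requested == 10 covers
 * blocks 100..109, so rs_cmp(110, 5, rs) == 1 (range lies after it),
 * rs_cmp(90, 10, rs) == -1 (range ends at block 99, before it), and
 * rs_cmp(105, 2, rs) == 0 (range overlaps it).
 */
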
    213/**
    214 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
    215 *       a block in a given allocation state.
    216 * @buf: the buffer that holds the bitmaps
    217 * @len: the length (in bytes) of the buffer
     218 * @goal: start search at this block's bit-pair (within @buf)
    219 * @state: GFS2_BLKST_XXX the state of the block we're looking for.
    220 *
    221 * Scope of @goal and returned block number is only within this bitmap buffer,
     222 * not the entire rgrp or filesystem.  @buf will be offset from the actual
    223 * beginning of a bitmap block buffer, skipping any header structures, but
    224 * headers are always a multiple of 64 bits long so that the buffer is
    225 * always aligned to a 64 bit boundary.
    226 *
     227 * The size of the buffer is in bytes, but it is assumed that it is
     228 * always ok to read a complete multiple of 64 bits at the end
     229 * of the block in case the end is not aligned to a natural boundary.
    230 *
    231 * Return: the block number (bitmap buffer scope) that was found
    232 */
    233
    234static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
    235		       u32 goal, u8 state)
    236{
    237	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
    238	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
    239	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
    240	u64 tmp;
    241	u64 mask = 0x5555555555555555ULL;
    242	u32 bit;
    243
    244	/* Mask off bits we don't care about at the start of the search */
    245	mask <<= spoint;
    246	tmp = gfs2_bit_search(ptr, mask, state);
    247	ptr++;
    248	while(tmp == 0 && ptr < end) {
    249		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
    250		ptr++;
    251	}
    252	/* Mask off any bits which are more than len bytes from the start */
    253	if (ptr == end && (len & (sizeof(u64) - 1)))
    254		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
    255	/* Didn't find anything, so return */
    256	if (tmp == 0)
    257		return BFITNOENT;
    258	ptr--;
    259	bit = __ffs64(tmp);
    260	bit /= 2;	/* two bits per entry in the bitmap */
    261	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
    262}
    263
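/*
 * The return value above combines two terms: after "ptr--", (ptr - buf) is
 * the byte offset of the u64 that matched, and each byte holds GFS2_NBBY (4)
 * entries; __ffs64(tmp) finds the lowest surviving (even) bit, and dividing
 * by two turns that bit position into an entry index 0..31 within the u64.
 */
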
    264/**
    265 * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
    266 * @rbm: The rbm with rgd already set correctly
    267 * @block: The block number (filesystem relative)
    268 *
    269 * This sets the bi and offset members of an rbm based on a
    270 * resource group and a filesystem relative block number. The
    271 * resource group must be set in the rbm on entry, the bi and
    272 * offset members will be set by this function.
    273 *
    274 * Returns: 0 on success, or an error code
    275 */
    276
    277static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
    278{
    279	if (!rgrp_contains_block(rbm->rgd, block))
    280		return -E2BIG;
    281	rbm->bii = 0;
    282	rbm->offset = block - rbm->rgd->rd_data0;
     283	/* Check if the block is within the first bitmap block */
    284	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
    285		return 0;
    286
    287	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
    288	rbm->offset += (sizeof(struct gfs2_rgrp) -
    289			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
    290	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
    291	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
    292	return 0;
    293}
    294
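/*
 * The adjustment above works because the first bitmap shares its block with
 * the rgrp header and is therefore smaller than the others by
 * (sizeof(struct gfs2_rgrp) - sizeof(struct gfs2_meta_header)) * GFS2_NBBY
 * blocks.  Adding that bias makes every bitmap appear to hold exactly
 * sd_blocks_per_bitmap blocks, so a plain divide/modulo yields the bitmap
 * index and the offset within it.
 */
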
    295/**
    296 * gfs2_rbm_add - add a number of blocks to an rbm
    297 * @rbm: The rbm with rgd already set correctly
     298 * @blocks: The number of blocks to add to rbm
    299 *
    300 * This function takes an existing rbm structure and adds a number of blocks to
    301 * it.
    302 *
    303 * Returns: True if the new rbm would point past the end of the rgrp.
    304 */
    305
    306static bool gfs2_rbm_add(struct gfs2_rbm *rbm, u32 blocks)
    307{
    308	struct gfs2_rgrpd *rgd = rbm->rgd;
    309	struct gfs2_bitmap *bi = rgd->rd_bits + rbm->bii;
    310
    311	if (rbm->offset + blocks < bi->bi_blocks) {
    312		rbm->offset += blocks;
    313		return false;
    314	}
    315	blocks -= bi->bi_blocks - rbm->offset;
    316
    317	for(;;) {
    318		bi++;
    319		if (bi == rgd->rd_bits + rgd->rd_length)
    320			return true;
    321		if (blocks < bi->bi_blocks) {
    322			rbm->offset = blocks;
    323			rbm->bii = bi - rgd->rd_bits;
    324			return false;
    325		}
    326		blocks -= bi->bi_blocks;
    327	}
    328}
    329
    330/**
    331 * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
    332 * @rbm: Position to search (value/result)
    333 * @n_unaligned: Number of unaligned blocks to check
    334 * @len: Decremented for each block found (terminate on zero)
    335 *
    336 * Returns: true if a non-free block is encountered or the end of the resource
    337 *	    group is reached.
    338 */
    339
    340static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
    341{
    342	u32 n;
    343	u8 res;
    344
    345	for (n = 0; n < n_unaligned; n++) {
    346		res = gfs2_testbit(rbm, true);
    347		if (res != GFS2_BLKST_FREE)
    348			return true;
    349		(*len)--;
    350		if (*len == 0)
    351			return true;
    352		if (gfs2_rbm_add(rbm, 1))
    353			return true;
    354	}
    355
    356	return false;
    357}
    358
    359/**
    360 * gfs2_free_extlen - Return extent length of free blocks
    361 * @rrbm: Starting position
    362 * @len: Max length to check
    363 *
    364 * Starting at the block specified by the rbm, see how many free blocks
    365 * there are, not reading more than len blocks ahead. This can be done
    366 * using memchr_inv when the blocks are byte aligned, but has to be done
    367 * on a block by block basis in case of unaligned blocks. Also this
    368 * function can cope with bitmap boundaries (although it must stop on
    369 * a resource group boundary)
    370 *
    371 * Returns: Number of free blocks in the extent
    372 */
    373
    374static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
    375{
    376	struct gfs2_rbm rbm = *rrbm;
    377	u32 n_unaligned = rbm.offset & 3;
    378	u32 size = len;
    379	u32 bytes;
    380	u32 chunk_size;
    381	u8 *ptr, *start, *end;
    382	u64 block;
    383	struct gfs2_bitmap *bi;
    384
    385	if (n_unaligned &&
    386	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
    387		goto out;
    388
    389	n_unaligned = len & 3;
    390	/* Start is now byte aligned */
    391	while (len > 3) {
    392		bi = rbm_bi(&rbm);
    393		start = bi->bi_bh->b_data;
    394		if (bi->bi_clone)
    395			start = bi->bi_clone;
    396		start += bi->bi_offset;
    397		end = start + bi->bi_bytes;
    398		BUG_ON(rbm.offset & 3);
    399		start += (rbm.offset / GFS2_NBBY);
    400		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
    401		ptr = memchr_inv(start, 0, bytes);
    402		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
    403		chunk_size *= GFS2_NBBY;
    404		BUG_ON(len < chunk_size);
    405		len -= chunk_size;
    406		block = gfs2_rbm_to_block(&rbm);
    407		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
    408			n_unaligned = 0;
    409			break;
    410		}
    411		if (ptr) {
    412			n_unaligned = 3;
    413			break;
    414		}
    415		n_unaligned = len & 3;
    416	}
    417
    418	/* Deal with any bits left over at the end */
    419	if (n_unaligned)
    420		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
    421out:
    422	return size - len;
    423}
    424
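/*
 * In the byte-aligned loop above, a zero byte means four consecutive free
 * blocks (state 00 in each of its GFS2_NBBY entries), so memchr_inv() finds
 * the first byte containing any non-free block; every byte before it
 * contributes a full four blocks to the extent length.
 */
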
    425/**
    426 * gfs2_bitcount - count the number of bits in a certain state
    427 * @rgd: the resource group descriptor
    428 * @buffer: the buffer that holds the bitmaps
    429 * @buflen: the length (in bytes) of the buffer
    430 * @state: the state of the block we're looking for
    431 *
    432 * Returns: The number of bits
    433 */
    434
    435static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
    436			 unsigned int buflen, u8 state)
    437{
    438	const u8 *byte = buffer;
    439	const u8 *end = buffer + buflen;
    440	const u8 state1 = state << 2;
    441	const u8 state2 = state << 4;
    442	const u8 state3 = state << 6;
    443	u32 count = 0;
    444
    445	for (; byte < end; byte++) {
    446		if (((*byte) & 0x03) == state)
    447			count++;
    448		if (((*byte) & 0x0C) == state1)
    449			count++;
    450		if (((*byte) & 0x30) == state2)
    451			count++;
    452		if (((*byte) & 0xC0) == state3)
    453			count++;
    454	}
    455
    456	return count;
    457}
    458
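/*
 * The four masks above (0x03, 0x0C, 0x30, 0xC0) isolate the four 2-bit
 * entries of a byte, with state1..state3 being @state pre-shifted to the
 * matching positions.  For @state == 1, the byte 0x45 = 01 00 01 01 bumps
 * the count three times, once each for entries 0, 1 and 3.
 */
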
    459/**
    460 * gfs2_rgrp_verify - Verify that a resource group is consistent
    461 * @rgd: the rgrp
    462 *
    463 */
    464
    465void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
    466{
    467	struct gfs2_sbd *sdp = rgd->rd_sbd;
    468	struct gfs2_bitmap *bi = NULL;
    469	u32 length = rgd->rd_length;
    470	u32 count[4], tmp;
    471	int buf, x;
    472
    473	memset(count, 0, 4 * sizeof(u32));
    474
    475	/* Count # blocks in each of 4 possible allocation states */
    476	for (buf = 0; buf < length; buf++) {
    477		bi = rgd->rd_bits + buf;
    478		for (x = 0; x < 4; x++)
    479			count[x] += gfs2_bitcount(rgd,
    480						  bi->bi_bh->b_data +
    481						  bi->bi_offset,
    482						  bi->bi_bytes, x);
    483	}
    484
    485	if (count[0] != rgd->rd_free) {
    486		gfs2_lm(sdp, "free data mismatch:  %u != %u\n",
    487			count[0], rgd->rd_free);
    488		gfs2_consist_rgrpd(rgd);
    489		return;
    490	}
    491
    492	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
    493	if (count[1] != tmp) {
    494		gfs2_lm(sdp, "used data mismatch:  %u != %u\n",
    495			count[1], tmp);
    496		gfs2_consist_rgrpd(rgd);
    497		return;
    498	}
    499
    500	if (count[2] + count[3] != rgd->rd_dinodes) {
    501		gfs2_lm(sdp, "used metadata mismatch:  %u != %u\n",
    502			count[2] + count[3], rgd->rd_dinodes);
    503		gfs2_consist_rgrpd(rgd);
    504		return;
    505	}
    506}
    507
    508/**
    509 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
    510 * @sdp: The GFS2 superblock
    511 * @blk: The data block number
    512 * @exact: True if this needs to be an exact match
    513 *
    514 * The @exact argument should be set to true by most callers. The exception
    515 * is when we need to match blocks which are not represented by the rgrp
    516 * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
    517 * there for alignment purposes. Another way of looking at it is that @exact
    518 * matches only valid data/metadata blocks, but with @exact false, it will
    519 * match any block within the extent of the rgrp.
    520 *
    521 * Returns: The resource group, or NULL if not found
    522 */
    523
    524struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
    525{
    526	struct rb_node *n, *next;
    527	struct gfs2_rgrpd *cur;
    528
    529	spin_lock(&sdp->sd_rindex_spin);
    530	n = sdp->sd_rindex_tree.rb_node;
    531	while (n) {
    532		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
    533		next = NULL;
    534		if (blk < cur->rd_addr)
    535			next = n->rb_left;
    536		else if (blk >= cur->rd_data0 + cur->rd_data)
    537			next = n->rb_right;
    538		if (next == NULL) {
    539			spin_unlock(&sdp->sd_rindex_spin);
    540			if (exact) {
    541				if (blk < cur->rd_addr)
    542					return NULL;
    543				if (blk >= cur->rd_data0 + cur->rd_data)
    544					return NULL;
    545			}
    546			return cur;
    547		}
    548		n = next;
    549	}
    550	spin_unlock(&sdp->sd_rindex_spin);
    551
    552	return NULL;
    553}
    554
    555/**
    556 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
    557 * @sdp: The GFS2 superblock
    558 *
    559 * Returns: The first rgrp in the filesystem
    560 */
    561
    562struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
    563{
    564	const struct rb_node *n;
    565	struct gfs2_rgrpd *rgd;
    566
    567	spin_lock(&sdp->sd_rindex_spin);
    568	n = rb_first(&sdp->sd_rindex_tree);
    569	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
    570	spin_unlock(&sdp->sd_rindex_spin);
    571
    572	return rgd;
    573}
    574
    575/**
    576 * gfs2_rgrpd_get_next - get the next RG
    577 * @rgd: the resource group descriptor
    578 *
    579 * Returns: The next rgrp
    580 */
    581
    582struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
    583{
    584	struct gfs2_sbd *sdp = rgd->rd_sbd;
    585	const struct rb_node *n;
    586
    587	spin_lock(&sdp->sd_rindex_spin);
    588	n = rb_next(&rgd->rd_node);
    589	if (n == NULL)
    590		n = rb_first(&sdp->sd_rindex_tree);
    591
    592	if (unlikely(&rgd->rd_node == n)) {
    593		spin_unlock(&sdp->sd_rindex_spin);
    594		return NULL;
    595	}
    596	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
    597	spin_unlock(&sdp->sd_rindex_spin);
    598	return rgd;
    599}
    600
    601void check_and_update_goal(struct gfs2_inode *ip)
    602{
    603	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
    604	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
    605		ip->i_goal = ip->i_no_addr;
    606}
    607
    608void gfs2_free_clones(struct gfs2_rgrpd *rgd)
    609{
    610	int x;
    611
    612	for (x = 0; x < rgd->rd_length; x++) {
    613		struct gfs2_bitmap *bi = rgd->rd_bits + x;
    614		kfree(bi->bi_clone);
    615		bi->bi_clone = NULL;
    616	}
    617}
    618
    619static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
    620		    const char *fs_id_buf)
    621{
    622	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
    623
    624	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu f:%u\n",
    625		       fs_id_buf,
    626		       (unsigned long long)ip->i_no_addr,
    627		       (unsigned long long)rs->rs_start,
    628		       rs->rs_requested);
    629}
    630
    631/**
    632 * __rs_deltree - remove a multi-block reservation from the rgd tree
    633 * @rs: The reservation to remove
    634 *
    635 */
    636static void __rs_deltree(struct gfs2_blkreserv *rs)
    637{
    638	struct gfs2_rgrpd *rgd;
    639
    640	if (!gfs2_rs_active(rs))
    641		return;
    642
    643	rgd = rs->rs_rgd;
    644	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
    645	rb_erase(&rs->rs_node, &rgd->rd_rstree);
    646	RB_CLEAR_NODE(&rs->rs_node);
    647
    648	if (rs->rs_requested) {
    649		/* return requested blocks to the rgrp */
    650		BUG_ON(rs->rs_rgd->rd_requested < rs->rs_requested);
    651		rs->rs_rgd->rd_requested -= rs->rs_requested;
    652
    653		/* The rgrp extent failure point is likely not to increase;
    654		   it will only do so if the freed blocks are somehow
    655		   contiguous with a span of free blocks that follows. Still,
    656		   it will force the number to be recalculated later. */
    657		rgd->rd_extfail_pt += rs->rs_requested;
    658		rs->rs_requested = 0;
    659	}
    660}
    661
    662/**
    663 * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
    664 * @rs: The reservation to remove
    665 *
    666 */
    667void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
    668{
    669	struct gfs2_rgrpd *rgd;
    670
    671	rgd = rs->rs_rgd;
    672	if (rgd) {
    673		spin_lock(&rgd->rd_rsspin);
    674		__rs_deltree(rs);
    675		BUG_ON(rs->rs_requested);
    676		spin_unlock(&rgd->rd_rsspin);
    677	}
    678}
    679
    680/**
    681 * gfs2_rs_delete - delete a multi-block reservation
    682 * @ip: The inode for this reservation
    683 *
    684 */
    685void gfs2_rs_delete(struct gfs2_inode *ip)
    686{
    687	struct inode *inode = &ip->i_inode;
    688
    689	down_write(&ip->i_rw_mutex);
    690	if (atomic_read(&inode->i_writecount) <= 1)
    691		gfs2_rs_deltree(&ip->i_res);
    692	up_write(&ip->i_rw_mutex);
    693}
    694
    695/**
    696 * return_all_reservations - return all reserved blocks back to the rgrp.
    697 * @rgd: the rgrp that needs its space back
    698 *
    699 * We previously reserved a bunch of blocks for allocation. Now we need to
     700 * give them back. This leaves the reservation structures intact, but removes
    701 * all of their corresponding "no-fly zones".
    702 */
    703static void return_all_reservations(struct gfs2_rgrpd *rgd)
    704{
    705	struct rb_node *n;
    706	struct gfs2_blkreserv *rs;
    707
    708	spin_lock(&rgd->rd_rsspin);
    709	while ((n = rb_first(&rgd->rd_rstree))) {
    710		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
    711		__rs_deltree(rs);
    712	}
    713	spin_unlock(&rgd->rd_rsspin);
    714}
    715
    716void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
    717{
    718	struct rb_node *n;
    719	struct gfs2_rgrpd *rgd;
    720	struct gfs2_glock *gl;
    721
    722	while ((n = rb_first(&sdp->sd_rindex_tree))) {
    723		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
    724		gl = rgd->rd_gl;
    725
    726		rb_erase(n, &sdp->sd_rindex_tree);
    727
    728		if (gl) {
    729			if (gl->gl_state != LM_ST_UNLOCKED) {
    730				gfs2_glock_cb(gl, LM_ST_UNLOCKED);
    731				flush_delayed_work(&gl->gl_work);
    732			}
    733			gfs2_rgrp_brelse(rgd);
    734			glock_clear_object(gl, rgd);
    735			gfs2_glock_put(gl);
    736		}
    737
    738		gfs2_free_clones(rgd);
    739		return_all_reservations(rgd);
    740		kfree(rgd->rd_bits);
    741		rgd->rd_bits = NULL;
    742		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
    743	}
    744}
    745
    746/**
    747 * compute_bitstructs - Compute the bitmap sizes
    748 * @rgd: The resource group descriptor
    749 *
    750 * Calculates bitmap descriptors, one for each block that contains bitmap data
    751 *
    752 * Returns: errno
    753 */
    754
    755static int compute_bitstructs(struct gfs2_rgrpd *rgd)
    756{
    757	struct gfs2_sbd *sdp = rgd->rd_sbd;
    758	struct gfs2_bitmap *bi;
    759	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
    760	u32 bytes_left, bytes;
    761	int x;
    762
    763	if (!length)
    764		return -EINVAL;
    765
    766	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
    767	if (!rgd->rd_bits)
    768		return -ENOMEM;
    769
    770	bytes_left = rgd->rd_bitbytes;
    771
    772	for (x = 0; x < length; x++) {
    773		bi = rgd->rd_bits + x;
    774
    775		bi->bi_flags = 0;
    776		/* small rgrp; bitmap stored completely in header block */
    777		if (length == 1) {
    778			bytes = bytes_left;
    779			bi->bi_offset = sizeof(struct gfs2_rgrp);
    780			bi->bi_start = 0;
    781			bi->bi_bytes = bytes;
    782			bi->bi_blocks = bytes * GFS2_NBBY;
    783		/* header block */
    784		} else if (x == 0) {
    785			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
    786			bi->bi_offset = sizeof(struct gfs2_rgrp);
    787			bi->bi_start = 0;
    788			bi->bi_bytes = bytes;
    789			bi->bi_blocks = bytes * GFS2_NBBY;
    790		/* last block */
    791		} else if (x + 1 == length) {
    792			bytes = bytes_left;
    793			bi->bi_offset = sizeof(struct gfs2_meta_header);
    794			bi->bi_start = rgd->rd_bitbytes - bytes_left;
    795			bi->bi_bytes = bytes;
    796			bi->bi_blocks = bytes * GFS2_NBBY;
    797		/* other blocks */
    798		} else {
    799			bytes = sdp->sd_sb.sb_bsize -
    800				sizeof(struct gfs2_meta_header);
    801			bi->bi_offset = sizeof(struct gfs2_meta_header);
    802			bi->bi_start = rgd->rd_bitbytes - bytes_left;
    803			bi->bi_bytes = bytes;
    804			bi->bi_blocks = bytes * GFS2_NBBY;
    805		}
    806
    807		bytes_left -= bytes;
    808	}
    809
    810	if (bytes_left) {
    811		gfs2_consist_rgrpd(rgd);
    812		return -EIO;
    813	}
    814	bi = rgd->rd_bits + (length - 1);
    815	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
    816		gfs2_lm(sdp,
    817			"ri_addr = %llu\n"
    818			"ri_length = %u\n"
    819			"ri_data0 = %llu\n"
    820			"ri_data = %u\n"
    821			"ri_bitbytes = %u\n"
    822			"start=%u len=%u offset=%u\n",
    823			(unsigned long long)rgd->rd_addr,
    824			rgd->rd_length,
    825			(unsigned long long)rgd->rd_data0,
    826			rgd->rd_data,
    827			rgd->rd_bitbytes,
    828			bi->bi_start, bi->bi_bytes, bi->bi_offset);
    829		gfs2_consist_rgrpd(rgd);
    830		return -EIO;
    831	}
    832
    833	return 0;
    834}
    835
    836/**
    837 * gfs2_ri_total - Total up the file system space, according to the rindex.
    838 * @sdp: the filesystem
    839 *
    840 */
    841u64 gfs2_ri_total(struct gfs2_sbd *sdp)
    842{
     843	u64 total_data = 0;
    844	struct inode *inode = sdp->sd_rindex;
    845	struct gfs2_inode *ip = GFS2_I(inode);
    846	char buf[sizeof(struct gfs2_rindex)];
    847	int error, rgrps;
    848
    849	for (rgrps = 0;; rgrps++) {
    850		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
    851
    852		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
    853			break;
    854		error = gfs2_internal_read(ip, buf, &pos,
    855					   sizeof(struct gfs2_rindex));
    856		if (error != sizeof(struct gfs2_rindex))
    857			break;
    858		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
    859	}
    860	return total_data;
    861}
    862
    863static int rgd_insert(struct gfs2_rgrpd *rgd)
    864{
    865	struct gfs2_sbd *sdp = rgd->rd_sbd;
    866	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
    867
    868	/* Figure out where to put new node */
    869	while (*newn) {
    870		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
    871						  rd_node);
    872
    873		parent = *newn;
    874		if (rgd->rd_addr < cur->rd_addr)
    875			newn = &((*newn)->rb_left);
    876		else if (rgd->rd_addr > cur->rd_addr)
    877			newn = &((*newn)->rb_right);
    878		else
    879			return -EEXIST;
    880	}
    881
    882	rb_link_node(&rgd->rd_node, parent, newn);
    883	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
    884	sdp->sd_rgrps++;
    885	return 0;
    886}
    887
    888/**
    889 * read_rindex_entry - Pull in a new resource index entry from the disk
    890 * @ip: Pointer to the rindex inode
    891 *
    892 * Returns: 0 on success, > 0 on EOF, error code otherwise
    893 */
    894
    895static int read_rindex_entry(struct gfs2_inode *ip)
    896{
    897	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
    898	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
    899	struct gfs2_rindex buf;
    900	int error;
    901	struct gfs2_rgrpd *rgd;
    902
    903	if (pos >= i_size_read(&ip->i_inode))
    904		return 1;
    905
    906	error = gfs2_internal_read(ip, (char *)&buf, &pos,
    907				   sizeof(struct gfs2_rindex));
    908
    909	if (error != sizeof(struct gfs2_rindex))
    910		return (error == 0) ? 1 : error;
    911
    912	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
    913	error = -ENOMEM;
    914	if (!rgd)
    915		return error;
    916
    917	rgd->rd_sbd = sdp;
    918	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
    919	rgd->rd_length = be32_to_cpu(buf.ri_length);
    920	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
    921	rgd->rd_data = be32_to_cpu(buf.ri_data);
    922	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
    923	spin_lock_init(&rgd->rd_rsspin);
    924	mutex_init(&rgd->rd_mutex);
    925
    926	error = gfs2_glock_get(sdp, rgd->rd_addr,
    927			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
    928	if (error)
    929		goto fail;
    930
    931	error = compute_bitstructs(rgd);
    932	if (error)
    933		goto fail_glock;
    934
    935	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
    936	rgd->rd_flags &= ~GFS2_RDF_PREFERRED;
    937	if (rgd->rd_data > sdp->sd_max_rg_data)
    938		sdp->sd_max_rg_data = rgd->rd_data;
    939	spin_lock(&sdp->sd_rindex_spin);
    940	error = rgd_insert(rgd);
    941	spin_unlock(&sdp->sd_rindex_spin);
    942	if (!error) {
    943		glock_set_object(rgd->rd_gl, rgd);
    944		return 0;
    945	}
    946
    947	error = 0; /* someone else read in the rgrp; free it and ignore it */
    948fail_glock:
    949	gfs2_glock_put(rgd->rd_gl);
    950
    951fail:
    952	kfree(rgd->rd_bits);
    953	rgd->rd_bits = NULL;
    954	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
    955	return error;
    956}
    957
    958/**
    959 * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
    960 * @sdp: the GFS2 superblock
    961 *
    962 * The purpose of this function is to select a subset of the resource groups
    963 * and mark them as PREFERRED. We do it in such a way that each node prefers
    964 * to use a unique set of rgrps to minimize glock contention.
    965 */
    966static void set_rgrp_preferences(struct gfs2_sbd *sdp)
    967{
    968	struct gfs2_rgrpd *rgd, *first;
    969	int i;
    970
    971	/* Skip an initial number of rgrps, based on this node's journal ID.
    972	   That should start each node out on its own set. */
    973	rgd = gfs2_rgrpd_get_first(sdp);
    974	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
    975		rgd = gfs2_rgrpd_get_next(rgd);
    976	first = rgd;
    977
    978	do {
    979		rgd->rd_flags |= GFS2_RDF_PREFERRED;
    980		for (i = 0; i < sdp->sd_journals; i++) {
    981			rgd = gfs2_rgrpd_get_next(rgd);
    982			if (!rgd || rgd == first)
    983				break;
    984		}
    985	} while (rgd && rgd != first);
    986}
    987
    988/**
    989 * gfs2_ri_update - Pull in a new resource index from the disk
    990 * @ip: pointer to the rindex inode
    991 *
    992 * Returns: 0 on successful update, error code otherwise
    993 */
    994
    995static int gfs2_ri_update(struct gfs2_inode *ip)
    996{
    997	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
    998	int error;
    999
   1000	do {
   1001		error = read_rindex_entry(ip);
   1002	} while (error == 0);
   1003
   1004	if (error < 0)
   1005		return error;
   1006
   1007	if (RB_EMPTY_ROOT(&sdp->sd_rindex_tree)) {
   1008		fs_err(sdp, "no resource groups found in the file system.\n");
   1009		return -ENOENT;
   1010	}
   1011	set_rgrp_preferences(sdp);
   1012
   1013	sdp->sd_rindex_uptodate = 1;
   1014	return 0;
   1015}
   1016
   1017/**
   1018 * gfs2_rindex_update - Update the rindex if required
   1019 * @sdp: The GFS2 superblock
   1020 *
   1021 * We grab a lock on the rindex inode to make sure that it doesn't
   1022 * change whilst we are performing an operation. We keep this lock
   1023 * for quite long periods of time compared to other locks. This
   1024 * doesn't matter, since it is shared and it is very, very rarely
   1025 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
   1026 *
   1027 * This makes sure that we're using the latest copy of the resource index
   1028 * special file, which might have been updated if someone expanded the
   1029 * filesystem (via gfs2_grow utility), which adds new resource groups.
   1030 *
    1031 * Returns: 0 on success, error code otherwise
   1032 */
   1033
   1034int gfs2_rindex_update(struct gfs2_sbd *sdp)
   1035{
   1036	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
   1037	struct gfs2_glock *gl = ip->i_gl;
   1038	struct gfs2_holder ri_gh;
   1039	int error = 0;
   1040	int unlock_required = 0;
   1041
   1042	/* Read new copy from disk if we don't have the latest */
   1043	if (!sdp->sd_rindex_uptodate) {
   1044		if (!gfs2_glock_is_locked_by_me(gl)) {
   1045			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
   1046			if (error)
   1047				return error;
   1048			unlock_required = 1;
   1049		}
   1050		if (!sdp->sd_rindex_uptodate)
   1051			error = gfs2_ri_update(ip);
   1052		if (unlock_required)
   1053			gfs2_glock_dq_uninit(&ri_gh);
   1054	}
   1055
   1056	return error;
   1057}
   1058
   1059static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
   1060{
   1061	const struct gfs2_rgrp *str = buf;
   1062	u32 rg_flags;
   1063
   1064	rg_flags = be32_to_cpu(str->rg_flags);
   1065	rg_flags &= ~GFS2_RDF_MASK;
   1066	rgd->rd_flags &= GFS2_RDF_MASK;
   1067	rgd->rd_flags |= rg_flags;
   1068	rgd->rd_free = be32_to_cpu(str->rg_free);
   1069	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
   1070	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
   1071	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
   1072}
   1073
   1074static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
   1075{
   1076	const struct gfs2_rgrp *str = buf;
   1077
   1078	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
   1079	rgl->rl_flags = str->rg_flags;
   1080	rgl->rl_free = str->rg_free;
   1081	rgl->rl_dinodes = str->rg_dinodes;
   1082	rgl->rl_igeneration = str->rg_igeneration;
   1083	rgl->__pad = 0UL;
   1084}
   1085
   1086static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
   1087{
   1088	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
   1089	struct gfs2_rgrp *str = buf;
   1090	u32 crc;
   1091
   1092	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
   1093	str->rg_free = cpu_to_be32(rgd->rd_free);
   1094	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
   1095	if (next == NULL)
   1096		str->rg_skip = 0;
   1097	else if (next->rd_addr > rgd->rd_addr)
   1098		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
   1099	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
   1100	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
   1101	str->rg_data = cpu_to_be32(rgd->rd_data);
   1102	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
   1103	str->rg_crc = 0;
   1104	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
   1105	str->rg_crc = cpu_to_be32(crc);
   1106
   1107	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
   1108	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
   1109}
   1110
   1111static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
   1112{
   1113	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
   1114	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
   1115	struct gfs2_sbd *sdp = rgd->rd_sbd;
   1116	int valid = 1;
   1117
   1118	if (rgl->rl_flags != str->rg_flags) {
   1119		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
   1120			(unsigned long long)rgd->rd_addr,
   1121		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
   1122		valid = 0;
   1123	}
   1124	if (rgl->rl_free != str->rg_free) {
   1125		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
   1126			(unsigned long long)rgd->rd_addr,
   1127			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
   1128		valid = 0;
   1129	}
   1130	if (rgl->rl_dinodes != str->rg_dinodes) {
   1131		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
   1132			(unsigned long long)rgd->rd_addr,
   1133			be32_to_cpu(rgl->rl_dinodes),
   1134			be32_to_cpu(str->rg_dinodes));
   1135		valid = 0;
   1136	}
   1137	if (rgl->rl_igeneration != str->rg_igeneration) {
   1138		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
   1139			(unsigned long long)rgd->rd_addr,
   1140			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
   1141			(unsigned long long)be64_to_cpu(str->rg_igeneration));
   1142		valid = 0;
   1143	}
   1144	return valid;
   1145}
   1146
   1147static u32 count_unlinked(struct gfs2_rgrpd *rgd)
   1148{
   1149	struct gfs2_bitmap *bi;
   1150	const u32 length = rgd->rd_length;
   1151	const u8 *buffer = NULL;
   1152	u32 i, goal, count = 0;
   1153
   1154	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
   1155		goal = 0;
   1156		buffer = bi->bi_bh->b_data + bi->bi_offset;
   1157		WARN_ON(!buffer_uptodate(bi->bi_bh));
   1158		while (goal < bi->bi_blocks) {
   1159			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
   1160					   GFS2_BLKST_UNLINKED);
   1161			if (goal == BFITNOENT)
   1162				break;
   1163			count++;
   1164			goal++;
   1165		}
   1166	}
   1167
   1168	return count;
   1169}
   1170
   1171static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
   1172{
   1173	struct gfs2_bitmap *bi;
   1174	int x;
   1175
   1176	if (rgd->rd_free) {
   1177		for (x = 0; x < rgd->rd_length; x++) {
   1178			bi = rgd->rd_bits + x;
   1179			clear_bit(GBF_FULL, &bi->bi_flags);
   1180		}
   1181	} else {
   1182		for (x = 0; x < rgd->rd_length; x++) {
   1183			bi = rgd->rd_bits + x;
   1184			set_bit(GBF_FULL, &bi->bi_flags);
   1185		}
   1186	}
   1187}
   1188
   1189/**
   1190 * gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps
   1191 * @gh: the glock holder representing the rgrpd to read in
   1192 *
   1193 * Read in all of a Resource Group's header and bitmap blocks.
   1194 * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
   1195 *
   1196 * Returns: errno
   1197 */
   1198
   1199int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh)
   1200{
   1201	struct gfs2_glock *gl = gh->gh_gl;
   1202	struct gfs2_rgrpd *rgd = gl->gl_object;
   1203	struct gfs2_sbd *sdp = rgd->rd_sbd;
   1204	unsigned int length = rgd->rd_length;
   1205	struct gfs2_bitmap *bi;
   1206	unsigned int x, y;
   1207	int error;
   1208
   1209	if (rgd->rd_bits[0].bi_bh != NULL)
   1210		return 0;
   1211
   1212	for (x = 0; x < length; x++) {
   1213		bi = rgd->rd_bits + x;
   1214		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
   1215		if (error)
   1216			goto fail;
   1217	}
   1218
   1219	for (y = length; y--;) {
   1220		bi = rgd->rd_bits + y;
   1221		error = gfs2_meta_wait(sdp, bi->bi_bh);
   1222		if (error)
   1223			goto fail;
   1224		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
   1225					      GFS2_METATYPE_RG)) {
   1226			error = -EIO;
   1227			goto fail;
   1228		}
   1229	}
   1230
   1231	gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
   1232	rgrp_set_bitmap_flags(rgd);
   1233	rgd->rd_flags |= GFS2_RDF_CHECK;
   1234	rgd->rd_free_clone = rgd->rd_free;
   1235	GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
   1236	/* max out the rgrp allocation failure point */
   1237	rgd->rd_extfail_pt = rgd->rd_free;
   1238	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
   1239		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
   1240		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
   1241				     rgd->rd_bits[0].bi_bh->b_data);
   1242	} else if (sdp->sd_args.ar_rgrplvb) {
   1243		if (!gfs2_rgrp_lvb_valid(rgd)){
   1244			gfs2_consist_rgrpd(rgd);
   1245			error = -EIO;
   1246			goto fail;
   1247		}
   1248		if (rgd->rd_rgl->rl_unlinked == 0)
   1249			rgd->rd_flags &= ~GFS2_RDF_CHECK;
   1250	}
   1251	return 0;
   1252
   1253fail:
   1254	while (x--) {
   1255		bi = rgd->rd_bits + x;
   1256		brelse(bi->bi_bh);
   1257		bi->bi_bh = NULL;
   1258		gfs2_assert_warn(sdp, !bi->bi_clone);
   1259	}
   1260	return error;
   1261}
   1262
   1263static int update_rgrp_lvb(struct gfs2_rgrpd *rgd, struct gfs2_holder *gh)
   1264{
   1265	u32 rl_flags;
   1266
   1267	if (!test_bit(GLF_INSTANTIATE_NEEDED, &gh->gh_gl->gl_flags))
   1268		return 0;
   1269
   1270	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
   1271		return gfs2_instantiate(gh);
   1272
   1273	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
   1274	rl_flags &= ~GFS2_RDF_MASK;
   1275	rgd->rd_flags &= GFS2_RDF_MASK;
   1276	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
   1277	if (rgd->rd_rgl->rl_unlinked == 0)
   1278		rgd->rd_flags &= ~GFS2_RDF_CHECK;
   1279	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
   1280	rgrp_set_bitmap_flags(rgd);
   1281	rgd->rd_free_clone = rgd->rd_free;
   1282	GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
   1283	/* max out the rgrp allocation failure point */
   1284	rgd->rd_extfail_pt = rgd->rd_free;
   1285	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
   1286	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
   1287	return 0;
   1288}
   1289
   1290/**
   1291 * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
   1292 * @rgd: The resource group
   1293 *
   1294 */
   1295
   1296void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
   1297{
   1298	int x, length = rgd->rd_length;
   1299
   1300	for (x = 0; x < length; x++) {
   1301		struct gfs2_bitmap *bi = rgd->rd_bits + x;
   1302		if (bi->bi_bh) {
   1303			brelse(bi->bi_bh);
   1304			bi->bi_bh = NULL;
   1305		}
   1306	}
   1307	set_bit(GLF_INSTANTIATE_NEEDED, &rgd->rd_gl->gl_flags);
   1308}
   1309
   1310int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
   1311			     struct buffer_head *bh,
   1312			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
   1313{
   1314	struct super_block *sb = sdp->sd_vfs;
   1315	u64 blk;
   1316	sector_t start = 0;
   1317	sector_t nr_blks = 0;
   1318	int rv = -EIO;
   1319	unsigned int x;
   1320	u32 trimmed = 0;
   1321	u8 diff;
   1322
   1323	for (x = 0; x < bi->bi_bytes; x++) {
   1324		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
   1325		clone += bi->bi_offset;
   1326		clone += x;
   1327		if (bh) {
   1328			const u8 *orig = bh->b_data + bi->bi_offset + x;
   1329			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
   1330		} else {
   1331			diff = ~(*clone | (*clone >> 1));
   1332		}
   1333		diff &= 0x55;
   1334		if (diff == 0)
   1335			continue;
   1336		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
   1337		while(diff) {
   1338			if (diff & 1) {
   1339				if (nr_blks == 0)
   1340					goto start_new_extent;
   1341				if ((start + nr_blks) != blk) {
   1342					if (nr_blks >= minlen) {
   1343						rv = sb_issue_discard(sb,
   1344							start, nr_blks,
   1345							GFP_NOFS, 0);
   1346						if (rv)
   1347							goto fail;
   1348						trimmed += nr_blks;
   1349					}
   1350					nr_blks = 0;
   1351start_new_extent:
   1352					start = blk;
   1353				}
   1354				nr_blks++;
   1355			}
   1356			diff >>= 2;
   1357			blk++;
   1358		}
   1359	}
   1360	if (nr_blks >= minlen) {
   1361		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
   1362		if (rv)
   1363			goto fail;
   1364		trimmed += nr_blks;
   1365	}
   1366	if (ptrimmed)
   1367		*ptrimmed = trimmed;
   1368	return 0;
   1369
   1370fail:
   1371	if (sdp->sd_args.ar_discard)
   1372		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
   1373	sdp->sd_args.ar_discard = 0;
   1374	return rv;
   1375}
   1376
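/*
 * In the loop above, "diff" gets a 1 in an entry's low bit for every block
 * that is free (00) in the bitmap in @bh but still allocated in the clone,
 * i.e. blocks freed since the clone was taken; with no @bh it simply marks
 * every free block.  Consecutive marked blocks are then merged into extents
 * and discarded once an extent reaches @minlen.
 */
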
   1377/**
   1378 * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
   1379 * @filp: Any file on the filesystem
   1380 * @argp: Pointer to the arguments (also used to pass result)
   1381 *
   1382 * Returns: 0 on success, otherwise error code
   1383 */
   1384
   1385int gfs2_fitrim(struct file *filp, void __user *argp)
   1386{
   1387	struct inode *inode = file_inode(filp);
   1388	struct gfs2_sbd *sdp = GFS2_SB(inode);
   1389	struct block_device *bdev = sdp->sd_vfs->s_bdev;
   1390	struct buffer_head *bh;
   1391	struct gfs2_rgrpd *rgd;
   1392	struct gfs2_rgrpd *rgd_end;
   1393	struct gfs2_holder gh;
   1394	struct fstrim_range r;
   1395	int ret = 0;
   1396	u64 amt;
   1397	u64 trimmed = 0;
   1398	u64 start, end, minlen;
   1399	unsigned int x;
   1400	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
   1401
   1402	if (!capable(CAP_SYS_ADMIN))
   1403		return -EPERM;
   1404
   1405	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
   1406		return -EROFS;
   1407
   1408	if (!bdev_max_discard_sectors(bdev))
   1409		return -EOPNOTSUPP;
   1410
   1411	if (copy_from_user(&r, argp, sizeof(r)))
   1412		return -EFAULT;
   1413
   1414	ret = gfs2_rindex_update(sdp);
   1415	if (ret)
   1416		return ret;
   1417
   1418	start = r.start >> bs_shift;
   1419	end = start + (r.len >> bs_shift);
   1420	minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
   1421	minlen = max_t(u64, minlen, bdev_discard_granularity(bdev)) >> bs_shift;
   1422
   1423	if (end <= start || minlen > sdp->sd_max_rg_data)
   1424		return -EINVAL;
   1425
   1426	rgd = gfs2_blk2rgrpd(sdp, start, 0);
   1427	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
   1428
   1429	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
   1430	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
   1431		return -EINVAL; /* start is beyond the end of the fs */
   1432
   1433	while (1) {
   1434
   1435		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
   1436					 LM_FLAG_NODE_SCOPE, &gh);
   1437		if (ret)
   1438			goto out;
   1439
   1440		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
   1441			/* Trim each bitmap in the rgrp */
   1442			for (x = 0; x < rgd->rd_length; x++) {
   1443				struct gfs2_bitmap *bi = rgd->rd_bits + x;
   1444				rgrp_lock_local(rgd);
   1445				ret = gfs2_rgrp_send_discards(sdp,
   1446						rgd->rd_data0, NULL, bi, minlen,
   1447						&amt);
   1448				rgrp_unlock_local(rgd);
   1449				if (ret) {
   1450					gfs2_glock_dq_uninit(&gh);
   1451					goto out;
   1452				}
   1453				trimmed += amt;
   1454			}
   1455
   1456			/* Mark rgrp as having been trimmed */
   1457			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
   1458			if (ret == 0) {
   1459				bh = rgd->rd_bits[0].bi_bh;
   1460				rgrp_lock_local(rgd);
   1461				rgd->rd_flags |= GFS2_RGF_TRIMMED;
   1462				gfs2_trans_add_meta(rgd->rd_gl, bh);
   1463				gfs2_rgrp_out(rgd, bh->b_data);
   1464				rgrp_unlock_local(rgd);
   1465				gfs2_trans_end(sdp);
   1466			}
   1467		}
   1468		gfs2_glock_dq_uninit(&gh);
   1469
   1470		if (rgd == rgd_end)
   1471			break;
   1472
   1473		rgd = gfs2_rgrpd_get_next(rgd);
   1474	}
   1475
   1476out:
   1477	r.len = trimmed << bs_shift;
   1478	if (copy_to_user(argp, &r, sizeof(r)))
   1479		return -EFAULT;
   1480
   1481	return ret;
   1482}
   1483
   1484/**
   1485 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
   1486 * @ip: the inode structure
   1487 *
   1488 */
   1489static void rs_insert(struct gfs2_inode *ip)
   1490{
   1491	struct rb_node **newn, *parent = NULL;
   1492	int rc;
   1493	struct gfs2_blkreserv *rs = &ip->i_res;
   1494	struct gfs2_rgrpd *rgd = rs->rs_rgd;
   1495
   1496	BUG_ON(gfs2_rs_active(rs));
   1497
   1498	spin_lock(&rgd->rd_rsspin);
   1499	newn = &rgd->rd_rstree.rb_node;
   1500	while (*newn) {
   1501		struct gfs2_blkreserv *cur =
   1502			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
   1503
   1504		parent = *newn;
   1505		rc = rs_cmp(rs->rs_start, rs->rs_requested, cur);
   1506		if (rc > 0)
   1507			newn = &((*newn)->rb_right);
   1508		else if (rc < 0)
   1509			newn = &((*newn)->rb_left);
   1510		else {
   1511			spin_unlock(&rgd->rd_rsspin);
   1512			WARN_ON(1);
   1513			return;
   1514		}
   1515	}
   1516
   1517	rb_link_node(&rs->rs_node, parent, newn);
   1518	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
   1519
   1520	/* Do our rgrp accounting for the reservation */
   1521	rgd->rd_requested += rs->rs_requested; /* blocks requested */
   1522	spin_unlock(&rgd->rd_rsspin);
   1523	trace_gfs2_rs(rs, TRACE_RS_INSERT);
   1524}
   1525
   1526/**
   1527 * rgd_free - return the number of free blocks we can allocate
   1528 * @rgd: the resource group
   1529 * @rs: The reservation to free
   1530 *
   1531 * This function returns the number of free blocks for an rgrp.
   1532 * That's the clone-free blocks (blocks that are free, not including those
   1533 * still being used for unlinked files that haven't been deleted.)
   1534 *
   1535 * It also subtracts any blocks reserved by someone else, but does not
   1536 * include free blocks that are still part of our current reservation,
   1537 * because obviously we can (and will) allocate them.
   1538 */
   1539static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
   1540{
   1541	u32 tot_reserved, tot_free;
   1542
   1543	if (WARN_ON_ONCE(rgd->rd_requested < rs->rs_requested))
   1544		return 0;
   1545	tot_reserved = rgd->rd_requested - rs->rs_requested;
   1546
   1547	if (rgd->rd_free_clone < tot_reserved)
   1548		tot_reserved = 0;
   1549
   1550	tot_free = rgd->rd_free_clone - tot_reserved;
   1551
   1552	return tot_free;
   1553}
   1554
   1555/**
   1556 * rg_mblk_search - find a group of multiple free blocks to form a reservation
   1557 * @rgd: the resource group descriptor
   1558 * @ip: pointer to the inode for which we're reserving blocks
   1559 * @ap: the allocation parameters
   1560 *
   1561 */
   1562
   1563static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
   1564			   const struct gfs2_alloc_parms *ap)
   1565{
   1566	struct gfs2_rbm rbm = { .rgd = rgd, };
   1567	u64 goal;
   1568	struct gfs2_blkreserv *rs = &ip->i_res;
   1569	u32 extlen;
   1570	u32 free_blocks, blocks_available;
   1571	int ret;
   1572	struct inode *inode = &ip->i_inode;
   1573
   1574	spin_lock(&rgd->rd_rsspin);
   1575	free_blocks = rgd_free(rgd, rs);
   1576	if (rgd->rd_free_clone < rgd->rd_requested)
   1577		free_blocks = 0;
   1578	blocks_available = rgd->rd_free_clone - rgd->rd_reserved;
   1579	if (rgd == rs->rs_rgd)
   1580		blocks_available += rs->rs_reserved;
   1581	spin_unlock(&rgd->rd_rsspin);
   1582
   1583	if (S_ISDIR(inode->i_mode))
   1584		extlen = 1;
   1585	else {
   1586		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
   1587		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
   1588	}
   1589	if (free_blocks < extlen || blocks_available < extlen)
   1590		return;
   1591
   1592	/* Find bitmap block that contains bits for goal block */
   1593	if (rgrp_contains_block(rgd, ip->i_goal))
   1594		goal = ip->i_goal;
   1595	else
   1596		goal = rgd->rd_last_alloc + rgd->rd_data0;
   1597
   1598	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
   1599		return;
   1600
   1601	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, &ip->i_res, true);
   1602	if (ret == 0) {
   1603		rs->rs_start = gfs2_rbm_to_block(&rbm);
   1604		rs->rs_requested = extlen;
   1605		rs_insert(ip);
   1606	} else {
   1607		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
   1608			rgd->rd_last_alloc = 0;
   1609	}
   1610}
   1611
   1612/**
   1613 * gfs2_next_unreserved_block - Return next block that is not reserved
   1614 * @rgd: The resource group
   1615 * @block: The starting block
   1616 * @length: The required length
   1617 * @ignore_rs: Reservation to ignore
   1618 *
   1619 * If the block does not appear in any reservation, then return the
   1620 * block number unchanged. If it does appear in the reservation, then
   1621 * keep looking through the tree of reservations in order to find the
   1622 * first block number which is not reserved.
   1623 */
   1624
   1625static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
   1626				      u32 length,
   1627				      struct gfs2_blkreserv *ignore_rs)
   1628{
   1629	struct gfs2_blkreserv *rs;
   1630	struct rb_node *n;
   1631	int rc;
   1632
   1633	spin_lock(&rgd->rd_rsspin);
   1634	n = rgd->rd_rstree.rb_node;
   1635	while (n) {
   1636		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
   1637		rc = rs_cmp(block, length, rs);
   1638		if (rc < 0)
   1639			n = n->rb_left;
   1640		else if (rc > 0)
   1641			n = n->rb_right;
   1642		else
   1643			break;
   1644	}
   1645
   1646	if (n) {
   1647		while (rs_cmp(block, length, rs) == 0 && rs != ignore_rs) {
   1648			block = rs->rs_start + rs->rs_requested;
   1649			n = n->rb_right;
   1650			if (n == NULL)
   1651				break;
   1652			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
   1653		}
   1654	}
   1655
   1656	spin_unlock(&rgd->rd_rsspin);
   1657	return block;
   1658}
   1659
   1660/**
   1661 * gfs2_reservation_check_and_update - Check for reservations during block alloc
   1662 * @rbm: The current position in the resource group
   1663 * @rs: Our own reservation
   1664 * @minext: The minimum extent length
   1665 * @maxext: A pointer to the maximum extent structure
   1666 *
   1667 * This checks the current position in the rgrp to see whether there is
   1668 * a reservation covering this block. If not then this function is a
   1669 * no-op. If there is, then the position is moved to the end of the
   1670 * contiguous reservation(s) so that we are pointing at the first
   1671 * non-reserved block.
   1672 *
   1673 * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
   1674 */
   1675
   1676static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
   1677					     struct gfs2_blkreserv *rs,
   1678					     u32 minext,
   1679					     struct gfs2_extent *maxext)
   1680{
   1681	u64 block = gfs2_rbm_to_block(rbm);
   1682	u32 extlen = 1;
   1683	u64 nblock;
   1684
   1685	/*
   1686	 * If we have a minimum extent length, then skip over any extent
   1687	 * which is less than the min extent length in size.
   1688	 */
   1689	if (minext > 1) {
   1690		extlen = gfs2_free_extlen(rbm, minext);
   1691		if (extlen <= maxext->len)
   1692			goto fail;
   1693	}
   1694
   1695	/*
   1696	 * Check the extent which has been found against the reservations
   1697	 * and skip if parts of it are already reserved
   1698	 */
   1699	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, rs);
   1700	if (nblock == block) {
   1701		if (!minext || extlen >= minext)
   1702			return 0;
   1703
   1704		if (extlen > maxext->len) {
   1705			maxext->len = extlen;
   1706			maxext->rbm = *rbm;
   1707		}
   1708	} else {
   1709		u64 len = nblock - block;
   1710		if (len >= (u64)1 << 32)
   1711			return -E2BIG;
   1712		extlen = len;
   1713	}
   1714fail:
   1715	if (gfs2_rbm_add(rbm, extlen))
   1716		return -E2BIG;
   1717	return 1;
   1718}
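
/*
 * Worked trace of the return protocol above, with invented numbers: at
 * block 500 with minext 8, a free extent of 10 blocks that overlaps a
 * reservation ending at block 506 yields nblock = 506 and extlen = 6,
 * so the rbm is advanced by 6 blocks and 1 is returned; a repeat call
 * at block 506 that finds no overlapping reservation and extlen >= 8
 * returns 0, and the allocation can proceed there.
 */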
   1719
   1720/**
   1721 * gfs2_rbm_find - Look for blocks of a particular state
   1722 * @rbm: Value/result starting position and final position
   1723 * @state: The state which we want to find
   1724 * @minext: Pointer to the requested extent length
   1725 *          This is updated to be the actual reservation size.
   1726 * @rs: Our own reservation (NULL to skip checking for reservations)
   1727 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
   1728 *          around until we've reached the starting point.
   1729 *
   1730 * Side effects:
   1731 * - If looking for free blocks, we set GBF_FULL on each bitmap which
   1732 *   has no free blocks in it.
   1733 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
   1734 *   has come up short on a free block search.
   1735 *
   1736 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
   1737 */
   1738
   1739static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
   1740			 struct gfs2_blkreserv *rs, bool nowrap)
   1741{
   1742	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
   1743	struct buffer_head *bh;
   1744	int last_bii;
   1745	u32 offset;
   1746	u8 *buffer;
   1747	bool wrapped = false;
   1748	int ret;
   1749	struct gfs2_bitmap *bi;
   1750	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
   1751
   1752	/*
   1753	 * Determine the last bitmap to search.  If we're not starting at the
   1754	 * beginning of a bitmap, we need to search that bitmap twice to scan
   1755	 * the entire resource group.
   1756	 */
   1757	last_bii = rbm->bii - (rbm->offset == 0);
   1758
   1759	while (1) {
   1760		bi = rbm_bi(rbm);
   1761		if (test_bit(GBF_FULL, &bi->bi_flags) &&
   1762		    (state == GFS2_BLKST_FREE))
   1763			goto next_bitmap;
   1764
   1765		bh = bi->bi_bh;
   1766		buffer = bh->b_data + bi->bi_offset;
   1767		WARN_ON(!buffer_uptodate(bh));
   1768		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
   1769			buffer = bi->bi_clone + bi->bi_offset;
   1770		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
   1771		if (offset == BFITNOENT) {
   1772			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
   1773				set_bit(GBF_FULL, &bi->bi_flags);
   1774			goto next_bitmap;
   1775		}
   1776		rbm->offset = offset;
   1777		if (!rs || !minext)
   1778			return 0;
   1779
   1780		ret = gfs2_reservation_check_and_update(rbm, rs, *minext,
   1781							&maxext);
   1782		if (ret == 0)
   1783			return 0;
   1784		if (ret > 0)
   1785			goto next_iter;
   1786		if (ret == -E2BIG) {
   1787			rbm->bii = 0;
   1788			rbm->offset = 0;
   1789			goto res_covered_end_of_rgrp;
   1790		}
   1791		return ret;
   1792
   1793next_bitmap:	/* Find next bitmap in the rgrp */
   1794		rbm->offset = 0;
   1795		rbm->bii++;
   1796		if (rbm->bii == rbm->rgd->rd_length)
   1797			rbm->bii = 0;
   1798res_covered_end_of_rgrp:
   1799		if (rbm->bii == 0) {
   1800			if (wrapped)
   1801				break;
   1802			wrapped = true;
   1803			if (nowrap)
   1804				break;
   1805		}
   1806next_iter:
   1807		/* Have we scanned the entire resource group? */
   1808		if (wrapped && rbm->bii > last_bii)
   1809			break;
   1810	}
   1811
   1812	if (state != GFS2_BLKST_FREE)
   1813		return -ENOSPC;
   1814
   1815	/* If the extent was too small, and it's smaller than the smallest
   1816	   to have failed before, remember for future reference that it's
   1817	   useless to search this rgrp again for this amount or more. */
   1818	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
   1819	    *minext < rbm->rgd->rd_extfail_pt)
   1820		rbm->rgd->rd_extfail_pt = *minext - 1;
   1821
   1822	/* If the maximum extent we found is big enough to fulfill the
   1823	   minimum requirements, use it anyway. */
   1824	if (maxext.len) {
   1825		*rbm = maxext.rbm;
   1826		*minext = maxext.len;
   1827		return 0;
   1828	}
   1829
   1830	return -ENOSPC;
   1831}
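
/*
 * Illustrative userspace sketch of a wrapping search for a block in a
 * given two-bit state, assuming a single bitmap with four blocks per
 * byte.  find_state() and get2() are invented names; the real search
 * above additionally spans multiple bitmaps and honours GBF_FULL and
 * reservations.  Kept under #if 0 so this translation unit is
 * unaffected.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define NOENT ((uint32_t)~0)

/* Read the two-bit state of block blk. */
static uint8_t get2(const uint8_t *map, uint32_t blk)
{
	return (map[blk / 4] >> ((blk % 4) * 2)) & 3;
}

/* Scan all nblocks blocks starting at goal, wrapping to the start. */
static uint32_t find_state(const uint8_t *map, uint32_t nblocks,
			   uint32_t goal, uint8_t state)
{
	for (uint32_t i = 0; i < nblocks; i++) {
		uint32_t blk = (goal + i) % nblocks;

		if (get2(map, blk) == state)
			return blk;
	}
	return NOENT;
}

int main(void)
{
	/* Blocks 0..7: used used free used | used free used used. */
	const uint8_t map[2] = { 0x45, 0x51 };

	printf("%u\n", find_state(map, 8, 4, 0)); /* 5: hit before the end */
	printf("%u\n", find_state(map, 8, 6, 0)); /* 2: found after wrapping */
	return 0;
}
#endif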
   1832
   1833/**
   1834 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
   1835 * @rgd: The rgrp
   1836 * @last_unlinked: block address of the last dinode we unlinked
   1837 * @skip: block address we should explicitly not unlink
   1838 *
   1839 * Returns: nothing.  Any reclaimable unlinked inodes found have
   1840 *          delete work queued against their iopen glocks.
   1841 */
   1842
   1843static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
   1844{
   1845	u64 block;
   1846	struct gfs2_sbd *sdp = rgd->rd_sbd;
   1847	struct gfs2_glock *gl;
   1848	struct gfs2_inode *ip;
   1849	int error;
   1850	int found = 0;
   1851	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
   1852
   1853	while (1) {
   1854		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
   1855				      true);
   1856		if (error == -ENOSPC)
   1857			break;
   1858		if (WARN_ON_ONCE(error))
   1859			break;
   1860
   1861		block = gfs2_rbm_to_block(&rbm);
   1862		if (gfs2_rbm_from_block(&rbm, block + 1))
   1863			break;
   1864		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
   1865			continue;
   1866		if (block == skip)
   1867			continue;
   1868		*last_unlinked = block;
   1869
   1870		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
   1871		if (error)
   1872			continue;
   1873
   1874		/* If the inode is already in cache, we can ignore it here
   1875		 * because the existing inode disposal code will deal with
   1876		 * it when all refs have gone away. Accessing gl_object like
   1877		 * this is not safe in general. Here it is ok because we do
   1878		 * not dereference the pointer, and we only need an approx
   1879		 * answer to whether it is NULL or not.
   1880		 */
   1881		ip = gl->gl_object;
   1882
   1883		if (ip || !gfs2_queue_delete_work(gl, 0))
   1884			gfs2_glock_put(gl);
   1885		else
   1886			found++;
   1887
   1888		/* Limit reclaim to sensible number of tasks */
   1889		if (found > NR_CPUS)
   1890			return;
   1891	}
   1892
   1893	rgd->rd_flags &= ~GFS2_RDF_CHECK;
   1894	return;
   1895}
   1896
   1897/**
   1898 * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
   1899 * @rgd: The rgrp in question
   1900 * @loops: An indication of how picky we can be (0=very, 1=less so)
   1901 *
   1902 * This function uses the recently added glock statistics in order to
   1903 * figure out whether a particular resource group is suffering from
   1904 * contention from multiple nodes. This is done purely on the basis
   1905 * of timings, since this is the only data we have to work with and
   1906 * our aim here is to reject a resource group which is highly contended
   1907 * but (very important) not to do this too often in order to ensure that
   1908 * we do not end up introducing fragmentation by changing resource
   1909 * groups when not actually required.
   1910 *
   1911 * The calculation is fairly simple, we want to know whether the SRTTB
   1912 * (i.e. smoothed round trip time for blocking operations) to acquire
   1913 * the lock for this rgrp's glock is significantly greater than the
   1914 * time taken for resource groups on average. We introduce a margin in
   1915 * the form of the variable @var which is computed as the sum of the two
   1916 * respective variances, and multiplied by a factor depending on @loops
   1917 * and whether we have a lot of data to base the decision on. This is
   1918 * then tested against the square difference of the means in order to
   1919 * decide whether the result is statistically significant or not.
   1920 *
   1921 * Returns: A boolean verdict on the congestion status
   1922 */
   1923
   1924static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
   1925{
   1926	const struct gfs2_glock *gl = rgd->rd_gl;
   1927	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
   1928	struct gfs2_lkstats *st;
   1929	u64 r_dcount, l_dcount;
   1930	u64 l_srttb, a_srttb = 0;
   1931	s64 srttb_diff;
   1932	u64 sqr_diff;
   1933	u64 var;
   1934	int cpu, nonzero = 0;
   1935
   1936	preempt_disable();
   1937	for_each_present_cpu(cpu) {
   1938		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
   1939		if (st->stats[GFS2_LKS_SRTTB]) {
   1940			a_srttb += st->stats[GFS2_LKS_SRTTB];
   1941			nonzero++;
   1942		}
   1943	}
   1944	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
   1945	if (nonzero)
   1946		do_div(a_srttb, nonzero);
   1947	r_dcount = st->stats[GFS2_LKS_DCOUNT];
   1948	var = st->stats[GFS2_LKS_SRTTVARB] +
   1949	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
   1950	preempt_enable();
   1951
   1952	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
   1953	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
   1954
   1955	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
   1956		return false;
   1957
   1958	srttb_diff = a_srttb - l_srttb;
   1959	sqr_diff = srttb_diff * srttb_diff;
   1960
   1961	var *= 2;
   1962	if (l_dcount < 8 || r_dcount < 8)
   1963		var *= 2;
   1964	if (loops == 1)
   1965		var *= 2;
   1966
   1967	return ((srttb_diff < 0) && (sqr_diff > var));
   1968}
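
/*
 * Worked numeric sketch of the significance test above, with invented
 * values; congested() is an invented name that mirrors only the
 * arithmetic (the per-cpu stat gathering is omitted).  Kept under
 * #if 0 so this translation unit is unaffected.
 */
#if 0
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

static bool congested(uint64_t a_srttb, uint64_t l_srttb, uint64_t var,
		      uint64_t l_dcount, uint64_t r_dcount, int loops)
{
	int64_t srttb_diff;
	uint64_t sqr_diff;

	if (l_dcount < 1 || r_dcount < 1 || a_srttb == 0)
		return false;

	srttb_diff = a_srttb - l_srttb;
	sqr_diff = srttb_diff * srttb_diff;

	var *= 2;				/* base margin */
	if (l_dcount < 8 || r_dcount < 8)	/* few samples: widen */
		var *= 2;
	if (loops == 1)				/* second pass: widen */
		var *= 2;

	return srttb_diff < 0 && sqr_diff > var;
}

int main(void)
{
	/* Local SRTTB 250 vs average 100, variances summing to 800:
	 * (-150)^2 = 22500 > 2 * 800, so the rgrp counts as congested. */
	printf("%d\n", congested(100, 250, 800, 10, 10, 0));

	/* Local 120 vs 100: (-20)^2 = 400 <= 1600, not significant. */
	printf("%d\n", congested(100, 120, 800, 10, 10, 0));
	return 0;
}
#endif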
   1969
   1970/**
   1971 * gfs2_rgrp_used_recently
   1972 * @rs: The block reservation with the rgrp to test
   1973 * @msecs: The time limit in milliseconds
   1974 *
   1975 * Returns: True if the rgrp glock has been idle for longer than @msecs
   1976 */
   1977static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
   1978				    u64 msecs)
   1979{
   1980	u64 tdiff;
   1981
   1982	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
   1983                            rs->rs_rgd->rd_gl->gl_dstamp));
   1984
   1985	return tdiff > (msecs * 1000 * 1000);
   1986}
   1987
   1988static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
   1989{
   1990	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
   1991	u32 skip;
   1992
   1993	get_random_bytes(&skip, sizeof(skip));
   1994	return skip % sdp->sd_rgrps;
   1995}
   1996
   1997static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
   1998{
   1999	struct gfs2_rgrpd *rgd = *pos;
   2000	struct gfs2_sbd *sdp = rgd->rd_sbd;
   2001
   2002	rgd = gfs2_rgrpd_get_next(rgd);
   2003	if (rgd == NULL)
   2004		rgd = gfs2_rgrpd_get_first(sdp);
   2005	*pos = rgd;
   2006	if (rgd != begin) /* If we didn't wrap */
   2007		return true;
   2008	return false;
   2009}
   2010
   2011/**
   2012 * fast_to_acquire - determine if a resource group will be fast to acquire
   2013 * @rgd: The rgrp
   2014 *
   2015 * If this is one of our preferred rgrps, it should be quicker to acquire,
   2016 * because we tried to set ourselves up as dlm lock master.
   2017 */
   2018static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
   2019{
   2020	struct gfs2_glock *gl = rgd->rd_gl;
   2021
   2022	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
   2023	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
   2024	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
   2025		return 1;
   2026	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
   2027		return 1;
   2028	return 0;
   2029}
   2030
   2031/**
   2032 * gfs2_inplace_reserve - Reserve space in the filesystem
   2033 * @ip: the inode to reserve space for
   2034 * @ap: the allocation parameters
   2035 *
   2036 * We try our best to find an rgrp that has at least ap->target blocks
   2037 * available. After a couple of passes (loops == 2), the prospects of finding
   2038 * such an rgrp diminish. At this stage, we return the first rgrp that has
   2039 * at least ap->min_target blocks available.
   2040 *
   2041 * Returns: 0 on success,
   2042 *          -ENOSPC if a suitable rgrp can't be found
   2043 *          errno otherwise
   2044 */
   2045
   2046int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
   2047{
   2048	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
   2049	struct gfs2_rgrpd *begin = NULL;
   2050	struct gfs2_blkreserv *rs = &ip->i_res;
   2051	int error = 0, flags = LM_FLAG_NODE_SCOPE;
   2052	bool rg_locked;
   2053	u64 last_unlinked = NO_BLOCK;
   2054	u32 target = ap->target;
   2055	int loops = 0;
   2056	u32 free_blocks, blocks_available, skip = 0;
   2057
   2058	BUG_ON(rs->rs_reserved);
   2059
   2060	if (sdp->sd_args.ar_rgrplvb)
   2061		flags |= GL_SKIP;
   2062	if (gfs2_assert_warn(sdp, target))
   2063		return -EINVAL;
   2064	if (gfs2_rs_active(rs)) {
   2065		begin = rs->rs_rgd;
   2066	} else if (rs->rs_rgd &&
   2067		   rgrp_contains_block(rs->rs_rgd, ip->i_goal)) {
   2068		begin = rs->rs_rgd;
   2069	} else {
   2070		check_and_update_goal(ip);
   2071		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
   2072	}
   2073	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
   2074		skip = gfs2_orlov_skip(ip);
   2075	if (rs->rs_rgd == NULL)
   2076		return -EBADSLT;
   2077
   2078	while (loops < 3) {
   2079		struct gfs2_rgrpd *rgd;
   2080
   2081		rg_locked = gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl);
   2082		if (rg_locked) {
   2083			rgrp_lock_local(rs->rs_rgd);
   2084		} else {
   2085			if (skip && skip--)
   2086				goto next_rgrp;
   2087			if (!gfs2_rs_active(rs)) {
   2088				if (loops == 0 &&
   2089				    !fast_to_acquire(rs->rs_rgd))
   2090					goto next_rgrp;
   2091				if ((loops < 2) &&
   2092				    gfs2_rgrp_used_recently(rs, 1000) &&
   2093				    gfs2_rgrp_congested(rs->rs_rgd, loops))
   2094					goto next_rgrp;
   2095			}
   2096			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
   2097						   LM_ST_EXCLUSIVE, flags,
   2098						   &ip->i_rgd_gh);
   2099			if (unlikely(error))
   2100				return error;
   2101			rgrp_lock_local(rs->rs_rgd);
   2102			if (!gfs2_rs_active(rs) && (loops < 2) &&
   2103			    gfs2_rgrp_congested(rs->rs_rgd, loops))
   2104				goto skip_rgrp;
   2105			if (sdp->sd_args.ar_rgrplvb) {
   2106				error = update_rgrp_lvb(rs->rs_rgd,
   2107							&ip->i_rgd_gh);
   2108				if (unlikely(error)) {
   2109					rgrp_unlock_local(rs->rs_rgd);
   2110					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
   2111					return error;
   2112				}
   2113			}
   2114		}
   2115
   2116		/* Skip unusable resource groups */
   2117		if ((rs->rs_rgd->rd_flags & (GFS2_RGF_NOALLOC |
   2118						 GFS2_RDF_ERROR)) ||
   2119		    (loops == 0 && target > rs->rs_rgd->rd_extfail_pt))
   2120			goto skip_rgrp;
   2121
   2122		if (sdp->sd_args.ar_rgrplvb) {
   2123			error = gfs2_instantiate(&ip->i_rgd_gh);
   2124			if (error)
   2125				goto skip_rgrp;
   2126		}
   2127
   2128		/* Get a reservation if we don't already have one */
   2129		if (!gfs2_rs_active(rs))
   2130			rg_mblk_search(rs->rs_rgd, ip, ap);
   2131
   2132		/* Skip rgrps when we can't get a reservation on first pass */
   2133		if (!gfs2_rs_active(rs) && (loops < 1))
   2134			goto check_rgrp;
   2135
   2136		/* If rgrp has enough free space, use it */
   2137		rgd = rs->rs_rgd;
   2138		spin_lock(&rgd->rd_rsspin);
   2139		free_blocks = rgd_free(rgd, rs);
   2140		blocks_available = rgd->rd_free_clone - rgd->rd_reserved;
   2141		if (free_blocks < target || blocks_available < target) {
   2142			spin_unlock(&rgd->rd_rsspin);
   2143			goto check_rgrp;
   2144		}
   2145		rs->rs_reserved = ap->target;
   2146		if (rs->rs_reserved > blocks_available)
   2147			rs->rs_reserved = blocks_available;
   2148		rgd->rd_reserved += rs->rs_reserved;
   2149		spin_unlock(&rgd->rd_rsspin);
   2150		rgrp_unlock_local(rs->rs_rgd);
   2151		return 0;
   2152check_rgrp:
   2153		/* Check for unlinked inodes which can be reclaimed */
   2154		if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK)
   2155			try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
   2156					ip->i_no_addr);
   2157skip_rgrp:
   2158		rgrp_unlock_local(rs->rs_rgd);
   2159
   2160		/* Drop reservation, if we couldn't use reserved rgrp */
   2161		if (gfs2_rs_active(rs))
   2162			gfs2_rs_deltree(rs);
   2163
   2164		/* Unlock rgrp if required */
   2165		if (!rg_locked)
   2166			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
   2167next_rgrp:
   2168		/* Find the next rgrp, and continue looking */
   2169		if (gfs2_select_rgrp(&rs->rs_rgd, begin))
   2170			continue;
   2171		if (skip)
   2172			continue;
   2173
   2174		/* If we've scanned all the rgrps, but found no free blocks
   2175		 * then this checks for some less likely conditions before
   2176		 * trying again.
   2177		 */
   2178		loops++;
   2179		/* Check that fs hasn't grown if writing to rindex */
   2180		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
   2181			error = gfs2_ri_update(ip);
   2182			if (error)
   2183				return error;
   2184		}
   2185		/* Flushing the log may release space */
   2186		if (loops == 2) {
   2187			if (ap->min_target)
   2188				target = ap->min_target;
   2189			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
   2190				       GFS2_LFC_INPLACE_RESERVE);
   2191		}
   2192	}
   2193
   2194	return -ENOSPC;
   2195}
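
/*
 * Illustrative userspace sketch of the pass structure above, with
 * invented names (struct rgrp, reserve): passes 0 and 1 are picky
 * about congestion, and pass 2 relaxes the target to min_target.
 * Glocks, reservations and log flushing are omitted.  Kept under
 * #if 0 so this translation unit is unaffected.
 */
#if 0
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct rgrp {
	uint32_t free;
	bool congested;
};

static int reserve(const struct rgrp *r, int n, uint32_t target,
		   uint32_t min_target)
{
	for (int loops = 0; loops < 3; loops++) {
		if (loops == 2 && min_target)
			target = min_target;	/* final fallback */
		for (int i = 0; i < n; i++) {
			if (loops < 2 && r[i].congested)
				continue;	/* be picky early on */
			if (r[i].free >= target)
				return i;
		}
	}
	return -1;	/* -ENOSPC in the real code */
}

int main(void)
{
	const struct rgrp r[] = { { 50, true }, { 20, false } };

	/* Target 40 only fits rgrp 0, which is congested; passes 0 and 1
	 * reject it, pass 2 relaxes to min_target 10 and rgrp 0 wins. */
	printf("%d\n", reserve(r, 2, 40, 10));
	return 0;
}
#endif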
   2196
   2197/**
   2198 * gfs2_inplace_release - release an inplace reservation
   2199 * @ip: the inode the reservation was taken out on
   2200 *
   2201 * Release a reservation made by gfs2_inplace_reserve().
   2202 */
   2203
   2204void gfs2_inplace_release(struct gfs2_inode *ip)
   2205{
   2206	struct gfs2_blkreserv *rs = &ip->i_res;
   2207
   2208	if (rs->rs_reserved) {
   2209		struct gfs2_rgrpd *rgd = rs->rs_rgd;
   2210
   2211		spin_lock(&rgd->rd_rsspin);
   2212		GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved < rs->rs_reserved);
   2213		rgd->rd_reserved -= rs->rs_reserved;
   2214		spin_unlock(&rgd->rd_rsspin);
   2215		rs->rs_reserved = 0;
   2216	}
   2217	if (gfs2_holder_initialized(&ip->i_rgd_gh))
   2218		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
   2219}
   2220
   2221/**
   2222 * gfs2_alloc_extent - allocate an extent from a given bitmap
   2223 * @rbm: the resource group information
   2224 * @dinode: TRUE if the first block we allocate is for a dinode
   2225 * @n: The extent length (value/result)
   2226 *
   2227 * Add the bitmap buffer to the transaction.
   2228 * Set the found bits to the new allocation state (dinode or used).
   2229 */
   2230static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
   2231			     unsigned int *n)
   2232{
   2233	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
   2234	const unsigned int elen = *n;
   2235	u64 block;
   2236	int ret;
   2237
   2238	*n = 1;
   2239	block = gfs2_rbm_to_block(rbm);
   2240	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
   2241	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
   2242	block++;
   2243	while (*n < elen) {
   2244		ret = gfs2_rbm_from_block(&pos, block);
   2245		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
   2246			break;
   2247		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
   2248		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
   2249		(*n)++;
   2250		block++;
   2251	}
   2252}
   2253
   2254/**
   2255 * rgblk_free - Change alloc state of given block(s)
   2256 * @sdp: the filesystem
   2257 * @rgd: the resource group the blocks are in
   2258 * @bstart: the start of a run of blocks to free
   2259 * @blen: the length of the block run (all must lie within ONE RG!)
   2260 * @new_state: GFS2_BLKST_XXX the after-allocation block state
   2261 */
   2262
   2263static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
   2264		       u64 bstart, u32 blen, unsigned char new_state)
   2265{
   2266	struct gfs2_rbm rbm;
   2267	struct gfs2_bitmap *bi, *bi_prev = NULL;
   2268
   2269	rbm.rgd = rgd;
   2270	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
   2271		return;
   2272	while (blen--) {
   2273		bi = rbm_bi(&rbm);
   2274		if (bi != bi_prev) {
   2275			if (!bi->bi_clone) {
   2276				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
   2277						      GFP_NOFS | __GFP_NOFAIL);
   2278				memcpy(bi->bi_clone + bi->bi_offset,
   2279				       bi->bi_bh->b_data + bi->bi_offset,
   2280				       bi->bi_bytes);
   2281			}
   2282			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
   2283			bi_prev = bi;
   2284		}
   2285		gfs2_setbit(&rbm, false, new_state);
   2286		gfs2_rbm_add(&rbm, 1);
   2287	}
   2288}
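
/*
 * Illustrative sketch of the clone-on-first-modification pattern used
 * above: the first change to a bitmap within a transaction snapshots
 * the pristine state into a clone, the live buffer is then modified,
 * and lookups keep using the snapshot while it exists, so just-freed
 * blocks are not handed out again too early.  Names are invented;
 * journalling and GFP details are omitted.  Kept under #if 0 so this
 * translation unit is unaffected.
 */
#if 0
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct bitmap {
	uint8_t *data;		/* live bits, updated by the transaction */
	uint8_t *clone;		/* lazily created pre-change snapshot */
	size_t bytes;
};

/* Snapshot before the first write, then update the live copy only. */
static int bitmap_write(struct bitmap *bi, size_t byte, uint8_t val)
{
	if (!bi->clone) {
		bi->clone = malloc(bi->bytes);
		if (!bi->clone)
			return -1;
		memcpy(bi->clone, bi->data, bi->bytes);
	}
	bi->data[byte] = val;
	return 0;
}

/* Searches prefer the snapshot while one exists. */
static uint8_t bitmap_lookup(const struct bitmap *bi, size_t byte)
{
	return bi->clone ? bi->clone[byte] : bi->data[byte];
}
#endif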
   2289
   2290/**
   2291 * gfs2_rgrp_dump - print out an rgrp
   2292 * @seq: The iterator
   2293 * @rgd: The rgrp in question
   2294 * @fs_id_buf: pointer to file system id (if requested)
   2295 *
   2296 */
   2297
   2298void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
   2299		    const char *fs_id_buf)
   2300{
   2301	struct gfs2_blkreserv *trs;
   2302	const struct rb_node *n;
   2303
   2304	spin_lock(&rgd->rd_rsspin);
   2305	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u q:%u r:%u e:%u\n",
   2306		       fs_id_buf,
   2307		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
   2308		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
   2309		       rgd->rd_requested, rgd->rd_reserved, rgd->rd_extfail_pt);
   2310	if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
   2311		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
   2312
   2313		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
   2314			       be32_to_cpu(rgl->rl_flags),
   2315			       be32_to_cpu(rgl->rl_free),
   2316			       be32_to_cpu(rgl->rl_dinodes));
   2317	}
   2318	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
   2319		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
   2320		dump_rs(seq, trs, fs_id_buf);
   2321	}
   2322	spin_unlock(&rgd->rd_rsspin);
   2323}
   2324
   2325static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
   2326{
   2327	struct gfs2_sbd *sdp = rgd->rd_sbd;
   2328	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
   2329
   2330	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
   2331		(unsigned long long)rgd->rd_addr);
   2332	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
   2333	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
   2334	gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
   2335	rgd->rd_flags |= GFS2_RDF_ERROR;
   2336}
   2337
   2338/**
   2339 * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
   2340 * @ip: The inode we have just allocated blocks for
   2341 * @rbm: The start of the allocated blocks
   2342 * @len: The extent length
   2343 *
   2344 * Adjusts a reservation after an allocation has taken place. If the
   2345 * reservation does not match the allocation, or if it is now empty
   2346 * then it is removed.
   2347 */
   2348
   2349static void gfs2_adjust_reservation(struct gfs2_inode *ip,
   2350				    const struct gfs2_rbm *rbm, unsigned len)
   2351{
   2352	struct gfs2_blkreserv *rs = &ip->i_res;
   2353	struct gfs2_rgrpd *rgd = rbm->rgd;
   2354
   2355	BUG_ON(rs->rs_reserved < len);
   2356	rs->rs_reserved -= len;
   2357	if (gfs2_rs_active(rs)) {
   2358		u64 start = gfs2_rbm_to_block(rbm);
   2359
   2360		if (rs->rs_start == start) {
   2361			unsigned int rlen;
   2362
   2363			rs->rs_start += len;
   2364			rlen = min(rs->rs_requested, len);
   2365			rs->rs_requested -= rlen;
   2366			rgd->rd_requested -= rlen;
   2367			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
   2368			if (rs->rs_start < rgd->rd_data0 + rgd->rd_data &&
   2369			    rs->rs_requested)
   2370				return;
   2371			/* We used up our block reservation, so we should
   2372			   reserve more blocks next time. */
   2373			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
   2374		}
   2375		__rs_deltree(rs);
   2376	}
   2377}
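
/*
 * Worked example for the adjustment above, with invented numbers: a
 * reservation at rs_start 1000 with 16 blocks requested, after an
 * allocation of 4 blocks starting at 1000, becomes a reservation at
 * 1004 with 12 blocks requested; an allocation that does not start at
 * rs_start, or that leaves nothing requested, deletes the reservation.
 */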
   2378
   2379/**
   2380 * gfs2_set_alloc_start - Set starting point for block allocation
   2381 * @rbm: The rbm which will be set to the required location
   2382 * @ip: The gfs2 inode
   2383 * @dinode: Flag to say if allocation includes a new inode
   2384 *
   2385 * This sets the starting point from the reservation if one is active
   2386 * otherwise it falls back to guessing a start point based on the
   2387 * inode's goal block or the last allocation point in the rgrp.
   2388 */
   2389
   2390static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
   2391				 const struct gfs2_inode *ip, bool dinode)
   2392{
   2393	u64 goal;
   2394
   2395	if (gfs2_rs_active(&ip->i_res)) {
   2396		goal = ip->i_res.rs_start;
   2397	} else {
   2398		if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
   2399			goal = ip->i_goal;
   2400		else
   2401			goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
   2402	}
   2403	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
   2404		rbm->bii = 0;
   2405		rbm->offset = 0;
   2406	}
   2407}
   2408
   2409/**
   2410 * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
   2411 * @ip: the inode to allocate the block for
   2412 * @bn: Used to return the starting block number
   2413 * @nblocks: requested number of blocks/extent length (value/result)
   2414 * @dinode: 1 if we're allocating a dinode block, else 0
   2415 * @generation: the generation number of the inode
   2416 *
   2417 * Returns: 0 or error
   2418 */
   2419
   2420int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
   2421		      bool dinode, u64 *generation)
   2422{
   2423	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
   2424	struct buffer_head *dibh;
   2425	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rgd, };
   2426	u64 block; /* block, within the file system scope */
   2427	u32 minext = 1;
   2428	int error = -ENOSPC;
   2429
   2430	BUG_ON(ip->i_res.rs_reserved < *nblocks);
   2431
   2432	rgrp_lock_local(rbm.rgd);
   2433	if (gfs2_rs_active(&ip->i_res)) {
   2434		gfs2_set_alloc_start(&rbm, ip, dinode);
   2435		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, &ip->i_res, false);
   2436	}
   2437	if (error == -ENOSPC) {
   2438		gfs2_set_alloc_start(&rbm, ip, dinode);
   2439		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, NULL, false);
   2440	}
   2441
   2442	/* Since all blocks are reserved in advance, this shouldn't happen */
   2443	if (error) {
   2444		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
   2445			(unsigned long long)ip->i_no_addr, error, *nblocks,
   2446			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
   2447			rbm.rgd->rd_extfail_pt);
   2448		goto rgrp_error;
   2449	}
   2450
   2451	gfs2_alloc_extent(&rbm, dinode, nblocks);
   2452	block = gfs2_rbm_to_block(&rbm);
   2453	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
   2454	if (!dinode) {
   2455		ip->i_goal = block + *nblocks - 1;
   2456		error = gfs2_meta_inode_buffer(ip, &dibh);
   2457		if (error == 0) {
   2458			struct gfs2_dinode *di =
   2459				(struct gfs2_dinode *)dibh->b_data;
   2460			gfs2_trans_add_meta(ip->i_gl, dibh);
   2461			di->di_goal_meta = di->di_goal_data =
   2462				cpu_to_be64(ip->i_goal);
   2463			brelse(dibh);
   2464		}
   2465	}
   2466	spin_lock(&rbm.rgd->rd_rsspin);
   2467	gfs2_adjust_reservation(ip, &rbm, *nblocks);
   2468	if (rbm.rgd->rd_free < *nblocks || rbm.rgd->rd_reserved < *nblocks) {
   2469		fs_warn(sdp, "nblocks=%u\n", *nblocks);
   2470		spin_unlock(&rbm.rgd->rd_rsspin);
   2471		goto rgrp_error;
   2472	}
   2473	GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_reserved < *nblocks);
   2474	GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free_clone < *nblocks);
   2475	GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free < *nblocks);
   2476	rbm.rgd->rd_reserved -= *nblocks;
   2477	rbm.rgd->rd_free_clone -= *nblocks;
   2478	rbm.rgd->rd_free -= *nblocks;
   2479	spin_unlock(&rbm.rgd->rd_rsspin);
   2480	if (dinode) {
   2481		rbm.rgd->rd_dinodes++;
   2482		*generation = rbm.rgd->rd_igeneration++;
   2483		if (*generation == 0)
   2484			*generation = rbm.rgd->rd_igeneration++;
   2485	}
   2486
   2487	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
   2488	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
   2489	rgrp_unlock_local(rbm.rgd);
   2490
   2491	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
   2492	if (dinode)
   2493		gfs2_trans_remove_revoke(sdp, block, *nblocks);
   2494
   2495	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
   2496
   2497	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
   2498			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
   2499	*bn = block;
   2500	return 0;
   2501
   2502rgrp_error:
   2503	rgrp_unlock_local(rbm.rgd);
   2504	gfs2_rgrp_error(rbm.rgd);
   2505	return -EIO;
   2506}
   2507
   2508/**
   2509 * __gfs2_free_blocks - free a contiguous run of block(s)
   2510 * @ip: the inode these blocks are being freed from
   2511 * @rgd: the resource group the blocks are in
   2512 * @bstart: first block of a run of contiguous blocks
   2513 * @blen: the length of the block run
   2514 * @meta: 1 if the blocks represent metadata
   2515 *
   2516 */
   2517
   2518void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
   2519			u64 bstart, u32 blen, int meta)
   2520{
   2521	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
   2522
   2523	rgrp_lock_local(rgd);
   2524	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
   2525	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
   2526	rgd->rd_free += blen;
   2527	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
   2528	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
   2529	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
   2530	rgrp_unlock_local(rgd);
   2531
   2532	/* Directories keep their data in the metadata address space */
   2533	if (meta || ip->i_depth || gfs2_is_jdata(ip))
   2534		gfs2_journal_wipe(ip, bstart, blen);
   2535}
   2536
   2537/**
   2538 * gfs2_free_meta - free a contiguous run of metadata block(s)
   2539 * @ip: the inode these blocks are being freed from
   2540 * @rgd: the resource group the blocks are in
   2541 * @bstart: first block of a run of contiguous blocks
   2542 * @blen: the length of the block run
   2543 *
   2544 */
   2545
   2546void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
   2547		    u64 bstart, u32 blen)
   2548{
   2549	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
   2550
   2551	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
   2552	gfs2_statfs_change(sdp, 0, +blen, 0);
   2553	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
   2554}
   2555
   2556void gfs2_unlink_di(struct inode *inode)
   2557{
   2558	struct gfs2_inode *ip = GFS2_I(inode);
   2559	struct gfs2_sbd *sdp = GFS2_SB(inode);
   2560	struct gfs2_rgrpd *rgd;
   2561	u64 blkno = ip->i_no_addr;
   2562
   2563	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
   2564	if (!rgd)
   2565		return;
   2566	rgrp_lock_local(rgd);
   2567	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
   2568	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
   2569	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
   2570	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
   2571	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
   2572	rgrp_unlock_local(rgd);
   2573}
   2574
   2575void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
   2576{
   2577	struct gfs2_sbd *sdp = rgd->rd_sbd;
   2578
   2579	rgrp_lock_local(rgd);
   2580	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
   2581	if (!rgd->rd_dinodes)
   2582		gfs2_consist_rgrpd(rgd);
   2583	rgd->rd_dinodes--;
   2584	rgd->rd_free++;
   2585
   2586	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
   2587	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
   2588	rgrp_unlock_local(rgd);
   2589	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
   2590
   2591	gfs2_statfs_change(sdp, 0, +1, -1);
   2592	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
   2593	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
   2594	gfs2_journal_wipe(ip, ip->i_no_addr, 1);
   2595}
   2596
   2597/**
   2598 * gfs2_check_blk_type - Check the type of a block
   2599 * @sdp: The superblock
   2600 * @no_addr: The block number to check
   2601 * @type: The block type we are looking for
   2602 *
   2603 * The inode glock of @no_addr must be held.  The @type to check for is either
   2604 * GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; checking for type GFS2_BLKST_FREE
   2605 * or GFS2_BLKST_USED would make no sense.
   2606 *
   2607 * Returns: 0 if the block type matches the expected type
   2608 *          -ESTALE if it doesn't match
   2609 *          or -ve errno if something went wrong while checking
   2610 */
   2611
   2612int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
   2613{
   2614	struct gfs2_rgrpd *rgd;
   2615	struct gfs2_holder rgd_gh;
   2616	struct gfs2_rbm rbm;
   2617	int error = -EINVAL;
   2618
   2619	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
   2620	if (!rgd)
   2621		goto fail;
   2622
   2623	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
   2624	if (error)
   2625		goto fail;
   2626
   2627	rbm.rgd = rgd;
   2628	error = gfs2_rbm_from_block(&rbm, no_addr);
   2629	if (!WARN_ON_ONCE(error)) {
   2630		/*
   2631		 * No need to take the local resource group lock here; the
   2632		 * inode glock of @no_addr provides the necessary
   2633		 * synchronization in case the block is an inode.  (In case
   2634		 * the block is not an inode, the block type will not match
   2635		 * the @type we are looking for.)
   2636		 */
   2637		if (gfs2_testbit(&rbm, false) != type)
   2638			error = -ESTALE;
   2639	}
   2640
   2641	gfs2_glock_dq_uninit(&rgd_gh);
   2642
   2643fail:
   2644	return error;
   2645}
   2646
   2647/**
   2648 * gfs2_rlist_add - add a RG to a list of RGs
   2649 * @ip: the inode
   2650 * @rlist: the list of resource groups
   2651 * @block: the block
   2652 *
   2653 * Figure out what RG a block belongs to and add that RG to the list
   2654 *
   2655 * FIXME: Don't use NOFAIL
   2656 *
   2657 */
   2658
   2659void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
   2660		    u64 block)
   2661{
   2662	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
   2663	struct gfs2_rgrpd *rgd;
   2664	struct gfs2_rgrpd **tmp;
   2665	unsigned int new_space;
   2666	unsigned int x;
   2667
   2668	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
   2669		return;
   2670
   2671	/*
   2672	 * The resource group last accessed is kept in the last position.
   2673	 */
   2674
   2675	if (rlist->rl_rgrps) {
   2676		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
   2677		if (rgrp_contains_block(rgd, block))
   2678			return;
   2679		rgd = gfs2_blk2rgrpd(sdp, block, 1);
   2680	} else {
   2681		rgd = ip->i_res.rs_rgd;
   2682		if (!rgd || !rgrp_contains_block(rgd, block))
   2683			rgd = gfs2_blk2rgrpd(sdp, block, 1);
   2684	}
   2685
   2686	if (!rgd) {
   2687		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
   2688		       (unsigned long long)block);
   2689		return;
   2690	}
   2691
   2692	for (x = 0; x < rlist->rl_rgrps; x++) {
   2693		if (rlist->rl_rgd[x] == rgd) {
   2694			swap(rlist->rl_rgd[x],
   2695			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
   2696			return;
   2697		}
   2698	}
   2699
   2700	if (rlist->rl_rgrps == rlist->rl_space) {
   2701		new_space = rlist->rl_space + 10;
   2702
   2703		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
   2704			      GFP_NOFS | __GFP_NOFAIL);
   2705
   2706		if (rlist->rl_rgd) {
   2707			memcpy(tmp, rlist->rl_rgd,
   2708			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
   2709			kfree(rlist->rl_rgd);
   2710		}
   2711
   2712		rlist->rl_space = new_space;
   2713		rlist->rl_rgd = tmp;
   2714	}
   2715
   2716	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
   2717}
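
/*
 * Illustrative userspace sketch of the list discipline above, with
 * invented names: the most recently used rgrp sits in the last slot, a
 * repeated rgrp is swapped to the end, and the array grows in fixed
 * steps of 10.  Pointer equality stands in for rgrp_contains_block(),
 * and realloc() error handling replaces __GFP_NOFAIL.  Kept under
 * #if 0 so this translation unit is unaffected.
 */
#if 0
#include <stdlib.h>

struct rlist {
	void **rgd;
	unsigned int rgrps;	/* slots in use */
	unsigned int space;	/* slots allocated */
};

static int rlist_add(struct rlist *rl, void *rgd)
{
	/* Fast path: same rgrp as the previous call. */
	if (rl->rgrps && rl->rgd[rl->rgrps - 1] == rgd)
		return 0;

	/* Already present: move it into the last (MRU) slot. */
	for (unsigned int x = 0; x < rl->rgrps; x++) {
		if (rl->rgd[x] == rgd) {
			void *tmp = rl->rgd[x];

			rl->rgd[x] = rl->rgd[rl->rgrps - 1];
			rl->rgd[rl->rgrps - 1] = tmp;
			return 0;
		}
	}

	if (rl->rgrps == rl->space) {
		void **tmp = realloc(rl->rgd,
				     (rl->space + 10) * sizeof(*tmp));

		if (!tmp)
			return -1;
		rl->rgd = tmp;
		rl->space += 10;
	}
	rl->rgd[rl->rgrps++] = rgd;
	return 0;
}
#endif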
   2718
   2719/**
   2720 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
   2721 *      and initialize an array of glock holders for them
   2722 * @rlist: the list of resource groups
   2723 *
   2724 * FIXME: Don't use NOFAIL
   2725 *
   2726 */
   2727
   2728void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
   2729{
   2730	unsigned int x;
   2731
   2732	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
   2733				      sizeof(struct gfs2_holder),
   2734				      GFP_NOFS | __GFP_NOFAIL);
   2735	for (x = 0; x < rlist->rl_rgrps; x++)
   2736		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, LM_ST_EXCLUSIVE,
   2737				 LM_FLAG_NODE_SCOPE, &rlist->rl_ghs[x]);
   2738}
   2739
   2740/**
   2741 * gfs2_rlist_free - free a resource group list
   2742 * @rlist: the list of resource groups
   2743 *
   2744 */
   2745
   2746void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
   2747{
   2748	unsigned int x;
   2749
   2750	kfree(rlist->rl_rgd);
   2751
   2752	if (rlist->rl_ghs) {
   2753		for (x = 0; x < rlist->rl_rgrps; x++)
   2754			gfs2_holder_uninit(&rlist->rl_ghs[x]);
   2755		kfree(rlist->rl_ghs);
   2756		rlist->rl_ghs = NULL;
   2757	}
   2758}
   2759
   2760void rgrp_lock_local(struct gfs2_rgrpd *rgd)
   2761{
   2762	mutex_lock(&rgd->rd_mutex);
   2763}
   2764
   2765void rgrp_unlock_local(struct gfs2_rgrpd *rgd)
   2766{
   2767	mutex_unlock(&rgd->rd_mutex);
   2768}