cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

jfs_dmap.c (112293B)


      1// SPDX-License-Identifier: GPL-2.0-or-later
      2/*
      3 *   Copyright (C) International Business Machines Corp., 2000-2004
      4 *   Portions Copyright (C) Tino Reichardt, 2012
      5 */
      6
      7#include <linux/fs.h>
      8#include <linux/slab.h>
      9#include "jfs_incore.h"
     10#include "jfs_superblock.h"
     11#include "jfs_dmap.h"
     12#include "jfs_imap.h"
     13#include "jfs_lock.h"
     14#include "jfs_metapage.h"
     15#include "jfs_debug.h"
     16#include "jfs_discard.h"
     17
     18/*
     19 *	SERIALIZATION of the Block Allocation Map.
     20 *
     21 *	the working state of the block allocation map is accessed in
     22 *	two directions:
     23 *
     24 *	1) allocation and free requests that start at the dmap
     25 *	   level and move up through the dmap control pages (i.e.
     26 *	   the vast majority of requests).
     27 *
     28 *	2) allocation requests that start at dmap control page
     29 *	   level and work down towards the dmaps.
     30 *
     31 *	the serialization scheme used here is as follows.
     32 *
     33 *	requests which start at the bottom are serialized against each
     34 *	other through buffers and each requests holds onto its buffers
     35 *	as it works it way up from a single dmap to the required level
     36 *	of dmap control page.
     37 *	requests that start at the top are serialized against each other
     38 *	and request that start from the bottom by the multiple read/single
     39 *	write inode lock of the bmap inode. requests starting at the top
     40 *	take this lock in write mode while request starting at the bottom
     41 *	take the lock in read mode.  a single top-down request may proceed
     42 *	exclusively while multiple bottoms-up requests may proceed
     43 *	simultaneously (under the protection of busy buffers).
     44 *
     45 *	in addition to information found in dmaps and dmap control pages,
     46 *	the working state of the block allocation map also includes read/
     47 *	write information maintained in the bmap descriptor (i.e. total
     48 *	free block count, allocation group level free block counts).
     49 *	a single exclusive lock (BMAP_LOCK) is used to guard this information
     50 *	in the face of multiple-bottoms up requests.
     51 *	(lock ordering: IREAD_LOCK, BMAP_LOCK);
     52 *
     53 *	accesses to the persistent state of the block allocation map (limited
     54 *	to the persistent bitmaps in dmaps) is guarded by (busy) buffers.
     55 */
     56
     57#define BMAP_LOCK_INIT(bmp)	mutex_init(&bmp->db_bmaplock)
     58#define BMAP_LOCK(bmp)		mutex_lock(&bmp->db_bmaplock)
     59#define BMAP_UNLOCK(bmp)	mutex_unlock(&bmp->db_bmaplock)
     60
     61/*
     62 * forward references
     63 */
     64static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
     65			int nblocks);
     66static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
     67static int dbBackSplit(dmtree_t * tp, int leafno);
     68static int dbJoin(dmtree_t * tp, int leafno, int newval);
     69static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
     70static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
     71		    int level);
     72static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
     73static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
     74		       int nblocks);
     75static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno,
     76		       int nblocks,
     77		       int l2nb, s64 * results);
     78static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
     79		       int nblocks);
     80static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks,
     81			  int l2nb,
     82			  s64 * results);
     83static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb,
     84		     s64 * results);
     85static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
     86		      s64 * results);
     87static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
     88static int dbFindBits(u32 word, int l2nb);
     89static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
     90static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
     91static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
     92		      int nblocks);
     93static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
     94		      int nblocks);
     95static int dbMaxBud(u8 * cp);
     96static int blkstol2(s64 nb);
     97
     98static int cntlz(u32 value);
     99static int cnttz(u32 word);
    100
    101static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
    102			 int nblocks);
    103static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks);
    104static int dbInitDmapTree(struct dmap * dp);
    105static int dbInitTree(struct dmaptree * dtp);
    106static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i);
    107static int dbGetL2AGSize(s64 nblocks);
    108
    109/*
    110 *	buddy table
    111 *
    112 * table used for determining buddy sizes within characters of
    113 * dmap bitmap words.  the characters themselves serve as indexes
    114 * into the table, with the table elements yielding the maximum
    115 * binary buddy of free bits within the character.
    116 */
    117static const s8 budtab[256] = {
    118	3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    119	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    120	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    121	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    122	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    123	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
    124	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
    125	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
    126	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    127	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
    128	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
    129	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
    130	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    131	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
    132	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
    133	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
    134};
    135
    136/*
    137 * NAME:	dbMount()
    138 *
    139 * FUNCTION:	initializate the block allocation map.
    140 *
    141 *		memory is allocated for the in-core bmap descriptor and
    142 *		the in-core descriptor is initialized from disk.
    143 *
    144 * PARAMETERS:
    145 *	ipbmap	- pointer to in-core inode for the block map.
    146 *
    147 * RETURN VALUES:
    148 *	0	- success
    149 *	-ENOMEM	- insufficient memory
    150 *	-EIO	- i/o error
    151 *	-EINVAL - wrong bmap data
    152 */
    153int dbMount(struct inode *ipbmap)
    154{
    155	struct bmap *bmp;
    156	struct dbmap_disk *dbmp_le;
    157	struct metapage *mp;
    158	int i;
    159
    160	/*
    161	 * allocate/initialize the in-memory bmap descriptor
    162	 */
    163	/* allocate memory for the in-memory bmap descriptor */
    164	bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL);
    165	if (bmp == NULL)
    166		return -ENOMEM;
    167
    168	/* read the on-disk bmap descriptor. */
    169	mp = read_metapage(ipbmap,
    170			   BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
    171			   PSIZE, 0);
    172	if (mp == NULL) {
    173		kfree(bmp);
    174		return -EIO;
    175	}
    176
    177	/* copy the on-disk bmap descriptor to its in-memory version. */
    178	dbmp_le = (struct dbmap_disk *) mp->data;
    179	bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
    180	bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
    181	bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
    182	bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
    183	if (!bmp->db_numag) {
    184		release_metapage(mp);
    185		kfree(bmp);
    186		return -EINVAL;
    187	}
    188
    189	bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
    190	bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
    191	bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
    192	bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
    193	bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
    194	bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
    195	bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
    196	bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
    197	for (i = 0; i < MAXAG; i++)
    198		bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
    199	bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
    200	bmp->db_maxfreebud = dbmp_le->dn_maxfreebud;
    201
    202	/* release the buffer. */
    203	release_metapage(mp);
    204
    205	/* bind the bmap inode and the bmap descriptor to each other. */
    206	bmp->db_ipbmap = ipbmap;
    207	JFS_SBI(ipbmap->i_sb)->bmap = bmp;
    208
    209	memset(bmp->db_active, 0, sizeof(bmp->db_active));
    210
    211	/*
    212	 * allocate/initialize the bmap lock
    213	 */
    214	BMAP_LOCK_INIT(bmp);
    215
    216	return (0);
    217}
    218
    219
    220/*
    221 * NAME:	dbUnmount()
    222 *
    223 * FUNCTION:	terminate the block allocation map in preparation for
    224 *		file system unmount.
    225 *
    226 *		the in-core bmap descriptor is written to disk and
    227 *		the memory for this descriptor is freed.
    228 *
    229 * PARAMETERS:
    230 *	ipbmap	- pointer to in-core inode for the block map.
    231 *
    232 * RETURN VALUES:
    233 *	0	- success
    234 *	-EIO	- i/o error
    235 */
    236int dbUnmount(struct inode *ipbmap, int mounterror)
    237{
    238	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
    239
    240	if (!(mounterror || isReadOnly(ipbmap)))
    241		dbSync(ipbmap);
    242
    243	/*
    244	 * Invalidate the page cache buffers
    245	 */
    246	truncate_inode_pages(ipbmap->i_mapping, 0);
    247
    248	/* free the memory for the in-memory bmap. */
    249	kfree(bmp);
    250
    251	return (0);
    252}
    253
    254/*
    255 *	dbSync()
    256 */
    257int dbSync(struct inode *ipbmap)
    258{
    259	struct dbmap_disk *dbmp_le;
    260	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
    261	struct metapage *mp;
    262	int i;
    263
    264	/*
    265	 * write bmap global control page
    266	 */
    267	/* get the buffer for the on-disk bmap descriptor. */
    268	mp = read_metapage(ipbmap,
    269			   BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
    270			   PSIZE, 0);
    271	if (mp == NULL) {
    272		jfs_err("dbSync: read_metapage failed!");
    273		return -EIO;
    274	}
    275	/* copy the in-memory version of the bmap to the on-disk version */
    276	dbmp_le = (struct dbmap_disk *) mp->data;
    277	dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize);
    278	dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree);
    279	dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage);
    280	dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag);
    281	dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel);
    282	dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
    283	dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
    284	dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
    285	dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
    286	dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
    287	dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
    288	dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
    289	for (i = 0; i < MAXAG; i++)
    290		dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]);
    291	dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize);
    292	dbmp_le->dn_maxfreebud = bmp->db_maxfreebud;
    293
    294	/* write the buffer */
    295	write_metapage(mp);
    296
    297	/*
    298	 * write out dirty pages of bmap
    299	 */
    300	filemap_write_and_wait(ipbmap->i_mapping);
    301
    302	diWriteSpecial(ipbmap, 0);
    303
    304	return (0);
    305}
    306
    307/*
    308 * NAME:	dbFree()
    309 *
    310 * FUNCTION:	free the specified block range from the working block
    311 *		allocation map.
    312 *
    313 *		the blocks will be free from the working map one dmap
    314 *		at a time.
    315 *
    316 * PARAMETERS:
    317 *	ip	- pointer to in-core inode;
    318 *	blkno	- starting block number to be freed.
    319 *	nblocks	- number of blocks to be freed.
    320 *
    321 * RETURN VALUES:
    322 *	0	- success
    323 *	-EIO	- i/o error
    324 */
    325int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
    326{
    327	struct metapage *mp;
    328	struct dmap *dp;
    329	int nb, rc;
    330	s64 lblkno, rem;
    331	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
    332	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
    333	struct super_block *sb = ipbmap->i_sb;
    334
    335	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
    336
    337	/* block to be freed better be within the mapsize. */
    338	if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
    339		IREAD_UNLOCK(ipbmap);
    340		printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
    341		       (unsigned long long) blkno,
    342		       (unsigned long long) nblocks);
    343		jfs_error(ip->i_sb, "block to be freed is outside the map\n");
    344		return -EIO;
    345	}
    346
    347	/**
    348	 * TRIM the blocks, when mounted with discard option
    349	 */
    350	if (JFS_SBI(sb)->flag & JFS_DISCARD)
    351		if (JFS_SBI(sb)->minblks_trim <= nblocks)
    352			jfs_issue_discard(ipbmap, blkno, nblocks);
    353
    354	/*
    355	 * free the blocks a dmap at a time.
    356	 */
    357	mp = NULL;
    358	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
    359		/* release previous dmap if any */
    360		if (mp) {
    361			write_metapage(mp);
    362		}
    363
    364		/* get the buffer for the current dmap. */
    365		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
    366		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
    367		if (mp == NULL) {
    368			IREAD_UNLOCK(ipbmap);
    369			return -EIO;
    370		}
    371		dp = (struct dmap *) mp->data;
    372
    373		/* determine the number of blocks to be freed from
    374		 * this dmap.
    375		 */
    376		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
    377
    378		/* free the blocks. */
    379		if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
    380			jfs_error(ip->i_sb, "error in block map\n");
    381			release_metapage(mp);
    382			IREAD_UNLOCK(ipbmap);
    383			return (rc);
    384		}
    385	}
    386
    387	/* write the last buffer. */
    388	if (mp)
    389		write_metapage(mp);
    390
    391	IREAD_UNLOCK(ipbmap);
    392
    393	return (0);
    394}
    395
    396
    397/*
    398 * NAME:	dbUpdatePMap()
    399 *
    400 * FUNCTION:	update the allocation state (free or allocate) of the
    401 *		specified block range in the persistent block allocation map.
    402 *
    403 *		the blocks will be updated in the persistent map one
    404 *		dmap at a time.
    405 *
    406 * PARAMETERS:
    407 *	ipbmap	- pointer to in-core inode for the block map.
    408 *	free	- 'true' if block range is to be freed from the persistent
    409 *		  map; 'false' if it is to be allocated.
    410 *	blkno	- starting block number of the range.
    411 *	nblocks	- number of contiguous blocks in the range.
    412 *	tblk	- transaction block;
    413 *
    414 * RETURN VALUES:
    415 *	0	- success
    416 *	-EIO	- i/o error
    417 */
    418int
    419dbUpdatePMap(struct inode *ipbmap,
    420	     int free, s64 blkno, s64 nblocks, struct tblock * tblk)
    421{
    422	int nblks, dbitno, wbitno, rbits;
    423	int word, nbits, nwords;
    424	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
    425	s64 lblkno, rem, lastlblkno;
    426	u32 mask;
    427	struct dmap *dp;
    428	struct metapage *mp;
    429	struct jfs_log *log;
    430	int lsn, difft, diffp;
    431	unsigned long flags;
    432
    433	/* the blocks better be within the mapsize. */
    434	if (blkno + nblocks > bmp->db_mapsize) {
    435		printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
    436		       (unsigned long long) blkno,
    437		       (unsigned long long) nblocks);
    438		jfs_error(ipbmap->i_sb, "blocks are outside the map\n");
    439		return -EIO;
    440	}
    441
    442	/* compute delta of transaction lsn from log syncpt */
    443	lsn = tblk->lsn;
    444	log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
    445	logdiff(difft, lsn, log);
    446
    447	/*
    448	 * update the block state a dmap at a time.
    449	 */
    450	mp = NULL;
    451	lastlblkno = 0;
    452	for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
    453		/* get the buffer for the current dmap. */
    454		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
    455		if (lblkno != lastlblkno) {
    456			if (mp) {
    457				write_metapage(mp);
    458			}
    459
    460			mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
    461					   0);
    462			if (mp == NULL)
    463				return -EIO;
    464			metapage_wait_for_io(mp);
    465		}
    466		dp = (struct dmap *) mp->data;
    467
    468		/* determine the bit number and word within the dmap of
    469		 * the starting block.  also determine how many blocks
    470		 * are to be updated within this dmap.
    471		 */
    472		dbitno = blkno & (BPERDMAP - 1);
    473		word = dbitno >> L2DBWORD;
    474		nblks = min(rem, (s64)BPERDMAP - dbitno);
    475
    476		/* update the bits of the dmap words. the first and last
    477		 * words may only have a subset of their bits updated. if
    478		 * this is the case, we'll work against that word (i.e.
    479		 * partial first and/or last) only in a single pass.  a
    480		 * single pass will also be used to update all words that
    481		 * are to have all their bits updated.
    482		 */
    483		for (rbits = nblks; rbits > 0;
    484		     rbits -= nbits, dbitno += nbits) {
    485			/* determine the bit number within the word and
    486			 * the number of bits within the word.
    487			 */
    488			wbitno = dbitno & (DBWORD - 1);
    489			nbits = min(rbits, DBWORD - wbitno);
    490
    491			/* check if only part of the word is to be updated. */
    492			if (nbits < DBWORD) {
    493				/* update (free or allocate) the bits
    494				 * in this word.
    495				 */
    496				mask =
    497				    (ONES << (DBWORD - nbits) >> wbitno);
    498				if (free)
    499					dp->pmap[word] &=
    500					    cpu_to_le32(~mask);
    501				else
    502					dp->pmap[word] |=
    503					    cpu_to_le32(mask);
    504
    505				word += 1;
    506			} else {
    507				/* one or more words are to have all
    508				 * their bits updated.  determine how
    509				 * many words and how many bits.
    510				 */
    511				nwords = rbits >> L2DBWORD;
    512				nbits = nwords << L2DBWORD;
    513
    514				/* update (free or allocate) the bits
    515				 * in these words.
    516				 */
    517				if (free)
    518					memset(&dp->pmap[word], 0,
    519					       nwords * 4);
    520				else
    521					memset(&dp->pmap[word], (int) ONES,
    522					       nwords * 4);
    523
    524				word += nwords;
    525			}
    526		}
    527
    528		/*
    529		 * update dmap lsn
    530		 */
    531		if (lblkno == lastlblkno)
    532			continue;
    533
    534		lastlblkno = lblkno;
    535
    536		LOGSYNC_LOCK(log, flags);
    537		if (mp->lsn != 0) {
    538			/* inherit older/smaller lsn */
    539			logdiff(diffp, mp->lsn, log);
    540			if (difft < diffp) {
    541				mp->lsn = lsn;
    542
    543				/* move bp after tblock in logsync list */
    544				list_move(&mp->synclist, &tblk->synclist);
    545			}
    546
    547			/* inherit younger/larger clsn */
    548			logdiff(difft, tblk->clsn, log);
    549			logdiff(diffp, mp->clsn, log);
    550			if (difft > diffp)
    551				mp->clsn = tblk->clsn;
    552		} else {
    553			mp->log = log;
    554			mp->lsn = lsn;
    555
    556			/* insert bp after tblock in logsync list */
    557			log->count++;
    558			list_add(&mp->synclist, &tblk->synclist);
    559
    560			mp->clsn = tblk->clsn;
    561		}
    562		LOGSYNC_UNLOCK(log, flags);
    563	}
    564
    565	/* write the last buffer. */
    566	if (mp) {
    567		write_metapage(mp);
    568	}
    569
    570	return (0);
    571}
    572
    573
    574/*
    575 * NAME:	dbNextAG()
    576 *
    577 * FUNCTION:	find the preferred allocation group for new allocations.
    578 *
    579 *		Within the allocation groups, we maintain a preferred
    580 *		allocation group which consists of a group with at least
    581 *		average free space.  It is the preferred group that we target
    582 *		new inode allocation towards.  The tie-in between inode
    583 *		allocation and block allocation occurs as we allocate the
    584 *		first (data) block of an inode and specify the inode (block)
    585 *		as the allocation hint for this block.
    586 *
    587 *		We try to avoid having more than one open file growing in
    588 *		an allocation group, as this will lead to fragmentation.
    589 *		This differs from the old OS/2 method of trying to keep
    590 *		empty ags around for large allocations.
    591 *
    592 * PARAMETERS:
    593 *	ipbmap	- pointer to in-core inode for the block map.
    594 *
    595 * RETURN VALUES:
    596 *	the preferred allocation group number.
    597 */
    598int dbNextAG(struct inode *ipbmap)
    599{
    600	s64 avgfree;
    601	int agpref;
    602	s64 hwm = 0;
    603	int i;
    604	int next_best = -1;
    605	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
    606
    607	BMAP_LOCK(bmp);
    608
    609	/* determine the average number of free blocks within the ags. */
    610	avgfree = (u32)bmp->db_nfree / bmp->db_numag;
    611
    612	/*
    613	 * if the current preferred ag does not have an active allocator
    614	 * and has at least average freespace, return it
    615	 */
    616	agpref = bmp->db_agpref;
    617	if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
    618	    (bmp->db_agfree[agpref] >= avgfree))
    619		goto unlock;
    620
    621	/* From the last preferred ag, find the next one with at least
    622	 * average free space.
    623	 */
    624	for (i = 0 ; i < bmp->db_numag; i++, agpref++) {
    625		if (agpref == bmp->db_numag)
    626			agpref = 0;
    627
    628		if (atomic_read(&bmp->db_active[agpref]))
    629			/* open file is currently growing in this ag */
    630			continue;
    631		if (bmp->db_agfree[agpref] >= avgfree) {
    632			/* Return this one */
    633			bmp->db_agpref = agpref;
    634			goto unlock;
    635		} else if (bmp->db_agfree[agpref] > hwm) {
    636			/* Less than avg. freespace, but best so far */
    637			hwm = bmp->db_agfree[agpref];
    638			next_best = agpref;
    639		}
    640	}
    641
    642	/*
    643	 * If no inactive ag was found with average freespace, use the
    644	 * next best
    645	 */
    646	if (next_best != -1)
    647		bmp->db_agpref = next_best;
    648	/* else leave db_agpref unchanged */
    649unlock:
    650	BMAP_UNLOCK(bmp);
    651
    652	/* return the preferred group.
    653	 */
    654	return (bmp->db_agpref);
    655}
    656
    657/*
    658 * NAME:	dbAlloc()
    659 *
    660 * FUNCTION:	attempt to allocate a specified number of contiguous free
    661 *		blocks from the working allocation block map.
    662 *
    663 *		the block allocation policy uses hints and a multi-step
    664 *		approach.
    665 *
    666 *		for allocation requests smaller than the number of blocks
    667 *		per dmap, we first try to allocate the new blocks
    668 *		immediately following the hint.  if these blocks are not
    669 *		available, we try to allocate blocks near the hint.  if
    670 *		no blocks near the hint are available, we next try to
    671 *		allocate within the same dmap as contains the hint.
    672 *
    673 *		if no blocks are available in the dmap or the allocation
    674 *		request is larger than the dmap size, we try to allocate
    675 *		within the same allocation group as contains the hint. if
    676 *		this does not succeed, we finally try to allocate anywhere
    677 *		within the aggregate.
    678 *
    679 *		we also try to allocate anywhere within the aggregate
    680 *		for allocation requests larger than the allocation group
    681 *		size or requests that specify no hint value.
    682 *
    683 * PARAMETERS:
    684 *	ip	- pointer to in-core inode;
    685 *	hint	- allocation hint.
    686 *	nblocks	- number of contiguous blocks in the range.
    687 *	results	- on successful return, set to the starting block number
    688 *		  of the newly allocated contiguous range.
    689 *
    690 * RETURN VALUES:
    691 *	0	- success
    692 *	-ENOSPC	- insufficient disk resources
    693 *	-EIO	- i/o error
    694 */
    695int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
    696{
    697	int rc, agno;
    698	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
    699	struct bmap *bmp;
    700	struct metapage *mp;
    701	s64 lblkno, blkno;
    702	struct dmap *dp;
    703	int l2nb;
    704	s64 mapSize;
    705	int writers;
    706
    707	/* assert that nblocks is valid */
    708	assert(nblocks > 0);
    709
    710	/* get the log2 number of blocks to be allocated.
    711	 * if the number of blocks is not a log2 multiple,
    712	 * it will be rounded up to the next log2 multiple.
    713	 */
    714	l2nb = BLKSTOL2(nblocks);
    715
    716	bmp = JFS_SBI(ip->i_sb)->bmap;
    717
    718	mapSize = bmp->db_mapsize;
    719
    720	/* the hint should be within the map */
    721	if (hint >= mapSize) {
    722		jfs_error(ip->i_sb, "the hint is outside the map\n");
    723		return -EIO;
    724	}
    725
    726	/* if the number of blocks to be allocated is greater than the
    727	 * allocation group size, try to allocate anywhere.
    728	 */
    729	if (l2nb > bmp->db_agl2size) {
    730		IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
    731
    732		rc = dbAllocAny(bmp, nblocks, l2nb, results);
    733
    734		goto write_unlock;
    735	}
    736
    737	/*
    738	 * If no hint, let dbNextAG recommend an allocation group
    739	 */
    740	if (hint == 0)
    741		goto pref_ag;
    742
    743	/* we would like to allocate close to the hint.  adjust the
    744	 * hint to the block following the hint since the allocators
    745	 * will start looking for free space starting at this point.
    746	 */
    747	blkno = hint + 1;
    748
    749	if (blkno >= bmp->db_mapsize)
    750		goto pref_ag;
    751
    752	agno = blkno >> bmp->db_agl2size;
    753
    754	/* check if blkno crosses over into a new allocation group.
    755	 * if so, check if we should allow allocations within this
    756	 * allocation group.
    757	 */
    758	if ((blkno & (bmp->db_agsize - 1)) == 0)
    759		/* check if the AG is currently being written to.
    760		 * if so, call dbNextAG() to find a non-busy
    761		 * AG with sufficient free space.
    762		 */
    763		if (atomic_read(&bmp->db_active[agno]))
    764			goto pref_ag;
    765
    766	/* check if the allocation request size can be satisfied from a
    767	 * single dmap.  if so, try to allocate from the dmap containing
    768	 * the hint using a tiered strategy.
    769	 */
    770	if (nblocks <= BPERDMAP) {
    771		IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
    772
    773		/* get the buffer for the dmap containing the hint.
    774		 */
    775		rc = -EIO;
    776		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
    777		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
    778		if (mp == NULL)
    779			goto read_unlock;
    780
    781		dp = (struct dmap *) mp->data;
    782
    783		/* first, try to satisfy the allocation request with the
    784		 * blocks beginning at the hint.
    785		 */
    786		if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
    787		    != -ENOSPC) {
    788			if (rc == 0) {
    789				*results = blkno;
    790				mark_metapage_dirty(mp);
    791			}
    792
    793			release_metapage(mp);
    794			goto read_unlock;
    795		}
    796
    797		writers = atomic_read(&bmp->db_active[agno]);
    798		if ((writers > 1) ||
    799		    ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
    800			/*
    801			 * Someone else is writing in this allocation
    802			 * group.  To avoid fragmenting, try another ag
    803			 */
    804			release_metapage(mp);
    805			IREAD_UNLOCK(ipbmap);
    806			goto pref_ag;
    807		}
    808
    809		/* next, try to satisfy the allocation request with blocks
    810		 * near the hint.
    811		 */
    812		if ((rc =
    813		     dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
    814		    != -ENOSPC) {
    815			if (rc == 0)
    816				mark_metapage_dirty(mp);
    817
    818			release_metapage(mp);
    819			goto read_unlock;
    820		}
    821
    822		/* try to satisfy the allocation request with blocks within
    823		 * the same dmap as the hint.
    824		 */
    825		if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
    826		    != -ENOSPC) {
    827			if (rc == 0)
    828				mark_metapage_dirty(mp);
    829
    830			release_metapage(mp);
    831			goto read_unlock;
    832		}
    833
    834		release_metapage(mp);
    835		IREAD_UNLOCK(ipbmap);
    836	}
    837
    838	/* try to satisfy the allocation request with blocks within
    839	 * the same allocation group as the hint.
    840	 */
    841	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
    842	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC)
    843		goto write_unlock;
    844
    845	IWRITE_UNLOCK(ipbmap);
    846
    847
    848      pref_ag:
    849	/*
    850	 * Let dbNextAG recommend a preferred allocation group
    851	 */
    852	agno = dbNextAG(ipbmap);
    853	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
    854
    855	/* Try to allocate within this allocation group.  if that fails, try to
    856	 * allocate anywhere in the map.
    857	 */
    858	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC)
    859		rc = dbAllocAny(bmp, nblocks, l2nb, results);
    860
    861      write_unlock:
    862	IWRITE_UNLOCK(ipbmap);
    863
    864	return (rc);
    865
    866      read_unlock:
    867	IREAD_UNLOCK(ipbmap);
    868
    869	return (rc);
    870}
    871
    872/*
    873 * NAME:	dbReAlloc()
    874 *
    875 * FUNCTION:	attempt to extend a current allocation by a specified
    876 *		number of blocks.
    877 *
    878 *		this routine attempts to satisfy the allocation request
    879 *		by first trying to extend the existing allocation in
    880 *		place by allocating the additional blocks as the blocks
    881 *		immediately following the current allocation.  if these
    882 *		blocks are not available, this routine will attempt to
    883 *		allocate a new set of contiguous blocks large enough
    884 *		to cover the existing allocation plus the additional
    885 *		number of blocks required.
    886 *
    887 * PARAMETERS:
    888 *	ip	    -  pointer to in-core inode requiring allocation.
    889 *	blkno	    -  starting block of the current allocation.
    890 *	nblocks	    -  number of contiguous blocks within the current
    891 *		       allocation.
    892 *	addnblocks  -  number of blocks to add to the allocation.
    893 *	results	-      on successful return, set to the starting block number
    894 *		       of the existing allocation if the existing allocation
    895 *		       was extended in place or to a newly allocated contiguous
    896 *		       range if the existing allocation could not be extended
    897 *		       in place.
    898 *
    899 * RETURN VALUES:
    900 *	0	- success
    901 *	-ENOSPC	- insufficient disk resources
    902 *	-EIO	- i/o error
    903 */
    904int
    905dbReAlloc(struct inode *ip,
    906	  s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
    907{
    908	int rc;
    909
    910	/* try to extend the allocation in place.
    911	 */
    912	if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) {
    913		*results = blkno;
    914		return (0);
    915	} else {
    916		if (rc != -ENOSPC)
    917			return (rc);
    918	}
    919
    920	/* could not extend the allocation in place, so allocate a
    921	 * new set of blocks for the entire request (i.e. try to get
    922	 * a range of contiguous blocks large enough to cover the
    923	 * existing allocation plus the additional blocks.)
    924	 */
    925	return (dbAlloc
    926		(ip, blkno + nblocks - 1, addnblocks + nblocks, results));
    927}
    928
    929
    930/*
    931 * NAME:	dbExtend()
    932 *
    933 * FUNCTION:	attempt to extend a current allocation by a specified
    934 *		number of blocks.
    935 *
    936 *		this routine attempts to satisfy the allocation request
    937 *		by first trying to extend the existing allocation in
    938 *		place by allocating the additional blocks as the blocks
    939 *		immediately following the current allocation.
    940 *
    941 * PARAMETERS:
    942 *	ip	    -  pointer to in-core inode requiring allocation.
    943 *	blkno	    -  starting block of the current allocation.
    944 *	nblocks	    -  number of contiguous blocks within the current
    945 *		       allocation.
    946 *	addnblocks  -  number of blocks to add to the allocation.
    947 *
    948 * RETURN VALUES:
    949 *	0	- success
    950 *	-ENOSPC	- insufficient disk resources
    951 *	-EIO	- i/o error
    952 */
    953static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
    954{
    955	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
    956	s64 lblkno, lastblkno, extblkno;
    957	uint rel_block;
    958	struct metapage *mp;
    959	struct dmap *dp;
    960	int rc;
    961	struct inode *ipbmap = sbi->ipbmap;
    962	struct bmap *bmp;
    963
    964	/*
    965	 * We don't want a non-aligned extent to cross a page boundary
    966	 */
    967	if (((rel_block = blkno & (sbi->nbperpage - 1))) &&
    968	    (rel_block + nblocks + addnblocks > sbi->nbperpage))
    969		return -ENOSPC;
    970
    971	/* get the last block of the current allocation */
    972	lastblkno = blkno + nblocks - 1;
    973
    974	/* determine the block number of the block following
    975	 * the existing allocation.
    976	 */
    977	extblkno = lastblkno + 1;
    978
    979	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
    980
    981	/* better be within the file system */
    982	bmp = sbi->bmap;
    983	if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
    984		IREAD_UNLOCK(ipbmap);
    985		jfs_error(ip->i_sb, "the block is outside the filesystem\n");
    986		return -EIO;
    987	}
    988
    989	/* we'll attempt to extend the current allocation in place by
    990	 * allocating the additional blocks as the blocks immediately
    991	 * following the current allocation.  we only try to extend the
    992	 * current allocation in place if the number of additional blocks
    993	 * can fit into a dmap, the last block of the current allocation
    994	 * is not the last block of the file system, and the start of the
    995	 * inplace extension is not on an allocation group boundary.
    996	 */
    997	if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize ||
    998	    (extblkno & (bmp->db_agsize - 1)) == 0) {
    999		IREAD_UNLOCK(ipbmap);
   1000		return -ENOSPC;
   1001	}
   1002
   1003	/* get the buffer for the dmap containing the first block
   1004	 * of the extension.
   1005	 */
   1006	lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage);
   1007	mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
   1008	if (mp == NULL) {
   1009		IREAD_UNLOCK(ipbmap);
   1010		return -EIO;
   1011	}
   1012
   1013	dp = (struct dmap *) mp->data;
   1014
   1015	/* try to allocate the blocks immediately following the
   1016	 * current allocation.
   1017	 */
   1018	rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks);
   1019
   1020	IREAD_UNLOCK(ipbmap);
   1021
   1022	/* were we successful ? */
   1023	if (rc == 0)
   1024		write_metapage(mp);
   1025	else
   1026		/* we were not successful */
   1027		release_metapage(mp);
   1028
   1029	return (rc);
   1030}
   1031
   1032
   1033/*
   1034 * NAME:	dbAllocNext()
   1035 *
   1036 * FUNCTION:	attempt to allocate the blocks of the specified block
   1037 *		range within a dmap.
   1038 *
   1039 * PARAMETERS:
   1040 *	bmp	-  pointer to bmap descriptor
   1041 *	dp	-  pointer to dmap.
   1042 *	blkno	-  starting block number of the range.
   1043 *	nblocks	-  number of contiguous free blocks of the range.
   1044 *
   1045 * RETURN VALUES:
   1046 *	0	- success
   1047 *	-ENOSPC	- insufficient disk resources
   1048 *	-EIO	- i/o error
   1049 *
   1050 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
   1051 */
   1052static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
   1053		       int nblocks)
   1054{
   1055	int dbitno, word, rembits, nb, nwords, wbitno, nw;
   1056	int l2size;
   1057	s8 *leaf;
   1058	u32 mask;
   1059
   1060	if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
   1061		jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
   1062		return -EIO;
   1063	}
   1064
   1065	/* pick up a pointer to the leaves of the dmap tree.
   1066	 */
   1067	leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
   1068
   1069	/* determine the bit number and word within the dmap of the
   1070	 * starting block.
   1071	 */
   1072	dbitno = blkno & (BPERDMAP - 1);
   1073	word = dbitno >> L2DBWORD;
   1074
   1075	/* check if the specified block range is contained within
   1076	 * this dmap.
   1077	 */
   1078	if (dbitno + nblocks > BPERDMAP)
   1079		return -ENOSPC;
   1080
   1081	/* check if the starting leaf indicates that anything
   1082	 * is free.
   1083	 */
   1084	if (leaf[word] == NOFREE)
   1085		return -ENOSPC;
   1086
   1087	/* check the dmaps words corresponding to block range to see
   1088	 * if the block range is free.  not all bits of the first and
   1089	 * last words may be contained within the block range.  if this
   1090	 * is the case, we'll work against those words (i.e. partial first
   1091	 * and/or last) on an individual basis (a single pass) and examine
   1092	 * the actual bits to determine if they are free.  a single pass
   1093	 * will be used for all dmap words fully contained within the
   1094	 * specified range.  within this pass, the leaves of the dmap
   1095	 * tree will be examined to determine if the blocks are free. a
   1096	 * single leaf may describe the free space of multiple dmap
   1097	 * words, so we may visit only a subset of the actual leaves
   1098	 * corresponding to the dmap words of the block range.
   1099	 */
   1100	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
   1101		/* determine the bit number within the word and
   1102		 * the number of bits within the word.
   1103		 */
   1104		wbitno = dbitno & (DBWORD - 1);
   1105		nb = min(rembits, DBWORD - wbitno);
   1106
   1107		/* check if only part of the word is to be examined.
   1108		 */
   1109		if (nb < DBWORD) {
   1110			/* check if the bits are free.
   1111			 */
   1112			mask = (ONES << (DBWORD - nb) >> wbitno);
   1113			if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask)
   1114				return -ENOSPC;
   1115
   1116			word += 1;
   1117		} else {
   1118			/* one or more dmap words are fully contained
   1119			 * within the block range.  determine how many
   1120			 * words and how many bits.
   1121			 */
   1122			nwords = rembits >> L2DBWORD;
   1123			nb = nwords << L2DBWORD;
   1124
   1125			/* now examine the appropriate leaves to determine
   1126			 * if the blocks are free.
   1127			 */
   1128			while (nwords > 0) {
   1129				/* does the leaf describe any free space ?
   1130				 */
   1131				if (leaf[word] < BUDMIN)
   1132					return -ENOSPC;
   1133
   1134				/* determine the l2 number of bits provided
   1135				 * by this leaf.
   1136				 */
   1137				l2size =
   1138				    min_t(int, leaf[word], NLSTOL2BSZ(nwords));
   1139
   1140				/* determine how many words were handled.
   1141				 */
   1142				nw = BUDSIZE(l2size, BUDMIN);
   1143
   1144				nwords -= nw;
   1145				word += nw;
   1146			}
   1147		}
   1148	}
   1149
   1150	/* allocate the blocks.
   1151	 */
   1152	return (dbAllocDmap(bmp, dp, blkno, nblocks));
   1153}
   1154
   1155
   1156/*
   1157 * NAME:	dbAllocNear()
   1158 *
   1159 * FUNCTION:	attempt to allocate a number of contiguous free blocks near
   1160 *		a specified block (hint) within a dmap.
   1161 *
   1162 *		starting with the dmap leaf that covers the hint, we'll
   1163 *		check the next four contiguous leaves for sufficient free
   1164 *		space.  if sufficient free space is found, we'll allocate
   1165 *		the desired free space.
   1166 *
   1167 * PARAMETERS:
   1168 *	bmp	-  pointer to bmap descriptor
   1169 *	dp	-  pointer to dmap.
   1170 *	blkno	-  block number to allocate near.
   1171 *	nblocks	-  actual number of contiguous free blocks desired.
   1172 *	l2nb	-  log2 number of contiguous free blocks desired.
   1173 *	results	-  on successful return, set to the starting block number
   1174 *		   of the newly allocated range.
   1175 *
   1176 * RETURN VALUES:
   1177 *	0	- success
   1178 *	-ENOSPC	- insufficient disk resources
   1179 *	-EIO	- i/o error
   1180 *
   1181 * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
   1182 */
   1183static int
   1184dbAllocNear(struct bmap * bmp,
   1185	    struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results)
   1186{
   1187	int word, lword, rc;
   1188	s8 *leaf;
   1189
   1190	if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
   1191		jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n");
   1192		return -EIO;
   1193	}
   1194
   1195	leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
   1196
   1197	/* determine the word within the dmap that holds the hint
   1198	 * (i.e. blkno).  also, determine the last word in the dmap
   1199	 * that we'll include in our examination.
   1200	 */
   1201	word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
   1202	lword = min(word + 4, LPERDMAP);
   1203
   1204	/* examine the leaves for sufficient free space.
   1205	 */
   1206	for (; word < lword; word++) {
   1207		/* does the leaf describe sufficient free space ?
   1208		 */
   1209		if (leaf[word] < l2nb)
   1210			continue;
   1211
   1212		/* determine the block number within the file system
   1213		 * of the first block described by this dmap word.
   1214		 */
   1215		blkno = le64_to_cpu(dp->start) + (word << L2DBWORD);
   1216
   1217		/* if not all bits of the dmap word are free, get the
   1218		 * starting bit number within the dmap word of the required
   1219		 * string of free bits and adjust the block number with the
   1220		 * value.
   1221		 */
   1222		if (leaf[word] < BUDMIN)
   1223			blkno +=
   1224			    dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb);
   1225
   1226		/* allocate the blocks.
   1227		 */
   1228		if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
   1229			*results = blkno;
   1230
   1231		return (rc);
   1232	}
   1233
   1234	return -ENOSPC;
   1235}
   1236
   1237
   1238/*
   1239 * NAME:	dbAllocAG()
   1240 *
   1241 * FUNCTION:	attempt to allocate the specified number of contiguous
   1242 *		free blocks within the specified allocation group.
   1243 *
   1244 *		unless the allocation group size is equal to the number
   1245 *		of blocks per dmap, the dmap control pages will be used to
   1246 *		find the required free space, if available.  we start the
   1247 *		search at the highest dmap control page level which
   1248 *		distinctly describes the allocation group's free space
   1249 *		(i.e. the highest level at which the allocation group's
   1250 *		free space is not mixed in with that of any other group).
   1251 *		in addition, we start the search within this level at a
   1252 *		height of the dmapctl dmtree at which the nodes distinctly
   1253 *		describe the allocation group's free space.  at this height,
   1254 *		the allocation group's free space may be represented by 1
   1255 *		or two sub-trees, depending on the allocation group size.
   1256 *		we search the top nodes of these subtrees left to right for
   1257 *		sufficient free space.  if sufficient free space is found,
   1258 *		the subtree is searched to find the leftmost leaf that
   1259 *		has free space.  once we have made it to the leaf, we
   1260 *		move the search to the next lower level dmap control page
   1261 *		corresponding to this leaf.  we continue down the dmap control
   1262 *		pages until we find the dmap that contains or starts the
   1263 *		sufficient free space and we allocate at this dmap.
   1264 *
   1265 *		if the allocation group size is equal to the dmap size,
   1266 *		we'll start at the dmap corresponding to the allocation
   1267 *		group and attempt the allocation at this level.
   1268 *
   1269 *		the dmap control page search is also not performed if the
   1270 *		allocation group is completely free and we go to the first
   1271 *		dmap of the allocation group to do the allocation.  this is
   1272 *		done because the allocation group may be part (not the first
   1273 *		part) of a larger binary buddy system, causing the dmap
   1274 *		control pages to indicate no free space (NOFREE) within
   1275 *		the allocation group.
   1276 *
   1277 * PARAMETERS:
   1278 *	bmp	-  pointer to bmap descriptor
   1279 *	agno	- allocation group number.
   1280 *	nblocks	-  actual number of contiguous free blocks desired.
   1281 *	l2nb	-  log2 number of contiguous free blocks desired.
   1282 *	results	-  on successful return, set to the starting block number
   1283 *		   of the newly allocated range.
   1284 *
   1285 * RETURN VALUES:
   1286 *	0	- success
   1287 *	-ENOSPC	- insufficient disk resources
   1288 *	-EIO	- i/o error
   1289 *
   1290 * note: IWRITE_LOCK(ipmap) held on entry/exit;
   1291 */
   1292static int
   1293dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
   1294{
   1295	struct metapage *mp;
   1296	struct dmapctl *dcp;
   1297	int rc, ti, i, k, m, n, agperlev;
   1298	s64 blkno, lblkno;
   1299	int budmin;
   1300
   1301	/* allocation request should not be for more than the
   1302	 * allocation group size.
   1303	 */
   1304	if (l2nb > bmp->db_agl2size) {
   1305		jfs_error(bmp->db_ipbmap->i_sb,
   1306			  "allocation request is larger than the allocation group size\n");
   1307		return -EIO;
   1308	}
   1309
   1310	/* determine the starting block number of the allocation
   1311	 * group.
   1312	 */
   1313	blkno = (s64) agno << bmp->db_agl2size;
   1314
   1315	/* check if the allocation group size is the minimum allocation
   1316	 * group size or if the allocation group is completely free. if
   1317	 * the allocation group size is the minimum size of BPERDMAP (i.e.
   1318	 * 1 dmap), there is no need to search the dmap control page (below)
   1319	 * that fully describes the allocation group since the allocation
   1320	 * group is already fully described by a dmap.  in this case, we
   1321	 * just call dbAllocCtl() to search the dmap tree and allocate the
   1322	 * required space if available.
   1323	 *
   1324	 * if the allocation group is completely free, dbAllocCtl() is
   1325	 * also called to allocate the required space.  this is done for
   1326	 * two reasons.  first, it makes no sense searching the dmap control
   1327	 * pages for free space when we know that free space exists.  second,
   1328	 * the dmap control pages may indicate that the allocation group
   1329	 * has no free space if the allocation group is part (not the first
   1330	 * part) of a larger binary buddy system.
   1331	 */
   1332	if (bmp->db_agsize == BPERDMAP
   1333	    || bmp->db_agfree[agno] == bmp->db_agsize) {
   1334		rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
   1335		if ((rc == -ENOSPC) &&
   1336		    (bmp->db_agfree[agno] == bmp->db_agsize)) {
   1337			printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n",
   1338			       (unsigned long long) blkno,
   1339			       (unsigned long long) nblocks);
   1340			jfs_error(bmp->db_ipbmap->i_sb,
   1341				  "dbAllocCtl failed in free AG\n");
   1342		}
   1343		return (rc);
   1344	}
   1345
   1346	/* the buffer for the dmap control page that fully describes the
   1347	 * allocation group.
   1348	 */
   1349	lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel);
   1350	mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
   1351	if (mp == NULL)
   1352		return -EIO;
   1353	dcp = (struct dmapctl *) mp->data;
   1354	budmin = dcp->budmin;
   1355
   1356	if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
   1357		jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
   1358		release_metapage(mp);
   1359		return -EIO;
   1360	}
   1361
   1362	/* search the subtree(s) of the dmap control page that describes
   1363	 * the allocation group, looking for sufficient free space.  to begin,
   1364	 * determine how many allocation groups are represented in a dmap
   1365	 * control page at the control page level (i.e. L0, L1, L2) that
   1366	 * fully describes an allocation group. next, determine the starting
   1367	 * tree index of this allocation group within the control page.
   1368	 */
   1369	agperlev =
   1370	    (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
   1371	ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
   1372
   1373	/* dmap control page trees fan-out by 4 and a single allocation
   1374	 * group may be described by 1 or 2 subtrees within the ag level
   1375	 * dmap control page, depending upon the ag size. examine the ag's
   1376	 * subtrees for sufficient free space, starting with the leftmost
   1377	 * subtree.
   1378	 */
   1379	for (i = 0; i < bmp->db_agwidth; i++, ti++) {
   1380		/* is there sufficient free space ?
   1381		 */
   1382		if (l2nb > dcp->stree[ti])
   1383			continue;
   1384
   1385		/* sufficient free space found in a subtree. now search down
   1386		 * the subtree to find the leftmost leaf that describes this
   1387		 * free space.
   1388		 */
   1389		for (k = bmp->db_agheight; k > 0; k--) {
   1390			for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
   1391				if (l2nb <= dcp->stree[m + n]) {
   1392					ti = m + n;
   1393					break;
   1394				}
   1395			}
   1396			if (n == 4) {
   1397				jfs_error(bmp->db_ipbmap->i_sb,
   1398					  "failed descending stree\n");
   1399				release_metapage(mp);
   1400				return -EIO;
   1401			}
   1402		}
   1403
   1404		/* determine the block number within the file system
   1405		 * that corresponds to this leaf.
   1406		 */
   1407		if (bmp->db_aglevel == 2)
   1408			blkno = 0;
   1409		else if (bmp->db_aglevel == 1)
   1410			blkno &= ~(MAXL1SIZE - 1);
   1411		else		/* bmp->db_aglevel == 0 */
   1412			blkno &= ~(MAXL0SIZE - 1);
   1413
   1414		blkno +=
   1415		    ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin;
   1416
   1417		/* release the buffer in preparation for going down
   1418		 * the next level of dmap control pages.
   1419		 */
   1420		release_metapage(mp);
   1421
   1422		/* check if we need to continue to search down the lower
   1423		 * level dmap control pages.  we need to if the number of
   1424		 * blocks required is less than maximum number of blocks
   1425		 * described at the next lower level.
   1426		 */
   1427		if (l2nb < budmin) {
   1428
   1429			/* search the lower level dmap control pages to get
   1430			 * the starting block number of the dmap that
   1431			 * contains or starts off the free space.
   1432			 */
   1433			if ((rc =
   1434			     dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1,
   1435				       &blkno))) {
   1436				if (rc == -ENOSPC) {
   1437					jfs_error(bmp->db_ipbmap->i_sb,
   1438						  "control page inconsistent\n");
   1439					return -EIO;
   1440				}
   1441				return (rc);
   1442			}
   1443		}
   1444
   1445		/* allocate the blocks.
   1446		 */
   1447		rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
   1448		if (rc == -ENOSPC) {
   1449			jfs_error(bmp->db_ipbmap->i_sb,
   1450				  "unable to allocate blocks\n");
   1451			rc = -EIO;
   1452		}
   1453		return (rc);
   1454	}
   1455
   1456	/* no space in the allocation group.  release the buffer and
   1457	 * return -ENOSPC.
   1458	 */
   1459	release_metapage(mp);
   1460
   1461	return -ENOSPC;
   1462}
   1463
   1464
   1465/*
   1466 * NAME:	dbAllocAny()
   1467 *
   1468 * FUNCTION:	attempt to allocate the specified number of contiguous
   1469 *		free blocks anywhere in the file system.
   1470 *
   1471 *		dbAllocAny() attempts to find the sufficient free space by
   1472 *		searching down the dmap control pages, starting with the
   1473 *		highest level (i.e. L0, L1, L2) control page.  if free space
   1474 *		large enough to satisfy the desired free space is found, the
   1475 *		desired free space is allocated.
   1476 *
   1477 * PARAMETERS:
   1478 *	bmp	-  pointer to bmap descriptor
   1479 *	nblocks	 -  actual number of contiguous free blocks desired.
   1480 *	l2nb	 -  log2 number of contiguous free blocks desired.
   1481 *	results	-  on successful return, set to the starting block number
   1482 *		   of the newly allocated range.
   1483 *
   1484 * RETURN VALUES:
   1485 *	0	- success
   1486 *	-ENOSPC	- insufficient disk resources
   1487 *	-EIO	- i/o error
   1488 *
   1489 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
   1490 */
   1491static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
   1492{
   1493	int rc;
   1494	s64 blkno = 0;
   1495
   1496	/* starting with the top level dmap control page, search
   1497	 * down the dmap control levels for sufficient free space.
   1498	 * if free space is found, dbFindCtl() returns the starting
   1499	 * block number of the dmap that contains or starts off the
   1500	 * range of free space.
   1501	 */
   1502	if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno)))
   1503		return (rc);
   1504
   1505	/* allocate the blocks.
   1506	 */
   1507	rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
   1508	if (rc == -ENOSPC) {
   1509		jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n");
   1510		return -EIO;
   1511	}
   1512	return (rc);
   1513}
   1514
   1515
   1516/*
   1517 * NAME:	dbDiscardAG()
   1518 *
   1519 * FUNCTION:	attempt to discard (TRIM) all free blocks of specific AG
   1520 *
   1521 *		algorithm:
   1522 *		1) allocate blocks, as large as possible and save them
   1523 *		   while holding IWRITE_LOCK on ipbmap
   1524 *		2) trim all these saved block/length values
   1525 *		3) mark the blocks free again
   1526 *
   1527 *		benefit:
   1528 *		- we work only on one ag at some time, minimizing how long we
   1529 *		  need to lock ipbmap
   1530 *		- reading / writing the fs is possible most time, even on
   1531 *		  trimming
   1532 *
   1533 *		downside:
   1534 *		- we write two times to the dmapctl and dmap pages
   1535 *		- but for me, this seems the best way, better ideas?
   1536 *		/TR 2012
   1537 *
   1538 * PARAMETERS:
   1539 *	ip	- pointer to in-core inode
   1540 *	agno	- ag to trim
   1541 *	minlen	- minimum value of contiguous blocks
   1542 *
   1543 * RETURN VALUES:
   1544 *	s64	- actual number of blocks trimmed
   1545 */
   1546s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen)
   1547{
   1548	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
   1549	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
   1550	s64 nblocks, blkno;
   1551	u64 trimmed = 0;
   1552	int rc, l2nb;
   1553	struct super_block *sb = ipbmap->i_sb;
   1554
   1555	struct range2trim {
   1556		u64 blkno;
   1557		u64 nblocks;
   1558	} *totrim, *tt;
   1559
   1560	/* max blkno / nblocks pairs to trim */
   1561	int count = 0, range_cnt;
   1562	u64 max_ranges;
   1563
   1564	/* prevent others from writing new stuff here, while trimming */
   1565	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
   1566
   1567	nblocks = bmp->db_agfree[agno];
   1568	max_ranges = nblocks;
   1569	do_div(max_ranges, minlen);
   1570	range_cnt = min_t(u64, max_ranges + 1, 32 * 1024);
   1571	totrim = kmalloc_array(range_cnt, sizeof(struct range2trim), GFP_NOFS);
   1572	if (totrim == NULL) {
   1573		jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n");
   1574		IWRITE_UNLOCK(ipbmap);
   1575		return 0;
   1576	}
   1577
   1578	tt = totrim;
   1579	while (nblocks >= minlen) {
   1580		l2nb = BLKSTOL2(nblocks);
   1581
   1582		/* 0 = okay, -EIO = fatal, -ENOSPC -> try smaller block */
   1583		rc = dbAllocAG(bmp, agno, nblocks, l2nb, &blkno);
   1584		if (rc == 0) {
   1585			tt->blkno = blkno;
   1586			tt->nblocks = nblocks;
   1587			tt++; count++;
   1588
   1589			/* the whole ag is free, trim now */
   1590			if (bmp->db_agfree[agno] == 0)
   1591				break;
   1592
   1593			/* give a hint for the next while */
   1594			nblocks = bmp->db_agfree[agno];
   1595			continue;
   1596		} else if (rc == -ENOSPC) {
   1597			/* search for next smaller log2 block */
   1598			l2nb = BLKSTOL2(nblocks) - 1;
   1599			nblocks = 1LL << l2nb;
   1600		} else {
   1601			/* Trim any already allocated blocks */
   1602			jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n");
   1603			break;
   1604		}
   1605
   1606		/* check, if our trim array is full */
   1607		if (unlikely(count >= range_cnt - 1))
   1608			break;
   1609	}
   1610	IWRITE_UNLOCK(ipbmap);
   1611
   1612	tt->nblocks = 0; /* mark the current end */
   1613	for (tt = totrim; tt->nblocks != 0; tt++) {
   1614		/* when mounted with online discard, dbFree() will
   1615		 * call jfs_issue_discard() itself */
   1616		if (!(JFS_SBI(sb)->flag & JFS_DISCARD))
   1617			jfs_issue_discard(ip, tt->blkno, tt->nblocks);
   1618		dbFree(ip, tt->blkno, tt->nblocks);
   1619		trimmed += tt->nblocks;
   1620	}
   1621	kfree(totrim);
   1622
   1623	return trimmed;
   1624}
   1625
   1626/*
   1627 * NAME:	dbFindCtl()
   1628 *
   1629 * FUNCTION:	starting at a specified dmap control page level and block
   1630 *		number, search down the dmap control levels for a range of
   1631 *		contiguous free blocks large enough to satisfy an allocation
   1632 *		request for the specified number of free blocks.
   1633 *
   1634 *		if sufficient contiguous free blocks are found, this routine
   1635 *		returns the starting block number within a dmap page that
   1636 *		contains or starts a range of contiqious free blocks that
   1637 *		is sufficient in size.
   1638 *
   1639 * PARAMETERS:
   1640 *	bmp	-  pointer to bmap descriptor
   1641 *	level	-  starting dmap control page level.
   1642 *	l2nb	-  log2 number of contiguous free blocks desired.
   1643 *	*blkno	-  on entry, starting block number for conducting the search.
   1644 *		   on successful return, the first block within a dmap page
   1645 *		   that contains or starts a range of contiguous free blocks.
   1646 *
   1647 * RETURN VALUES:
   1648 *	0	- success
   1649 *	-ENOSPC	- insufficient disk resources
   1650 *	-EIO	- i/o error
   1651 *
   1652 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
   1653 */
   1654static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
   1655{
   1656	int rc, leafidx, lev;
   1657	s64 b, lblkno;
   1658	struct dmapctl *dcp;
   1659	int budmin;
   1660	struct metapage *mp;
   1661
   1662	/* starting at the specified dmap control page level and block
   1663	 * number, search down the dmap control levels for the starting
   1664	 * block number of a dmap page that contains or starts off
   1665	 * sufficient free blocks.
   1666	 */
   1667	for (lev = level, b = *blkno; lev >= 0; lev--) {
   1668		/* get the buffer of the dmap control page for the block
   1669		 * number and level (i.e. L0, L1, L2).
   1670		 */
   1671		lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev);
   1672		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
   1673		if (mp == NULL)
   1674			return -EIO;
   1675		dcp = (struct dmapctl *) mp->data;
   1676		budmin = dcp->budmin;
   1677
   1678		if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
   1679			jfs_error(bmp->db_ipbmap->i_sb,
   1680				  "Corrupt dmapctl page\n");
   1681			release_metapage(mp);
   1682			return -EIO;
   1683		}
   1684
   1685		/* search the tree within the dmap control page for
   1686		 * sufficient free space.  if sufficient free space is found,
   1687		 * dbFindLeaf() returns the index of the leaf at which
   1688		 * free space was found.
   1689		 */
   1690		rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
   1691
   1692		/* release the buffer.
   1693		 */
   1694		release_metapage(mp);
   1695
   1696		/* space found ?
   1697		 */
   1698		if (rc) {
   1699			if (lev != level) {
   1700				jfs_error(bmp->db_ipbmap->i_sb,
   1701					  "dmap inconsistent\n");
   1702				return -EIO;
   1703			}
   1704			return -ENOSPC;
   1705		}
   1706
   1707		/* adjust the block number to reflect the location within
   1708		 * the dmap control page (i.e. the leaf) at which free
   1709		 * space was found.
   1710		 */
   1711		b += (((s64) leafidx) << budmin);
   1712
   1713		/* we stop the search at this dmap control page level if
   1714		 * the number of blocks required is greater than or equal
   1715		 * to the maximum number of blocks described at the next
   1716		 * (lower) level.
   1717		 */
   1718		if (l2nb >= budmin)
   1719			break;
   1720	}
   1721
   1722	*blkno = b;
   1723	return (0);
   1724}
   1725
   1726
   1727/*
   1728 * NAME:	dbAllocCtl()
   1729 *
   1730 * FUNCTION:	attempt to allocate a specified number of contiguous
   1731 *		blocks starting within a specific dmap.
   1732 *
   1733 *		this routine is called by higher level routines that search
   1734 *		the dmap control pages above the actual dmaps for contiguous
   1735 *		free space.  the result of successful searches by these
   1736 *		routines are the starting block numbers within dmaps, with
   1737 *		the dmaps themselves containing the desired contiguous free
   1738 *		space or starting a contiguous free space of desired size
   1739 *		that is made up of the blocks of one or more dmaps. these
   1740 *		calls should not fail due to insufficent resources.
   1741 *
   1742 *		this routine is called in some cases where it is not known
   1743 *		whether it will fail due to insufficient resources.  more
   1744 *		specifically, this occurs when allocating from an allocation
   1745 *		group whose size is equal to the number of blocks per dmap.
   1746 *		in this case, the dmap control pages are not examined prior
   1747 *		to calling this routine (to save pathlength) and the call
   1748 *		might fail.
   1749 *
   1750 *		for a request size that fits within a dmap, this routine relies
   1751 *		upon the dmap's dmtree to find the requested contiguous free
   1752 *		space.  for request sizes that are larger than a dmap, the
   1753 *		requested free space will start at the first block of the
   1754 *		first dmap (i.e. blkno).
   1755 *
   1756 * PARAMETERS:
   1757 *	bmp	-  pointer to bmap descriptor
   1758 *	nblocks	 -  actual number of contiguous free blocks to allocate.
   1759 *	l2nb	 -  log2 number of contiguous free blocks to allocate.
   1760 *	blkno	 -  starting block number of the dmap to start the allocation
   1761 *		    from.
   1762 *	results	-  on successful return, set to the starting block number
   1763 *		   of the newly allocated range.
   1764 *
   1765 * RETURN VALUES:
   1766 *	0	- success
   1767 *	-ENOSPC	- insufficient disk resources
   1768 *	-EIO	- i/o error
   1769 *
   1770 * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
   1771 */
   1772static int
   1773dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
   1774{
   1775	int rc, nb;
   1776	s64 b, lblkno, n;
   1777	struct metapage *mp;
   1778	struct dmap *dp;
   1779
   1780	/* check if the allocation request is confined to a single dmap.
   1781	 */
   1782	if (l2nb <= L2BPERDMAP) {
   1783		/* get the buffer for the dmap.
   1784		 */
   1785		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
   1786		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
   1787		if (mp == NULL)
   1788			return -EIO;
   1789		dp = (struct dmap *) mp->data;
   1790
   1791		/* try to allocate the blocks.
   1792		 */
   1793		rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
   1794		if (rc == 0)
   1795			mark_metapage_dirty(mp);
   1796
   1797		release_metapage(mp);
   1798
   1799		return (rc);
   1800	}
   1801
   1802	/* allocation request involving multiple dmaps. it must start on
   1803	 * a dmap boundary.
   1804	 */
   1805	assert((blkno & (BPERDMAP - 1)) == 0);
   1806
   1807	/* allocate the blocks dmap by dmap.
   1808	 */
   1809	for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) {
   1810		/* get the buffer for the dmap.
   1811		 */
   1812		lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
   1813		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
   1814		if (mp == NULL) {
   1815			rc = -EIO;
   1816			goto backout;
   1817		}
   1818		dp = (struct dmap *) mp->data;
   1819
   1820		/* the dmap better be all free.
   1821		 */
   1822		if (dp->tree.stree[ROOT] != L2BPERDMAP) {
   1823			release_metapage(mp);
   1824			jfs_error(bmp->db_ipbmap->i_sb,
   1825				  "the dmap is not all free\n");
   1826			rc = -EIO;
   1827			goto backout;
   1828		}
   1829
   1830		/* determine how many blocks to allocate from this dmap.
   1831		 */
   1832		nb = min_t(s64, n, BPERDMAP);
   1833
   1834		/* allocate the blocks from the dmap.
   1835		 */
   1836		if ((rc = dbAllocDmap(bmp, dp, b, nb))) {
   1837			release_metapage(mp);
   1838			goto backout;
   1839		}
   1840
   1841		/* write the buffer.
   1842		 */
   1843		write_metapage(mp);
   1844	}
   1845
   1846	/* set the results (starting block number) and return.
   1847	 */
   1848	*results = blkno;
   1849	return (0);
   1850
   1851	/* something failed in handling an allocation request involving
   1852	 * multiple dmaps.  we'll try to clean up by backing out any
   1853	 * allocation that has already happened for this request.  if
   1854	 * we fail in backing out the allocation, we'll mark the file
   1855	 * system to indicate that blocks have been leaked.
   1856	 */
   1857      backout:
   1858
   1859	/* try to backout the allocations dmap by dmap.
   1860	 */
   1861	for (n = nblocks - n, b = blkno; n > 0;
   1862	     n -= BPERDMAP, b += BPERDMAP) {
   1863		/* get the buffer for this dmap.
   1864		 */
   1865		lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
   1866		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
   1867		if (mp == NULL) {
   1868			/* could not back out.  mark the file system
   1869			 * to indicate that we have leaked blocks.
   1870			 */
   1871			jfs_error(bmp->db_ipbmap->i_sb,
   1872				  "I/O Error: Block Leakage\n");
   1873			continue;
   1874		}
   1875		dp = (struct dmap *) mp->data;
   1876
   1877		/* free the blocks is this dmap.
   1878		 */
   1879		if (dbFreeDmap(bmp, dp, b, BPERDMAP)) {
   1880			/* could not back out.  mark the file system
   1881			 * to indicate that we have leaked blocks.
   1882			 */
   1883			release_metapage(mp);
   1884			jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n");
   1885			continue;
   1886		}
   1887
   1888		/* write the buffer.
   1889		 */
   1890		write_metapage(mp);
   1891	}
   1892
   1893	return (rc);
   1894}
   1895
   1896
   1897/*
   1898 * NAME:	dbAllocDmapLev()
   1899 *
   1900 * FUNCTION:	attempt to allocate a specified number of contiguous blocks
   1901 *		from a specified dmap.
   1902 *
   1903 *		this routine checks if the contiguous blocks are available.
   1904 *		if so, nblocks of blocks are allocated; otherwise, ENOSPC is
   1905 *		returned.
   1906 *
   1907 * PARAMETERS:
   1908 *	mp	-  pointer to bmap descriptor
   1909 *	dp	-  pointer to dmap to attempt to allocate blocks from.
   1910 *	l2nb	-  log2 number of contiguous block desired.
   1911 *	nblocks	-  actual number of contiguous block desired.
   1912 *	results	-  on successful return, set to the starting block number
   1913 *		   of the newly allocated range.
   1914 *
   1915 * RETURN VALUES:
   1916 *	0	- success
   1917 *	-ENOSPC	- insufficient disk resources
   1918 *	-EIO	- i/o error
   1919 *
   1920 * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
   1921 *	IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
   1922 */
   1923static int
   1924dbAllocDmapLev(struct bmap * bmp,
   1925	       struct dmap * dp, int nblocks, int l2nb, s64 * results)
   1926{
   1927	s64 blkno;
   1928	int leafidx, rc;
   1929
   1930	/* can't be more than a dmaps worth of blocks */
   1931	assert(l2nb <= L2BPERDMAP);
   1932
   1933	/* search the tree within the dmap page for sufficient
   1934	 * free space.  if sufficient free space is found, dbFindLeaf()
   1935	 * returns the index of the leaf at which free space was found.
   1936	 */
   1937	if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
   1938		return -ENOSPC;
   1939
   1940	/* determine the block number within the file system corresponding
   1941	 * to the leaf at which free space was found.
   1942	 */
   1943	blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD);
   1944
   1945	/* if not all bits of the dmap word are free, get the starting
   1946	 * bit number within the dmap word of the required string of free
   1947	 * bits and adjust the block number with this value.
   1948	 */
   1949	if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN)
   1950		blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb);
   1951
   1952	/* allocate the blocks */
   1953	if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
   1954		*results = blkno;
   1955
   1956	return (rc);
   1957}
   1958
   1959
   1960/*
   1961 * NAME:	dbAllocDmap()
   1962 *
   1963 * FUNCTION:	adjust the disk allocation map to reflect the allocation
   1964 *		of a specified block range within a dmap.
   1965 *
   1966 *		this routine allocates the specified blocks from the dmap
   1967 *		through a call to dbAllocBits(). if the allocation of the
   1968 *		block range causes the maximum string of free blocks within
   1969 *		the dmap to change (i.e. the value of the root of the dmap's
   1970 *		dmtree), this routine will cause this change to be reflected
   1971 *		up through the appropriate levels of the dmap control pages
   1972 *		by a call to dbAdjCtl() for the L0 dmap control page that
   1973 *		covers this dmap.
   1974 *
   1975 * PARAMETERS:
   1976 *	bmp	-  pointer to bmap descriptor
   1977 *	dp	-  pointer to dmap to allocate the block range from.
   1978 *	blkno	-  starting block number of the block to be allocated.
   1979 *	nblocks	-  number of blocks to be allocated.
   1980 *
   1981 * RETURN VALUES:
   1982 *	0	- success
   1983 *	-EIO	- i/o error
   1984 *
   1985 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
   1986 */
   1987static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
   1988		       int nblocks)
   1989{
   1990	s8 oldroot;
   1991	int rc;
   1992
   1993	/* save the current value of the root (i.e. maximum free string)
   1994	 * of the dmap tree.
   1995	 */
   1996	oldroot = dp->tree.stree[ROOT];
   1997
   1998	/* allocate the specified (blocks) bits */
   1999	dbAllocBits(bmp, dp, blkno, nblocks);
   2000
   2001	/* if the root has not changed, done. */
   2002	if (dp->tree.stree[ROOT] == oldroot)
   2003		return (0);
   2004
   2005	/* root changed. bubble the change up to the dmap control pages.
   2006	 * if the adjustment of the upper level control pages fails,
   2007	 * backout the bit allocation (thus making everything consistent).
   2008	 */
   2009	if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0)))
   2010		dbFreeBits(bmp, dp, blkno, nblocks);
   2011
   2012	return (rc);
   2013}
   2014
   2015
   2016/*
   2017 * NAME:	dbFreeDmap()
   2018 *
   2019 * FUNCTION:	adjust the disk allocation map to reflect the allocation
   2020 *		of a specified block range within a dmap.
   2021 *
   2022 *		this routine frees the specified blocks from the dmap through
   2023 *		a call to dbFreeBits(). if the deallocation of the block range
   2024 *		causes the maximum string of free blocks within the dmap to
   2025 *		change (i.e. the value of the root of the dmap's dmtree), this
   2026 *		routine will cause this change to be reflected up through the
   2027 *		appropriate levels of the dmap control pages by a call to
   2028 *		dbAdjCtl() for the L0 dmap control page that covers this dmap.
   2029 *
   2030 * PARAMETERS:
   2031 *	bmp	-  pointer to bmap descriptor
   2032 *	dp	-  pointer to dmap to free the block range from.
   2033 *	blkno	-  starting block number of the block to be freed.
   2034 *	nblocks	-  number of blocks to be freed.
   2035 *
   2036 * RETURN VALUES:
   2037 *	0	- success
   2038 *	-EIO	- i/o error
   2039 *
   2040 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
   2041 */
   2042static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
   2043		      int nblocks)
   2044{
   2045	s8 oldroot;
   2046	int rc = 0, word;
   2047
   2048	/* save the current value of the root (i.e. maximum free string)
   2049	 * of the dmap tree.
   2050	 */
   2051	oldroot = dp->tree.stree[ROOT];
   2052
   2053	/* free the specified (blocks) bits */
   2054	rc = dbFreeBits(bmp, dp, blkno, nblocks);
   2055
   2056	/* if error or the root has not changed, done. */
   2057	if (rc || (dp->tree.stree[ROOT] == oldroot))
   2058		return (rc);
   2059
   2060	/* root changed. bubble the change up to the dmap control pages.
   2061	 * if the adjustment of the upper level control pages fails,
   2062	 * backout the deallocation.
   2063	 */
   2064	if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
   2065		word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
   2066
   2067		/* as part of backing out the deallocation, we will have
   2068		 * to back split the dmap tree if the deallocation caused
   2069		 * the freed blocks to become part of a larger binary buddy
   2070		 * system.
   2071		 */
   2072		if (dp->tree.stree[word] == NOFREE)
   2073			dbBackSplit((dmtree_t *) & dp->tree, word);
   2074
   2075		dbAllocBits(bmp, dp, blkno, nblocks);
   2076	}
   2077
   2078	return (rc);
   2079}
   2080
   2081
   2082/*
   2083 * NAME:	dbAllocBits()
   2084 *
   2085 * FUNCTION:	allocate a specified block range from a dmap.
   2086 *
   2087 *		this routine updates the dmap to reflect the working
   2088 *		state allocation of the specified block range. it directly
   2089 *		updates the bits of the working map and causes the adjustment
   2090 *		of the binary buddy system described by the dmap's dmtree
   2091 *		leaves to reflect the bits allocated.  it also causes the
   2092 *		dmap's dmtree, as a whole, to reflect the allocated range.
   2093 *
   2094 * PARAMETERS:
   2095 *	bmp	-  pointer to bmap descriptor
   2096 *	dp	-  pointer to dmap to allocate bits from.
   2097 *	blkno	-  starting block number of the bits to be allocated.
   2098 *	nblocks	-  number of bits to be allocated.
   2099 *
   2100 * RETURN VALUES: none
   2101 *
   2102 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
   2103 */
   2104static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
   2105			int nblocks)
   2106{
   2107	int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
   2108	dmtree_t *tp = (dmtree_t *) & dp->tree;
   2109	int size;
   2110	s8 *leaf;
   2111
   2112	/* pick up a pointer to the leaves of the dmap tree */
   2113	leaf = dp->tree.stree + LEAFIND;
   2114
   2115	/* determine the bit number and word within the dmap of the
   2116	 * starting block.
   2117	 */
   2118	dbitno = blkno & (BPERDMAP - 1);
   2119	word = dbitno >> L2DBWORD;
   2120
   2121	/* block range better be within the dmap */
   2122	assert(dbitno + nblocks <= BPERDMAP);
   2123
   2124	/* allocate the bits of the dmap's words corresponding to the block
   2125	 * range. not all bits of the first and last words may be contained
   2126	 * within the block range.  if this is the case, we'll work against
   2127	 * those words (i.e. partial first and/or last) on an individual basis
   2128	 * (a single pass), allocating the bits of interest by hand and
   2129	 * updating the leaf corresponding to the dmap word. a single pass
   2130	 * will be used for all dmap words fully contained within the
   2131	 * specified range.  within this pass, the bits of all fully contained
   2132	 * dmap words will be marked as free in a single shot and the leaves
   2133	 * will be updated. a single leaf may describe the free space of
   2134	 * multiple dmap words, so we may update only a subset of the actual
   2135	 * leaves corresponding to the dmap words of the block range.
   2136	 */
   2137	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
   2138		/* determine the bit number within the word and
   2139		 * the number of bits within the word.
   2140		 */
   2141		wbitno = dbitno & (DBWORD - 1);
   2142		nb = min(rembits, DBWORD - wbitno);
   2143
   2144		/* check if only part of a word is to be allocated.
   2145		 */
   2146		if (nb < DBWORD) {
   2147			/* allocate (set to 1) the appropriate bits within
   2148			 * this dmap word.
   2149			 */
   2150			dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
   2151						      >> wbitno);
   2152
   2153			/* update the leaf for this dmap word. in addition
   2154			 * to setting the leaf value to the binary buddy max
   2155			 * of the updated dmap word, dbSplit() will split
   2156			 * the binary system of the leaves if need be.
   2157			 */
   2158			dbSplit(tp, word, BUDMIN,
   2159				dbMaxBud((u8 *) & dp->wmap[word]));
   2160
   2161			word += 1;
   2162		} else {
   2163			/* one or more dmap words are fully contained
   2164			 * within the block range.  determine how many
   2165			 * words and allocate (set to 1) the bits of these
   2166			 * words.
   2167			 */
   2168			nwords = rembits >> L2DBWORD;
   2169			memset(&dp->wmap[word], (int) ONES, nwords * 4);
   2170
   2171			/* determine how many bits.
   2172			 */
   2173			nb = nwords << L2DBWORD;
   2174
   2175			/* now update the appropriate leaves to reflect
   2176			 * the allocated words.
   2177			 */
   2178			for (; nwords > 0; nwords -= nw) {
   2179				if (leaf[word] < BUDMIN) {
   2180					jfs_error(bmp->db_ipbmap->i_sb,
   2181						  "leaf page corrupt\n");
   2182					break;
   2183				}
   2184
   2185				/* determine what the leaf value should be
   2186				 * updated to as the minimum of the l2 number
   2187				 * of bits being allocated and the l2 number
   2188				 * of bits currently described by this leaf.
   2189				 */
   2190				size = min_t(int, leaf[word],
   2191					     NLSTOL2BSZ(nwords));
   2192
   2193				/* update the leaf to reflect the allocation.
   2194				 * in addition to setting the leaf value to
   2195				 * NOFREE, dbSplit() will split the binary
   2196				 * system of the leaves to reflect the current
   2197				 * allocation (size).
   2198				 */
   2199				dbSplit(tp, word, size, NOFREE);
   2200
   2201				/* get the number of dmap words handled */
   2202				nw = BUDSIZE(size, BUDMIN);
   2203				word += nw;
   2204			}
   2205		}
   2206	}
   2207
   2208	/* update the free count for this dmap */
   2209	le32_add_cpu(&dp->nfree, -nblocks);
   2210
   2211	BMAP_LOCK(bmp);
   2212
   2213	/* if this allocation group is completely free,
   2214	 * update the maximum allocation group number if this allocation
   2215	 * group is the new max.
   2216	 */
   2217	agno = blkno >> bmp->db_agl2size;
   2218	if (agno > bmp->db_maxag)
   2219		bmp->db_maxag = agno;
   2220
   2221	/* update the free count for the allocation group and map */
   2222	bmp->db_agfree[agno] -= nblocks;
   2223	bmp->db_nfree -= nblocks;
   2224
   2225	BMAP_UNLOCK(bmp);
   2226}
   2227
   2228
   2229/*
   2230 * NAME:	dbFreeBits()
   2231 *
   2232 * FUNCTION:	free a specified block range from a dmap.
   2233 *
   2234 *		this routine updates the dmap to reflect the working
   2235 *		state allocation of the specified block range. it directly
   2236 *		updates the bits of the working map and causes the adjustment
   2237 *		of the binary buddy system described by the dmap's dmtree
   2238 *		leaves to reflect the bits freed.  it also causes the dmap's
   2239 *		dmtree, as a whole, to reflect the deallocated range.
   2240 *
   2241 * PARAMETERS:
   2242 *	bmp	-  pointer to bmap descriptor
   2243 *	dp	-  pointer to dmap to free bits from.
   2244 *	blkno	-  starting block number of the bits to be freed.
   2245 *	nblocks	-  number of bits to be freed.
   2246 *
   2247 * RETURN VALUES: 0 for success
   2248 *
   2249 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
   2250 */
   2251static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
   2252		       int nblocks)
   2253{
   2254	int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
   2255	dmtree_t *tp = (dmtree_t *) & dp->tree;
   2256	int rc = 0;
   2257	int size;
   2258
   2259	/* determine the bit number and word within the dmap of the
   2260	 * starting block.
   2261	 */
   2262	dbitno = blkno & (BPERDMAP - 1);
   2263	word = dbitno >> L2DBWORD;
   2264
   2265	/* block range better be within the dmap.
   2266	 */
   2267	assert(dbitno + nblocks <= BPERDMAP);
   2268
   2269	/* free the bits of the dmaps words corresponding to the block range.
   2270	 * not all bits of the first and last words may be contained within
   2271	 * the block range.  if this is the case, we'll work against those
   2272	 * words (i.e. partial first and/or last) on an individual basis
   2273	 * (a single pass), freeing the bits of interest by hand and updating
   2274	 * the leaf corresponding to the dmap word. a single pass will be used
   2275	 * for all dmap words fully contained within the specified range.
   2276	 * within this pass, the bits of all fully contained dmap words will
   2277	 * be marked as free in a single shot and the leaves will be updated. a
   2278	 * single leaf may describe the free space of multiple dmap words,
   2279	 * so we may update only a subset of the actual leaves corresponding
   2280	 * to the dmap words of the block range.
   2281	 *
   2282	 * dbJoin() is used to update leaf values and will join the binary
   2283	 * buddy system of the leaves if the new leaf values indicate this
   2284	 * should be done.
   2285	 */
   2286	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
   2287		/* determine the bit number within the word and
   2288		 * the number of bits within the word.
   2289		 */
   2290		wbitno = dbitno & (DBWORD - 1);
   2291		nb = min(rembits, DBWORD - wbitno);
   2292
   2293		/* check if only part of a word is to be freed.
   2294		 */
   2295		if (nb < DBWORD) {
   2296			/* free (zero) the appropriate bits within this
   2297			 * dmap word.
   2298			 */
   2299			dp->wmap[word] &=
   2300			    cpu_to_le32(~(ONES << (DBWORD - nb)
   2301					  >> wbitno));
   2302
   2303			/* update the leaf for this dmap word.
   2304			 */
   2305			rc = dbJoin(tp, word,
   2306				    dbMaxBud((u8 *) & dp->wmap[word]));
   2307			if (rc)
   2308				return rc;
   2309
   2310			word += 1;
   2311		} else {
   2312			/* one or more dmap words are fully contained
   2313			 * within the block range.  determine how many
   2314			 * words and free (zero) the bits of these words.
   2315			 */
   2316			nwords = rembits >> L2DBWORD;
   2317			memset(&dp->wmap[word], 0, nwords * 4);
   2318
   2319			/* determine how many bits.
   2320			 */
   2321			nb = nwords << L2DBWORD;
   2322
   2323			/* now update the appropriate leaves to reflect
   2324			 * the freed words.
   2325			 */
   2326			for (; nwords > 0; nwords -= nw) {
   2327				/* determine what the leaf value should be
   2328				 * updated to as the minimum of the l2 number
   2329				 * of bits being freed and the l2 (max) number
   2330				 * of bits that can be described by this leaf.
   2331				 */
   2332				size =
   2333				    min(LITOL2BSZ
   2334					(word, L2LPERDMAP, BUDMIN),
   2335					NLSTOL2BSZ(nwords));
   2336
   2337				/* update the leaf.
   2338				 */
   2339				rc = dbJoin(tp, word, size);
   2340				if (rc)
   2341					return rc;
   2342
   2343				/* get the number of dmap words handled.
   2344				 */
   2345				nw = BUDSIZE(size, BUDMIN);
   2346				word += nw;
   2347			}
   2348		}
   2349	}
   2350
   2351	/* update the free count for this dmap.
   2352	 */
   2353	le32_add_cpu(&dp->nfree, nblocks);
   2354
   2355	BMAP_LOCK(bmp);
   2356
   2357	/* update the free count for the allocation group and
   2358	 * map.
   2359	 */
   2360	agno = blkno >> bmp->db_agl2size;
   2361	bmp->db_nfree += nblocks;
   2362	bmp->db_agfree[agno] += nblocks;
   2363
   2364	/* check if this allocation group is not completely free and
   2365	 * if it is currently the maximum (rightmost) allocation group.
   2366	 * if so, establish the new maximum allocation group number by
   2367	 * searching left for the first allocation group with allocation.
   2368	 */
   2369	if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
   2370	    (agno == bmp->db_numag - 1 &&
   2371	     bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) {
   2372		while (bmp->db_maxag > 0) {
   2373			bmp->db_maxag -= 1;
   2374			if (bmp->db_agfree[bmp->db_maxag] !=
   2375			    bmp->db_agsize)
   2376				break;
   2377		}
   2378
   2379		/* re-establish the allocation group preference if the
   2380		 * current preference is right of the maximum allocation
   2381		 * group.
   2382		 */
   2383		if (bmp->db_agpref > bmp->db_maxag)
   2384			bmp->db_agpref = bmp->db_maxag;
   2385	}
   2386
   2387	BMAP_UNLOCK(bmp);
   2388
   2389	return 0;
   2390}
   2391
   2392
   2393/*
   2394 * NAME:	dbAdjCtl()
   2395 *
   2396 * FUNCTION:	adjust a dmap control page at a specified level to reflect
   2397 *		the change in a lower level dmap or dmap control page's
   2398 *		maximum string of free blocks (i.e. a change in the root
   2399 *		of the lower level object's dmtree) due to the allocation
   2400 *		or deallocation of a range of blocks with a single dmap.
   2401 *
   2402 *		on entry, this routine is provided with the new value of
   2403 *		the lower level dmap or dmap control page root and the
   2404 *		starting block number of the block range whose allocation
   2405 *		or deallocation resulted in the root change.  this range
   2406 *		is respresented by a single leaf of the current dmapctl
   2407 *		and the leaf will be updated with this value, possibly
   2408 *		causing a binary buddy system within the leaves to be
   2409 *		split or joined.  the update may also cause the dmapctl's
   2410 *		dmtree to be updated.
   2411 *
   2412 *		if the adjustment of the dmap control page, itself, causes its
   2413 *		root to change, this change will be bubbled up to the next dmap
   2414 *		control level by a recursive call to this routine, specifying
   2415 *		the new root value and the next dmap control page level to
   2416 *		be adjusted.
   2417 * PARAMETERS:
   2418 *	bmp	-  pointer to bmap descriptor
   2419 *	blkno	-  the first block of a block range within a dmap.  it is
   2420 *		   the allocation or deallocation of this block range that
   2421 *		   requires the dmap control page to be adjusted.
   2422 *	newval	-  the new value of the lower level dmap or dmap control
   2423 *		   page root.
   2424 *	alloc	-  'true' if adjustment is due to an allocation.
   2425 *	level	-  current level of dmap control page (i.e. L0, L1, L2) to
   2426 *		   be adjusted.
   2427 *
   2428 * RETURN VALUES:
   2429 *	0	- success
   2430 *	-EIO	- i/o error
   2431 *
   2432 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
   2433 */
   2434static int
   2435dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
   2436{
   2437	struct metapage *mp;
   2438	s8 oldroot;
   2439	int oldval;
   2440	s64 lblkno;
   2441	struct dmapctl *dcp;
   2442	int rc, leafno, ti;
   2443
   2444	/* get the buffer for the dmap control page for the specified
   2445	 * block number and control page level.
   2446	 */
   2447	lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level);
   2448	mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
   2449	if (mp == NULL)
   2450		return -EIO;
   2451	dcp = (struct dmapctl *) mp->data;
   2452
   2453	if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
   2454		jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
   2455		release_metapage(mp);
   2456		return -EIO;
   2457	}
   2458
   2459	/* determine the leaf number corresponding to the block and
   2460	 * the index within the dmap control tree.
   2461	 */
   2462	leafno = BLKTOCTLLEAF(blkno, dcp->budmin);
   2463	ti = leafno + le32_to_cpu(dcp->leafidx);
   2464
   2465	/* save the current leaf value and the current root level (i.e.
   2466	 * maximum l2 free string described by this dmapctl).
   2467	 */
   2468	oldval = dcp->stree[ti];
   2469	oldroot = dcp->stree[ROOT];
   2470
   2471	/* check if this is a control page update for an allocation.
   2472	 * if so, update the leaf to reflect the new leaf value using
   2473	 * dbSplit(); otherwise (deallocation), use dbJoin() to update
   2474	 * the leaf with the new value.  in addition to updating the
   2475	 * leaf, dbSplit() will also split the binary buddy system of
   2476	 * the leaves, if required, and bubble new values within the
   2477	 * dmapctl tree, if required.  similarly, dbJoin() will join
   2478	 * the binary buddy system of leaves and bubble new values up
   2479	 * the dmapctl tree as required by the new leaf value.
   2480	 */
   2481	if (alloc) {
   2482		/* check if we are in the middle of a binary buddy
   2483		 * system.  this happens when we are performing the
   2484		 * first allocation out of an allocation group that
   2485		 * is part (not the first part) of a larger binary
   2486		 * buddy system.  if we are in the middle, back split
   2487		 * the system prior to calling dbSplit() which assumes
   2488		 * that it is at the front of a binary buddy system.
   2489		 */
   2490		if (oldval == NOFREE) {
   2491			rc = dbBackSplit((dmtree_t *) dcp, leafno);
   2492			if (rc) {
   2493				release_metapage(mp);
   2494				return rc;
   2495			}
   2496			oldval = dcp->stree[ti];
   2497		}
   2498		dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
   2499	} else {
   2500		rc = dbJoin((dmtree_t *) dcp, leafno, newval);
   2501		if (rc) {
   2502			release_metapage(mp);
   2503			return rc;
   2504		}
   2505	}
   2506
   2507	/* check if the root of the current dmap control page changed due
   2508	 * to the update and if the current dmap control page is not at
   2509	 * the current top level (i.e. L0, L1, L2) of the map.  if so (i.e.
   2510	 * root changed and this is not the top level), call this routine
   2511	 * again (recursion) for the next higher level of the mapping to
   2512	 * reflect the change in root for the current dmap control page.
   2513	 */
   2514	if (dcp->stree[ROOT] != oldroot) {
   2515		/* are we below the top level of the map.  if so,
   2516		 * bubble the root up to the next higher level.
   2517		 */
   2518		if (level < bmp->db_maxlevel) {
   2519			/* bubble up the new root of this dmap control page to
   2520			 * the next level.
   2521			 */
   2522			if ((rc =
   2523			     dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc,
   2524				      level + 1))) {
   2525				/* something went wrong in bubbling up the new
   2526				 * root value, so backout the changes to the
   2527				 * current dmap control page.
   2528				 */
   2529				if (alloc) {
   2530					dbJoin((dmtree_t *) dcp, leafno,
   2531					       oldval);
   2532				} else {
   2533					/* the dbJoin() above might have
   2534					 * caused a larger binary buddy system
   2535					 * to form and we may now be in the
   2536					 * middle of it.  if this is the case,
   2537					 * back split the buddies.
   2538					 */
   2539					if (dcp->stree[ti] == NOFREE)
   2540						dbBackSplit((dmtree_t *)
   2541							    dcp, leafno);
   2542					dbSplit((dmtree_t *) dcp, leafno,
   2543						dcp->budmin, oldval);
   2544				}
   2545
   2546				/* release the buffer and return the error.
   2547				 */
   2548				release_metapage(mp);
   2549				return (rc);
   2550			}
   2551		} else {
   2552			/* we're at the top level of the map. update
   2553			 * the bmap control page to reflect the size
   2554			 * of the maximum free buddy system.
   2555			 */
   2556			assert(level == bmp->db_maxlevel);
   2557			if (bmp->db_maxfreebud != oldroot) {
   2558				jfs_error(bmp->db_ipbmap->i_sb,
   2559					  "the maximum free buddy is not the old root\n");
   2560			}
   2561			bmp->db_maxfreebud = dcp->stree[ROOT];
   2562		}
   2563	}
   2564
   2565	/* write the buffer.
   2566	 */
   2567	write_metapage(mp);
   2568
   2569	return (0);
   2570}
   2571
   2572
   2573/*
   2574 * NAME:	dbSplit()
   2575 *
   2576 * FUNCTION:	update the leaf of a dmtree with a new value, splitting
   2577 *		the leaf from the binary buddy system of the dmtree's
   2578 *		leaves, as required.
   2579 *
   2580 * PARAMETERS:
   2581 *	tp	- pointer to the tree containing the leaf.
   2582 *	leafno	- the number of the leaf to be updated.
   2583 *	splitsz	- the size the binary buddy system starting at the leaf
   2584 *		  must be split to, specified as the log2 number of blocks.
   2585 *	newval	- the new value for the leaf.
   2586 *
   2587 * RETURN VALUES: none
   2588 *
   2589 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
   2590 */
   2591static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
   2592{
   2593	int budsz;
   2594	int cursz;
   2595	s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
   2596
   2597	/* check if the leaf needs to be split.
   2598	 */
   2599	if (leaf[leafno] > tp->dmt_budmin) {
   2600		/* the split occurs by cutting the buddy system in half
   2601		 * at the specified leaf until we reach the specified
   2602		 * size.  pick up the starting split size (current size
   2603		 * - 1 in l2) and the corresponding buddy size.
   2604		 */
   2605		cursz = leaf[leafno] - 1;
   2606		budsz = BUDSIZE(cursz, tp->dmt_budmin);
   2607
   2608		/* split until we reach the specified size.
   2609		 */
   2610		while (cursz >= splitsz) {
   2611			/* update the buddy's leaf with its new value.
   2612			 */
   2613			dbAdjTree(tp, leafno ^ budsz, cursz);
   2614
   2615			/* on to the next size and buddy.
   2616			 */
   2617			cursz -= 1;
   2618			budsz >>= 1;
   2619		}
   2620	}
   2621
   2622	/* adjust the dmap tree to reflect the specified leaf's new
   2623	 * value.
   2624	 */
   2625	dbAdjTree(tp, leafno, newval);
   2626}
   2627
   2628
   2629/*
   2630 * NAME:	dbBackSplit()
   2631 *
   2632 * FUNCTION:	back split the binary buddy system of dmtree leaves
   2633 *		that hold a specified leaf until the specified leaf
   2634 *		starts its own binary buddy system.
   2635 *
   2636 *		the allocators typically perform allocations at the start
   2637 *		of binary buddy systems and dbSplit() is used to accomplish
   2638 *		any required splits.  in some cases, however, allocation
   2639 *		may occur in the middle of a binary system and requires a
   2640 *		back split, with the split proceeding out from the middle of
   2641 *		the system (less efficient) rather than the start of the
   2642 *		system (more efficient).  the cases in which a back split
   2643 *		is required are rare and are limited to the first allocation
   2644 *		within an allocation group which is a part (not first part)
   2645 *		of a larger binary buddy system and a few exception cases
   2646 *		in which a previous join operation must be backed out.
   2647 *
   2648 * PARAMETERS:
   2649 *	tp	- pointer to the tree containing the leaf.
   2650 *	leafno	- the number of the leaf to be updated.
   2651 *
   2652 * RETURN VALUES: none
   2653 *
   2654 * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
   2655 */
   2656static int dbBackSplit(dmtree_t * tp, int leafno)
   2657{
   2658	int budsz, bud, w, bsz, size;
   2659	int cursz;
   2660	s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
   2661
   2662	/* leaf should be part (not first part) of a binary
   2663	 * buddy system.
   2664	 */
   2665	assert(leaf[leafno] == NOFREE);
   2666
   2667	/* the back split is accomplished by iteratively finding the leaf
   2668	 * that starts the buddy system that contains the specified leaf and
   2669	 * splitting that system in two.  this iteration continues until
   2670	 * the specified leaf becomes the start of a buddy system.
   2671	 *
   2672	 * determine maximum possible l2 size for the specified leaf.
   2673	 */
   2674	size =
   2675	    LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs),
   2676		      tp->dmt_budmin);
   2677
   2678	/* determine the number of leaves covered by this size.  this
   2679	 * is the buddy size that we will start with as we search for
   2680	 * the buddy system that contains the specified leaf.
   2681	 */
   2682	budsz = BUDSIZE(size, tp->dmt_budmin);
   2683
   2684	/* back split.
   2685	 */
   2686	while (leaf[leafno] == NOFREE) {
   2687		/* find the leftmost buddy leaf.
   2688		 */
   2689		for (w = leafno, bsz = budsz;; bsz <<= 1,
   2690		     w = (w < bud) ? w : bud) {
   2691			if (bsz >= le32_to_cpu(tp->dmt_nleafs)) {
   2692				jfs_err("JFS: block map error in dbBackSplit");
   2693				return -EIO;
   2694			}
   2695
   2696			/* determine the buddy.
   2697			 */
   2698			bud = w ^ bsz;
   2699
   2700			/* check if this buddy is the start of the system.
   2701			 */
   2702			if (leaf[bud] != NOFREE) {
   2703				/* split the leaf at the start of the
   2704				 * system in two.
   2705				 */
   2706				cursz = leaf[bud] - 1;
   2707				dbSplit(tp, bud, cursz, cursz);
   2708				break;
   2709			}
   2710		}
   2711	}
   2712
   2713	if (leaf[leafno] != size) {
   2714		jfs_err("JFS: wrong leaf value in dbBackSplit");
   2715		return -EIO;
   2716	}
   2717	return 0;
   2718}
   2719
   2720
   2721/*
   2722 * NAME:	dbJoin()
   2723 *
   2724 * FUNCTION:	update the leaf of a dmtree with a new value, joining
   2725 *		the leaf with other leaves of the dmtree into a multi-leaf
   2726 *		binary buddy system, as required.
   2727 *
   2728 * PARAMETERS:
   2729 *	tp	- pointer to the tree containing the leaf.
   2730 *	leafno	- the number of the leaf to be updated.
   2731 *	newval	- the new value for the leaf.
   2732 *
   2733 * RETURN VALUES: none
   2734 */
   2735static int dbJoin(dmtree_t * tp, int leafno, int newval)
   2736{
   2737	int budsz, buddy;
   2738	s8 *leaf;
   2739
   2740	/* can the new leaf value require a join with other leaves ?
   2741	 */
   2742	if (newval >= tp->dmt_budmin) {
   2743		/* pickup a pointer to the leaves of the tree.
   2744		 */
   2745		leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
   2746
   2747		/* try to join the specified leaf into a large binary
   2748		 * buddy system.  the join proceeds by attempting to join
   2749		 * the specified leafno with its buddy (leaf) at new value.
   2750		 * if the join occurs, we attempt to join the left leaf
   2751		 * of the joined buddies with its buddy at new value + 1.
   2752		 * we continue to join until we find a buddy that cannot be
   2753		 * joined (does not have a value equal to the size of the
   2754		 * last join) or until all leaves have been joined into a
   2755		 * single system.
   2756		 *
   2757		 * get the buddy size (number of words covered) of
   2758		 * the new value.
   2759		 */
   2760		budsz = BUDSIZE(newval, tp->dmt_budmin);
   2761
   2762		/* try to join.
   2763		 */
   2764		while (budsz < le32_to_cpu(tp->dmt_nleafs)) {
   2765			/* get the buddy leaf.
   2766			 */
   2767			buddy = leafno ^ budsz;
   2768
   2769			/* if the leaf's new value is greater than its
   2770			 * buddy's value, we join no more.
   2771			 */
   2772			if (newval > leaf[buddy])
   2773				break;
   2774
   2775			/* It shouldn't be less */
   2776			if (newval < leaf[buddy])
   2777				return -EIO;
   2778
   2779			/* check which (leafno or buddy) is the left buddy.
   2780			 * the left buddy gets to claim the blocks resulting
   2781			 * from the join while the right gets to claim none.
   2782			 * the left buddy is also eligible to participate in
   2783			 * a join at the next higher level while the right
   2784			 * is not.
   2785			 *
   2786			 */
   2787			if (leafno < buddy) {
   2788				/* leafno is the left buddy.
   2789				 */
   2790				dbAdjTree(tp, buddy, NOFREE);
   2791			} else {
   2792				/* buddy is the left buddy and becomes
   2793				 * leafno.
   2794				 */
   2795				dbAdjTree(tp, leafno, NOFREE);
   2796				leafno = buddy;
   2797			}
   2798
   2799			/* on to try the next join.
   2800			 */
   2801			newval += 1;
   2802			budsz <<= 1;
   2803		}
   2804	}
   2805
   2806	/* update the leaf value.
   2807	 */
   2808	dbAdjTree(tp, leafno, newval);
   2809
   2810	return 0;
   2811}
   2812
   2813
   2814/*
   2815 * NAME:	dbAdjTree()
   2816 *
   2817 * FUNCTION:	update a leaf of a dmtree with a new value, adjusting
   2818 *		the dmtree, as required, to reflect the new leaf value.
   2819 *		the combination of any buddies must already be done before
   2820 *		this is called.
   2821 *
   2822 * PARAMETERS:
   2823 *	tp	- pointer to the tree to be adjusted.
   2824 *	leafno	- the number of the leaf to be updated.
   2825 *	newval	- the new value for the leaf.
   2826 *
   2827 * RETURN VALUES: none
   2828 */
   2829static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
   2830{
   2831	int lp, pp, k;
   2832	int max;
   2833
   2834	/* pick up the index of the leaf for this leafno.
   2835	 */
   2836	lp = leafno + le32_to_cpu(tp->dmt_leafidx);
   2837
   2838	/* is the current value the same as the old value ?  if so,
   2839	 * there is nothing to do.
   2840	 */
   2841	if (tp->dmt_stree[lp] == newval)
   2842		return;
   2843
   2844	/* set the new value.
   2845	 */
   2846	tp->dmt_stree[lp] = newval;
   2847
   2848	/* bubble the new value up the tree as required.
   2849	 */
   2850	for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) {
   2851		/* get the index of the first leaf of the 4 leaf
   2852		 * group containing the specified leaf (leafno).
   2853		 */
   2854		lp = ((lp - 1) & ~0x03) + 1;
   2855
   2856		/* get the index of the parent of this 4 leaf group.
   2857		 */
   2858		pp = (lp - 1) >> 2;
   2859
   2860		/* determine the maximum of the 4 leaves.
   2861		 */
   2862		max = TREEMAX(&tp->dmt_stree[lp]);
   2863
   2864		/* if the maximum of the 4 is the same as the
   2865		 * parent's value, we're done.
   2866		 */
   2867		if (tp->dmt_stree[pp] == max)
   2868			break;
   2869
   2870		/* parent gets new value.
   2871		 */
   2872		tp->dmt_stree[pp] = max;
   2873
   2874		/* parent becomes leaf for next go-round.
   2875		 */
   2876		lp = pp;
   2877	}
   2878}
   2879
   2880
   2881/*
   2882 * NAME:	dbFindLeaf()
   2883 *
   2884 * FUNCTION:	search a dmtree_t for sufficient free blocks, returning
   2885 *		the index of a leaf describing the free blocks if
   2886 *		sufficient free blocks are found.
   2887 *
   2888 *		the search starts at the top of the dmtree_t tree and
   2889 *		proceeds down the tree to the leftmost leaf with sufficient
   2890 *		free space.
   2891 *
   2892 * PARAMETERS:
   2893 *	tp	- pointer to the tree to be searched.
   2894 *	l2nb	- log2 number of free blocks to search for.
   2895 *	leafidx	- return pointer to be set to the index of the leaf
   2896 *		  describing at least l2nb free blocks if sufficient
   2897 *		  free blocks are found.
   2898 *
   2899 * RETURN VALUES:
   2900 *	0	- success
   2901 *	-ENOSPC	- insufficient free blocks.
   2902 */
   2903static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
   2904{
   2905	int ti, n = 0, k, x = 0;
   2906
   2907	/* first check the root of the tree to see if there is
   2908	 * sufficient free space.
   2909	 */
   2910	if (l2nb > tp->dmt_stree[ROOT])
   2911		return -ENOSPC;
   2912
   2913	/* sufficient free space available. now search down the tree
   2914	 * starting at the next level for the leftmost leaf that
   2915	 * describes sufficient free space.
   2916	 */
   2917	for (k = le32_to_cpu(tp->dmt_height), ti = 1;
   2918	     k > 0; k--, ti = ((ti + n) << 2) + 1) {
   2919		/* search the four nodes at this level, starting from
   2920		 * the left.
   2921		 */
   2922		for (x = ti, n = 0; n < 4; n++) {
   2923			/* sufficient free space found.  move to the next
   2924			 * level (or quit if this is the last level).
   2925			 */
   2926			if (l2nb <= tp->dmt_stree[x + n])
   2927				break;
   2928		}
   2929
   2930		/* better have found something since the higher
   2931		 * levels of the tree said it was here.
   2932		 */
   2933		assert(n < 4);
   2934	}
   2935
   2936	/* set the return to the leftmost leaf describing sufficient
   2937	 * free space.
   2938	 */
   2939	*leafidx = x + n - le32_to_cpu(tp->dmt_leafidx);
   2940
   2941	return (0);
   2942}
   2943
   2944
   2945/*
   2946 * NAME:	dbFindBits()
   2947 *
   2948 * FUNCTION:	find a specified number of binary buddy free bits within a
   2949 *		dmap bitmap word value.
   2950 *
   2951 *		this routine searches the bitmap value for (1 << l2nb) free
   2952 *		bits at (1 << l2nb) alignments within the value.
   2953 *
   2954 * PARAMETERS:
   2955 *	word	-  dmap bitmap word value.
   2956 *	l2nb	-  number of free bits specified as a log2 number.
   2957 *
   2958 * RETURN VALUES:
   2959 *	starting bit number of free bits.
   2960 */
   2961static int dbFindBits(u32 word, int l2nb)
   2962{
   2963	int bitno, nb;
   2964	u32 mask;
   2965
   2966	/* get the number of bits.
   2967	 */
   2968	nb = 1 << l2nb;
   2969	assert(nb <= DBWORD);
   2970
   2971	/* complement the word so we can use a mask (i.e. 0s represent
   2972	 * free bits) and compute the mask.
   2973	 */
   2974	word = ~word;
   2975	mask = ONES << (DBWORD - nb);
   2976
   2977	/* scan the word for nb free bits at nb alignments.
   2978	 */
   2979	for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) {
   2980		if ((mask & word) == mask)
   2981			break;
   2982	}
   2983
   2984	ASSERT(bitno < 32);
   2985
   2986	/* return the bit number.
   2987	 */
   2988	return (bitno);
   2989}
   2990
   2991
   2992/*
   2993 * NAME:	dbMaxBud(u8 *cp)
   2994 *
   2995 * FUNCTION:	determine the largest binary buddy string of free
   2996 *		bits within 32-bits of the map.
   2997 *
   2998 * PARAMETERS:
   2999 *	cp	-  pointer to the 32-bit value.
   3000 *
   3001 * RETURN VALUES:
   3002 *	largest binary buddy of free bits within a dmap word.
   3003 */
   3004static int dbMaxBud(u8 * cp)
   3005{
   3006	signed char tmp1, tmp2;
   3007
   3008	/* check if the wmap word is all free. if so, the
   3009	 * free buddy size is BUDMIN.
   3010	 */
   3011	if (*((uint *) cp) == 0)
   3012		return (BUDMIN);
   3013
   3014	/* check if the wmap word is half free. if so, the
   3015	 * free buddy size is BUDMIN-1.
   3016	 */
   3017	if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0)
   3018		return (BUDMIN - 1);
   3019
   3020	/* not all free or half free. determine the free buddy
   3021	 * size thru table lookup using quarters of the wmap word.
   3022	 */
   3023	tmp1 = max(budtab[cp[2]], budtab[cp[3]]);
   3024	tmp2 = max(budtab[cp[0]], budtab[cp[1]]);
   3025	return (max(tmp1, tmp2));
   3026}
   3027
   3028
   3029/*
   3030 * NAME:	cnttz(uint word)
   3031 *
   3032 * FUNCTION:	determine the number of trailing zeros within a 32-bit
   3033 *		value.
   3034 *
   3035 * PARAMETERS:
   3036 *	value	-  32-bit value to be examined.
   3037 *
   3038 * RETURN VALUES:
   3039 *	count of trailing zeros
   3040 */
   3041static int cnttz(u32 word)
   3042{
   3043	int n;
   3044
   3045	for (n = 0; n < 32; n++, word >>= 1) {
   3046		if (word & 0x01)
   3047			break;
   3048	}
   3049
   3050	return (n);
   3051}
   3052
   3053
   3054/*
   3055 * NAME:	cntlz(u32 value)
   3056 *
   3057 * FUNCTION:	determine the number of leading zeros within a 32-bit
   3058 *		value.
   3059 *
   3060 * PARAMETERS:
   3061 *	value	-  32-bit value to be examined.
   3062 *
   3063 * RETURN VALUES:
   3064 *	count of leading zeros
   3065 */
   3066static int cntlz(u32 value)
   3067{
   3068	int n;
   3069
   3070	for (n = 0; n < 32; n++, value <<= 1) {
   3071		if (value & HIGHORDER)
   3072			break;
   3073	}
   3074	return (n);
   3075}
   3076
   3077
   3078/*
   3079 * NAME:	blkstol2(s64 nb)
   3080 *
   3081 * FUNCTION:	convert a block count to its log2 value. if the block
   3082 *		count is not a l2 multiple, it is rounded up to the next
   3083 *		larger l2 multiple.
   3084 *
   3085 * PARAMETERS:
   3086 *	nb	-  number of blocks
   3087 *
   3088 * RETURN VALUES:
   3089 *	log2 number of blocks
   3090 */
   3091static int blkstol2(s64 nb)
   3092{
   3093	int l2nb;
   3094	s64 mask;		/* meant to be signed */
   3095
   3096	mask = (s64) 1 << (64 - 1);
   3097
   3098	/* count the leading bits.
   3099	 */
   3100	for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) {
   3101		/* leading bit found.
   3102		 */
   3103		if (nb & mask) {
   3104			/* determine the l2 value.
   3105			 */
   3106			l2nb = (64 - 1) - l2nb;
   3107
   3108			/* check if we need to round up.
   3109			 */
   3110			if (~mask & nb)
   3111				l2nb++;
   3112
   3113			return (l2nb);
   3114		}
   3115	}
   3116	assert(0);
   3117	return 0;		/* fix compiler warning */
   3118}
   3119
   3120
   3121/*
   3122 * NAME:	dbAllocBottomUp()
   3123 *
   3124 * FUNCTION:	alloc the specified block range from the working block
   3125 *		allocation map.
   3126 *
   3127 *		the blocks will be alloc from the working map one dmap
   3128 *		at a time.
   3129 *
   3130 * PARAMETERS:
   3131 *	ip	-  pointer to in-core inode;
   3132 *	blkno	-  starting block number to be freed.
   3133 *	nblocks	-  number of blocks to be freed.
   3134 *
   3135 * RETURN VALUES:
   3136 *	0	- success
   3137 *	-EIO	- i/o error
   3138 */
   3139int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
   3140{
   3141	struct metapage *mp;
   3142	struct dmap *dp;
   3143	int nb, rc;
   3144	s64 lblkno, rem;
   3145	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
   3146	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
   3147
   3148	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
   3149
   3150	/* block to be allocated better be within the mapsize. */
   3151	ASSERT(nblocks <= bmp->db_mapsize - blkno);
   3152
   3153	/*
   3154	 * allocate the blocks a dmap at a time.
   3155	 */
   3156	mp = NULL;
   3157	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
   3158		/* release previous dmap if any */
   3159		if (mp) {
   3160			write_metapage(mp);
   3161		}
   3162
   3163		/* get the buffer for the current dmap. */
   3164		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
   3165		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
   3166		if (mp == NULL) {
   3167			IREAD_UNLOCK(ipbmap);
   3168			return -EIO;
   3169		}
   3170		dp = (struct dmap *) mp->data;
   3171
   3172		/* determine the number of blocks to be allocated from
   3173		 * this dmap.
   3174		 */
   3175		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
   3176
   3177		/* allocate the blocks. */
   3178		if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) {
   3179			release_metapage(mp);
   3180			IREAD_UNLOCK(ipbmap);
   3181			return (rc);
   3182		}
   3183	}
   3184
   3185	/* write the last buffer. */
   3186	write_metapage(mp);
   3187
   3188	IREAD_UNLOCK(ipbmap);
   3189
   3190	return (0);
   3191}
   3192
   3193
   3194static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
   3195			 int nblocks)
   3196{
   3197	int rc;
   3198	int dbitno, word, rembits, nb, nwords, wbitno, agno;
   3199	s8 oldroot;
   3200	struct dmaptree *tp = (struct dmaptree *) & dp->tree;
   3201
   3202	/* save the current value of the root (i.e. maximum free string)
   3203	 * of the dmap tree.
   3204	 */
   3205	oldroot = tp->stree[ROOT];
   3206
   3207	/* determine the bit number and word within the dmap of the
   3208	 * starting block.
   3209	 */
   3210	dbitno = blkno & (BPERDMAP - 1);
   3211	word = dbitno >> L2DBWORD;
   3212
   3213	/* block range better be within the dmap */
   3214	assert(dbitno + nblocks <= BPERDMAP);
   3215
   3216	/* allocate the bits of the dmap's words corresponding to the block
   3217	 * range. not all bits of the first and last words may be contained
   3218	 * within the block range.  if this is the case, we'll work against
   3219	 * those words (i.e. partial first and/or last) on an individual basis
   3220	 * (a single pass), allocating the bits of interest by hand and
   3221	 * updating the leaf corresponding to the dmap word. a single pass
   3222	 * will be used for all dmap words fully contained within the
   3223	 * specified range.  within this pass, the bits of all fully contained
   3224	 * dmap words will be marked as free in a single shot and the leaves
   3225	 * will be updated. a single leaf may describe the free space of
   3226	 * multiple dmap words, so we may update only a subset of the actual
   3227	 * leaves corresponding to the dmap words of the block range.
   3228	 */
   3229	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
   3230		/* determine the bit number within the word and
   3231		 * the number of bits within the word.
   3232		 */
   3233		wbitno = dbitno & (DBWORD - 1);
   3234		nb = min(rembits, DBWORD - wbitno);
   3235
   3236		/* check if only part of a word is to be allocated.
   3237		 */
   3238		if (nb < DBWORD) {
   3239			/* allocate (set to 1) the appropriate bits within
   3240			 * this dmap word.
   3241			 */
   3242			dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
   3243						      >> wbitno);
   3244
   3245			word++;
   3246		} else {
   3247			/* one or more dmap words are fully contained
   3248			 * within the block range.  determine how many
   3249			 * words and allocate (set to 1) the bits of these
   3250			 * words.
   3251			 */
   3252			nwords = rembits >> L2DBWORD;
   3253			memset(&dp->wmap[word], (int) ONES, nwords * 4);
   3254
   3255			/* determine how many bits */
   3256			nb = nwords << L2DBWORD;
   3257			word += nwords;
   3258		}
   3259	}
   3260
   3261	/* update the free count for this dmap */
   3262	le32_add_cpu(&dp->nfree, -nblocks);
   3263
   3264	/* reconstruct summary tree */
   3265	dbInitDmapTree(dp);
   3266
   3267	BMAP_LOCK(bmp);
   3268
   3269	/* if this allocation group is completely free,
   3270	 * update the highest active allocation group number
   3271	 * if this allocation group is the new max.
   3272	 */
   3273	agno = blkno >> bmp->db_agl2size;
   3274	if (agno > bmp->db_maxag)
   3275		bmp->db_maxag = agno;
   3276
   3277	/* update the free count for the allocation group and map */
   3278	bmp->db_agfree[agno] -= nblocks;
   3279	bmp->db_nfree -= nblocks;
   3280
   3281	BMAP_UNLOCK(bmp);
   3282
   3283	/* if the root has not changed, done. */
   3284	if (tp->stree[ROOT] == oldroot)
   3285		return (0);
   3286
   3287	/* root changed. bubble the change up to the dmap control pages.
   3288	 * if the adjustment of the upper level control pages fails,
   3289	 * backout the bit allocation (thus making everything consistent).
   3290	 */
   3291	if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0)))
   3292		dbFreeBits(bmp, dp, blkno, nblocks);
   3293
   3294	return (rc);
   3295}
   3296
   3297
   3298/*
   3299 * NAME:	dbExtendFS()
   3300 *
   3301 * FUNCTION:	extend bmap from blkno for nblocks;
   3302 *		dbExtendFS() updates bmap ready for dbAllocBottomUp();
   3303 *
   3304 * L2
   3305 *  |
   3306 *   L1---------------------------------L1
   3307 *    |					 |
   3308 *     L0---------L0---------L0		  L0---------L0---------L0
   3309 *      |	   |	      |		   |	      |		 |
   3310 *	 d0,...,dn  d0,...,dn  d0,...,dn    d0,...,dn  d0,...,dn  d0,.,dm;
   3311 * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
   3312 *
   3313 * <---old---><----------------------------extend----------------------->
   3314 */
   3315int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
   3316{
   3317	struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
   3318	int nbperpage = sbi->nbperpage;
   3319	int i, i0 = true, j, j0 = true, k, n;
   3320	s64 newsize;
   3321	s64 p;
   3322	struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL;
   3323	struct dmapctl *l2dcp, *l1dcp, *l0dcp;
   3324	struct dmap *dp;
   3325	s8 *l0leaf, *l1leaf, *l2leaf;
   3326	struct bmap *bmp = sbi->bmap;
   3327	int agno, l2agsize, oldl2agsize;
   3328	s64 ag_rem;
   3329
   3330	newsize = blkno + nblocks;
   3331
   3332	jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld",
   3333		 (long long) blkno, (long long) nblocks, (long long) newsize);
   3334
   3335	/*
   3336	 *	initialize bmap control page.
   3337	 *
   3338	 * all the data in bmap control page should exclude
   3339	 * the mkfs hidden dmap page.
   3340	 */
   3341
   3342	/* update mapsize */
   3343	bmp->db_mapsize = newsize;
   3344	bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize);
   3345
   3346	/* compute new AG size */
   3347	l2agsize = dbGetL2AGSize(newsize);
   3348	oldl2agsize = bmp->db_agl2size;
   3349
   3350	bmp->db_agl2size = l2agsize;
   3351	bmp->db_agsize = 1 << l2agsize;
   3352
   3353	/* compute new number of AG */
   3354	agno = bmp->db_numag;
   3355	bmp->db_numag = newsize >> l2agsize;
   3356	bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
   3357
   3358	/*
   3359	 *	reconfigure db_agfree[]
   3360	 * from old AG configuration to new AG configuration;
   3361	 *
   3362	 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
   3363	 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
   3364	 * note: new AG size = old AG size * (2**x).
   3365	 */
   3366	if (l2agsize == oldl2agsize)
   3367		goto extend;
   3368	k = 1 << (l2agsize - oldl2agsize);
   3369	ag_rem = bmp->db_agfree[0];	/* save agfree[0] */
   3370	for (i = 0, n = 0; i < agno; n++) {
   3371		bmp->db_agfree[n] = 0;	/* init collection point */
   3372
   3373		/* coalesce contiguous k AGs; */
   3374		for (j = 0; j < k && i < agno; j++, i++) {
   3375			/* merge AGi to AGn */
   3376			bmp->db_agfree[n] += bmp->db_agfree[i];
   3377		}
   3378	}
   3379	bmp->db_agfree[0] += ag_rem;	/* restore agfree[0] */
   3380
   3381	for (; n < MAXAG; n++)
   3382		bmp->db_agfree[n] = 0;
   3383
   3384	/*
   3385	 * update highest active ag number
   3386	 */
   3387
   3388	bmp->db_maxag = bmp->db_maxag / k;
   3389
   3390	/*
   3391	 *	extend bmap
   3392	 *
   3393	 * update bit maps and corresponding level control pages;
   3394	 * global control page db_nfree, db_agfree[agno], db_maxfreebud;
   3395	 */
   3396      extend:
   3397	/* get L2 page */
   3398	p = BMAPBLKNO + nbperpage;	/* L2 page */
   3399	l2mp = read_metapage(ipbmap, p, PSIZE, 0);
   3400	if (!l2mp) {
   3401		jfs_error(ipbmap->i_sb, "L2 page could not be read\n");
   3402		return -EIO;
   3403	}
   3404	l2dcp = (struct dmapctl *) l2mp->data;
   3405
   3406	/* compute start L1 */
   3407	k = blkno >> L2MAXL1SIZE;
   3408	l2leaf = l2dcp->stree + CTLLEAFIND + k;
   3409	p = BLKTOL1(blkno, sbi->l2nbperpage);	/* L1 page */
   3410
   3411	/*
   3412	 * extend each L1 in L2
   3413	 */
   3414	for (; k < LPERCTL; k++, p += nbperpage) {
   3415		/* get L1 page */
   3416		if (j0) {
   3417			/* read in L1 page: (blkno & (MAXL1SIZE - 1)) */
   3418			l1mp = read_metapage(ipbmap, p, PSIZE, 0);
   3419			if (l1mp == NULL)
   3420				goto errout;
   3421			l1dcp = (struct dmapctl *) l1mp->data;
   3422
   3423			/* compute start L0 */
   3424			j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
   3425			l1leaf = l1dcp->stree + CTLLEAFIND + j;
   3426			p = BLKTOL0(blkno, sbi->l2nbperpage);
   3427			j0 = false;
   3428		} else {
   3429			/* assign/init L1 page */
   3430			l1mp = get_metapage(ipbmap, p, PSIZE, 0);
   3431			if (l1mp == NULL)
   3432				goto errout;
   3433
   3434			l1dcp = (struct dmapctl *) l1mp->data;
   3435
   3436			/* compute start L0 */
   3437			j = 0;
   3438			l1leaf = l1dcp->stree + CTLLEAFIND;
   3439			p += nbperpage;	/* 1st L0 of L1.k */
   3440		}
   3441
   3442		/*
   3443		 * extend each L0 in L1
   3444		 */
   3445		for (; j < LPERCTL; j++) {
   3446			/* get L0 page */
   3447			if (i0) {
   3448				/* read in L0 page: (blkno & (MAXL0SIZE - 1)) */
   3449
   3450				l0mp = read_metapage(ipbmap, p, PSIZE, 0);
   3451				if (l0mp == NULL)
   3452					goto errout;
   3453				l0dcp = (struct dmapctl *) l0mp->data;
   3454
   3455				/* compute start dmap */
   3456				i = (blkno & (MAXL0SIZE - 1)) >>
   3457				    L2BPERDMAP;
   3458				l0leaf = l0dcp->stree + CTLLEAFIND + i;
   3459				p = BLKTODMAP(blkno,
   3460					      sbi->l2nbperpage);
   3461				i0 = false;
   3462			} else {
   3463				/* assign/init L0 page */
   3464				l0mp = get_metapage(ipbmap, p, PSIZE, 0);
   3465				if (l0mp == NULL)
   3466					goto errout;
   3467
   3468				l0dcp = (struct dmapctl *) l0mp->data;
   3469
   3470				/* compute start dmap */
   3471				i = 0;
   3472				l0leaf = l0dcp->stree + CTLLEAFIND;
   3473				p += nbperpage;	/* 1st dmap of L0.j */
   3474			}
   3475
   3476			/*
   3477			 * extend each dmap in L0
   3478			 */
   3479			for (; i < LPERCTL; i++) {
   3480				/*
   3481				 * reconstruct the dmap page, and
   3482				 * initialize corresponding parent L0 leaf
   3483				 */
   3484				if ((n = blkno & (BPERDMAP - 1))) {
   3485					/* read in dmap page: */
   3486					mp = read_metapage(ipbmap, p,
   3487							   PSIZE, 0);
   3488					if (mp == NULL)
   3489						goto errout;
   3490					n = min(nblocks, (s64)BPERDMAP - n);
   3491				} else {
   3492					/* assign/init dmap page */
   3493					mp = read_metapage(ipbmap, p,
   3494							   PSIZE, 0);
   3495					if (mp == NULL)
   3496						goto errout;
   3497
   3498					n = min_t(s64, nblocks, BPERDMAP);
   3499				}
   3500
   3501				dp = (struct dmap *) mp->data;
   3502				*l0leaf = dbInitDmap(dp, blkno, n);
   3503
   3504				bmp->db_nfree += n;
   3505				agno = le64_to_cpu(dp->start) >> l2agsize;
   3506				bmp->db_agfree[agno] += n;
   3507
   3508				write_metapage(mp);
   3509
   3510				l0leaf++;
   3511				p += nbperpage;
   3512
   3513				blkno += n;
   3514				nblocks -= n;
   3515				if (nblocks == 0)
   3516					break;
   3517			}	/* for each dmap in a L0 */
   3518
   3519			/*
   3520			 * build current L0 page from its leaves, and
   3521			 * initialize corresponding parent L1 leaf
   3522			 */
   3523			*l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
   3524			write_metapage(l0mp);
   3525			l0mp = NULL;
   3526
   3527			if (nblocks)
   3528				l1leaf++;	/* continue for next L0 */
   3529			else {
   3530				/* more than 1 L0 ? */
   3531				if (j > 0)
   3532					break;	/* build L1 page */
   3533				else {
   3534					/* summarize in global bmap page */
   3535					bmp->db_maxfreebud = *l1leaf;
   3536					release_metapage(l1mp);
   3537					release_metapage(l2mp);
   3538					goto finalize;
   3539				}
   3540			}
   3541		}		/* for each L0 in a L1 */
   3542
   3543		/*
   3544		 * build current L1 page from its leaves, and
   3545		 * initialize corresponding parent L2 leaf
   3546		 */
   3547		*l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
   3548		write_metapage(l1mp);
   3549		l1mp = NULL;
   3550
   3551		if (nblocks)
   3552			l2leaf++;	/* continue for next L1 */
   3553		else {
   3554			/* more than 1 L1 ? */
   3555			if (k > 0)
   3556				break;	/* build L2 page */
   3557			else {
   3558				/* summarize in global bmap page */
   3559				bmp->db_maxfreebud = *l2leaf;
   3560				release_metapage(l2mp);
   3561				goto finalize;
   3562			}
   3563		}
   3564	}			/* for each L1 in a L2 */
   3565
   3566	jfs_error(ipbmap->i_sb, "function has not returned as expected\n");
   3567errout:
   3568	if (l0mp)
   3569		release_metapage(l0mp);
   3570	if (l1mp)
   3571		release_metapage(l1mp);
   3572	release_metapage(l2mp);
   3573	return -EIO;
   3574
   3575	/*
   3576	 *	finalize bmap control page
   3577	 */
   3578finalize:
   3579
   3580	return 0;
   3581}
   3582
   3583
   3584/*
   3585 *	dbFinalizeBmap()
   3586 */
   3587void dbFinalizeBmap(struct inode *ipbmap)
   3588{
   3589	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
   3590	int actags, inactags, l2nl;
   3591	s64 ag_rem, actfree, inactfree, avgfree;
   3592	int i, n;
   3593
   3594	/*
   3595	 *	finalize bmap control page
   3596	 */
   3597//finalize:
   3598	/*
   3599	 * compute db_agpref: preferred ag to allocate from
   3600	 * (the leftmost ag with average free space in it);
   3601	 */
   3602//agpref:
   3603	/* get the number of active ags and inactive ags */
   3604	actags = bmp->db_maxag + 1;
   3605	inactags = bmp->db_numag - actags;
   3606	ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1);	/* ??? */
   3607
   3608	/* determine how many blocks are in the inactive allocation
   3609	 * groups. in doing this, we must account for the fact that
   3610	 * the rightmost group might be a partial group (i.e. file
   3611	 * system size is not a multiple of the group size).
   3612	 */
   3613	inactfree = (inactags && ag_rem) ?
   3614	    ((inactags - 1) << bmp->db_agl2size) + ag_rem
   3615	    : inactags << bmp->db_agl2size;
   3616
   3617	/* determine how many free blocks are in the active
   3618	 * allocation groups plus the average number of free blocks
   3619	 * within the active ags.
   3620	 */
   3621	actfree = bmp->db_nfree - inactfree;
   3622	avgfree = (u32) actfree / (u32) actags;
   3623
   3624	/* if the preferred allocation group has not average free space.
   3625	 * re-establish the preferred group as the leftmost
   3626	 * group with average free space.
   3627	 */
   3628	if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
   3629		for (bmp->db_agpref = 0; bmp->db_agpref < actags;
   3630		     bmp->db_agpref++) {
   3631			if (bmp->db_agfree[bmp->db_agpref] >= avgfree)
   3632				break;
   3633		}
   3634		if (bmp->db_agpref >= bmp->db_numag) {
   3635			jfs_error(ipbmap->i_sb,
   3636				  "cannot find ag with average freespace\n");
   3637		}
   3638	}
   3639
   3640	/*
   3641	 * compute db_aglevel, db_agheight, db_width, db_agstart:
   3642	 * an ag is covered in aglevel dmapctl summary tree,
   3643	 * at agheight level height (from leaf) with agwidth number of nodes
   3644	 * each, which starts at agstart index node of the smmary tree node
   3645	 * array;
   3646	 */
   3647	bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
   3648	l2nl =
   3649	    bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
   3650	bmp->db_agheight = l2nl >> 1;
   3651	bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
   3652	for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
   3653	     i--) {
   3654		bmp->db_agstart += n;
   3655		n <<= 2;
   3656	}
   3657
   3658}
   3659
   3660
   3661/*
   3662 * NAME:	dbInitDmap()/ujfs_idmap_page()
   3663 *
   3664 * FUNCTION:	initialize working/persistent bitmap of the dmap page
   3665 *		for the specified number of blocks:
   3666 *
   3667 *		at entry, the bitmaps had been initialized as free (ZEROS);
   3668 *		The number of blocks will only account for the actually
   3669 *		existing blocks. Blocks which don't actually exist in
   3670 *		the aggregate will be marked as allocated (ONES);
   3671 *
   3672 * PARAMETERS:
   3673 *	dp	- pointer to page of map
   3674 *	nblocks	- number of blocks this page
   3675 *
   3676 * RETURNS: NONE
   3677 */
   3678static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
   3679{
   3680	int blkno, w, b, r, nw, nb, i;
   3681
   3682	/* starting block number within the dmap */
   3683	blkno = Blkno & (BPERDMAP - 1);
   3684
   3685	if (blkno == 0) {
   3686		dp->nblocks = dp->nfree = cpu_to_le32(nblocks);
   3687		dp->start = cpu_to_le64(Blkno);
   3688
   3689		if (nblocks == BPERDMAP) {
   3690			memset(&dp->wmap[0], 0, LPERDMAP * 4);
   3691			memset(&dp->pmap[0], 0, LPERDMAP * 4);
   3692			goto initTree;
   3693		}
   3694	} else {
   3695		le32_add_cpu(&dp->nblocks, nblocks);
   3696		le32_add_cpu(&dp->nfree, nblocks);
   3697	}
   3698
   3699	/* word number containing start block number */
   3700	w = blkno >> L2DBWORD;
   3701
   3702	/*
   3703	 * free the bits corresponding to the block range (ZEROS):
   3704	 * note: not all bits of the first and last words may be contained
   3705	 * within the block range.
   3706	 */
   3707	for (r = nblocks; r > 0; r -= nb, blkno += nb) {
   3708		/* number of bits preceding range to be freed in the word */
   3709		b = blkno & (DBWORD - 1);
   3710		/* number of bits to free in the word */
   3711		nb = min(r, DBWORD - b);
   3712
   3713		/* is partial word to be freed ? */
   3714		if (nb < DBWORD) {
   3715			/* free (set to 0) from the bitmap word */
   3716			dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
   3717						     >> b));
   3718			dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
   3719						     >> b));
   3720
   3721			/* skip the word freed */
   3722			w++;
   3723		} else {
   3724			/* free (set to 0) contiguous bitmap words */
   3725			nw = r >> L2DBWORD;
   3726			memset(&dp->wmap[w], 0, nw * 4);
   3727			memset(&dp->pmap[w], 0, nw * 4);
   3728
   3729			/* skip the words freed */
   3730			nb = nw << L2DBWORD;
   3731			w += nw;
   3732		}
   3733	}
   3734
   3735	/*
   3736	 * mark bits following the range to be freed (non-existing
   3737	 * blocks) as allocated (ONES)
   3738	 */
   3739
   3740	if (blkno == BPERDMAP)
   3741		goto initTree;
   3742
   3743	/* the first word beyond the end of existing blocks */
   3744	w = blkno >> L2DBWORD;
   3745
   3746	/* does nblocks fall on a 32-bit boundary ? */
   3747	b = blkno & (DBWORD - 1);
   3748	if (b) {
   3749		/* mark a partial word allocated */
   3750		dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b);
   3751		w++;
   3752	}
   3753
   3754	/* set the rest of the words in the page to allocated (ONES) */
   3755	for (i = w; i < LPERDMAP; i++)
   3756		dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES);
   3757
   3758	/*
   3759	 * init tree
   3760	 */
   3761      initTree:
   3762	return (dbInitDmapTree(dp));
   3763}
   3764
   3765
   3766/*
   3767 * NAME:	dbInitDmapTree()/ujfs_complete_dmap()
   3768 *
   3769 * FUNCTION:	initialize summary tree of the specified dmap:
   3770 *
   3771 *		at entry, bitmap of the dmap has been initialized;
   3772 *
   3773 * PARAMETERS:
   3774 *	dp	- dmap to complete
   3775 *	blkno	- starting block number for this dmap
   3776 *	treemax	- will be filled in with max free for this dmap
   3777 *
   3778 * RETURNS:	max free string at the root of the tree
   3779 */
   3780static int dbInitDmapTree(struct dmap * dp)
   3781{
   3782	struct dmaptree *tp;
   3783	s8 *cp;
   3784	int i;
   3785
   3786	/* init fixed info of tree */
   3787	tp = &dp->tree;
   3788	tp->nleafs = cpu_to_le32(LPERDMAP);
   3789	tp->l2nleafs = cpu_to_le32(L2LPERDMAP);
   3790	tp->leafidx = cpu_to_le32(LEAFIND);
   3791	tp->height = cpu_to_le32(4);
   3792	tp->budmin = BUDMIN;
   3793
   3794	/* init each leaf from corresponding wmap word:
   3795	 * note: leaf is set to NOFREE(-1) if all blocks of corresponding
   3796	 * bitmap word are allocated.
   3797	 */
   3798	cp = tp->stree + le32_to_cpu(tp->leafidx);
   3799	for (i = 0; i < LPERDMAP; i++)
   3800		*cp++ = dbMaxBud((u8 *) & dp->wmap[i]);
   3801
   3802	/* build the dmap's binary buddy summary tree */
   3803	return (dbInitTree(tp));
   3804}
   3805
   3806
   3807/*
   3808 * NAME:	dbInitTree()/ujfs_adjtree()
   3809 *
   3810 * FUNCTION:	initialize binary buddy summary tree of a dmap or dmapctl.
   3811 *
   3812 *		at entry, the leaves of the tree has been initialized
   3813 *		from corresponding bitmap word or root of summary tree
   3814 *		of the child control page;
   3815 *		configure binary buddy system at the leaf level, then
   3816 *		bubble up the values of the leaf nodes up the tree.
   3817 *
   3818 * PARAMETERS:
   3819 *	cp	- Pointer to the root of the tree
   3820 *	l2leaves- Number of leaf nodes as a power of 2
   3821 *	l2min	- Number of blocks that can be covered by a leaf
   3822 *		  as a power of 2
   3823 *
   3824 * RETURNS: max free string at the root of the tree
   3825 */
   3826static int dbInitTree(struct dmaptree * dtp)
   3827{
   3828	int l2max, l2free, bsize, nextb, i;
   3829	int child, parent, nparent;
   3830	s8 *tp, *cp, *cp1;
   3831
   3832	tp = dtp->stree;
   3833
   3834	/* Determine the maximum free string possible for the leaves */
   3835	l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin;
   3836
   3837	/*
   3838	 * configure the leaf levevl into binary buddy system
   3839	 *
   3840	 * Try to combine buddies starting with a buddy size of 1
   3841	 * (i.e. two leaves). At a buddy size of 1 two buddy leaves
   3842	 * can be combined if both buddies have a maximum free of l2min;
   3843	 * the combination will result in the left-most buddy leaf having
   3844	 * a maximum free of l2min+1.
   3845	 * After processing all buddies for a given size, process buddies
   3846	 * at the next higher buddy size (i.e. current size * 2) and
   3847	 * the next maximum free (current free + 1).
   3848	 * This continues until the maximum possible buddy combination
   3849	 * yields maximum free.
   3850	 */
   3851	for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
   3852	     l2free++, bsize = nextb) {
   3853		/* get next buddy size == current buddy pair size */
   3854		nextb = bsize << 1;
   3855
   3856		/* scan each adjacent buddy pair at current buddy size */
   3857		for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx);
   3858		     i < le32_to_cpu(dtp->nleafs);
   3859		     i += nextb, cp += nextb) {
   3860			/* coalesce if both adjacent buddies are max free */
   3861			if (*cp == l2free && *(cp + bsize) == l2free) {
   3862				*cp = l2free + 1;	/* left take right */
   3863				*(cp + bsize) = -1;	/* right give left */
   3864			}
   3865		}
   3866	}
   3867
   3868	/*
   3869	 * bubble summary information of leaves up the tree.
   3870	 *
   3871	 * Starting at the leaf node level, the four nodes described by
   3872	 * the higher level parent node are compared for a maximum free and
   3873	 * this maximum becomes the value of the parent node.
   3874	 * when all lower level nodes are processed in this fashion then
   3875	 * move up to the next level (parent becomes a lower level node) and
   3876	 * continue the process for that level.
   3877	 */
   3878	for (child = le32_to_cpu(dtp->leafidx),
   3879	     nparent = le32_to_cpu(dtp->nleafs) >> 2;
   3880	     nparent > 0; nparent >>= 2, child = parent) {
   3881		/* get index of 1st node of parent level */
   3882		parent = (child - 1) >> 2;
   3883
   3884		/* set the value of the parent node as the maximum
   3885		 * of the four nodes of the current level.
   3886		 */
   3887		for (i = 0, cp = tp + child, cp1 = tp + parent;
   3888		     i < nparent; i++, cp += 4, cp1++)
   3889			*cp1 = TREEMAX(cp);
   3890	}
   3891
   3892	return (*tp);
   3893}
   3894
   3895
   3896/*
   3897 *	dbInitDmapCtl()
   3898 *
   3899 * function: initialize dmapctl page
   3900 */
   3901static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
   3902{				/* start leaf index not covered by range */
   3903	s8 *cp;
   3904
   3905	dcp->nleafs = cpu_to_le32(LPERCTL);
   3906	dcp->l2nleafs = cpu_to_le32(L2LPERCTL);
   3907	dcp->leafidx = cpu_to_le32(CTLLEAFIND);
   3908	dcp->height = cpu_to_le32(5);
   3909	dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
   3910
   3911	/*
   3912	 * initialize the leaves of current level that were not covered
   3913	 * by the specified input block range (i.e. the leaves have no
   3914	 * low level dmapctl or dmap).
   3915	 */
   3916	cp = &dcp->stree[CTLLEAFIND + i];
   3917	for (; i < LPERCTL; i++)
   3918		*cp++ = NOFREE;
   3919
   3920	/* build the dmap's binary buddy summary tree */
   3921	return (dbInitTree((struct dmaptree *) dcp));
   3922}
   3923
   3924
   3925/*
   3926 * NAME:	dbGetL2AGSize()/ujfs_getagl2size()
   3927 *
   3928 * FUNCTION:	Determine log2(allocation group size) from aggregate size
   3929 *
   3930 * PARAMETERS:
   3931 *	nblocks	- Number of blocks in aggregate
   3932 *
   3933 * RETURNS: log2(allocation group size) in aggregate blocks
   3934 */
   3935static int dbGetL2AGSize(s64 nblocks)
   3936{
   3937	s64 sz;
   3938	s64 m;
   3939	int l2sz;
   3940
   3941	if (nblocks < BPERDMAP * MAXAG)
   3942		return (L2BPERDMAP);
   3943
   3944	/* round up aggregate size to power of 2 */
   3945	m = ((u64) 1 << (64 - 1));
   3946	for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) {
   3947		if (m & nblocks)
   3948			break;
   3949	}
   3950
   3951	sz = (s64) 1 << l2sz;
   3952	if (sz < nblocks)
   3953		l2sz += 1;
   3954
   3955	/* agsize = roundupSize/max_number_of_ag */
   3956	return (l2sz - L2MAXAG);
   3957}
   3958
   3959
   3960/*
   3961 * NAME:	dbMapFileSizeToMapSize()
   3962 *
   3963 * FUNCTION:	compute number of blocks the block allocation map file
   3964 *		can cover from the map file size;
   3965 *
   3966 * RETURNS:	Number of blocks which can be covered by this block map file;
   3967 */
   3968
   3969/*
   3970 * maximum number of map pages at each level including control pages
   3971 */
   3972#define MAXL0PAGES	(1 + LPERCTL)
   3973#define MAXL1PAGES	(1 + LPERCTL * MAXL0PAGES)
   3974
   3975/*
   3976 * convert number of map pages to the zero origin top dmapctl level
   3977 */
   3978#define BMAPPGTOLEV(npages)	\
   3979	(((npages) <= 3 + MAXL0PAGES) ? 0 : \
   3980	 ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
   3981
   3982s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
   3983{
   3984	struct super_block *sb = ipbmap->i_sb;
   3985	s64 nblocks;
   3986	s64 npages, ndmaps;
   3987	int level, i;
   3988	int complete, factor;
   3989
   3990	nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize;
   3991	npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
   3992	level = BMAPPGTOLEV(npages);
   3993
   3994	/* At each level, accumulate the number of dmap pages covered by
   3995	 * the number of full child levels below it;
   3996	 * repeat for the last incomplete child level.
   3997	 */
   3998	ndmaps = 0;
   3999	npages--;		/* skip the first global control page */
   4000	/* skip higher level control pages above top level covered by map */
   4001	npages -= (2 - level);
   4002	npages--;		/* skip top level's control page */
   4003	for (i = level; i >= 0; i--) {
   4004		factor =
   4005		    (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
   4006		complete = (u32) npages / factor;
   4007		ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL :
   4008				      ((i == 1) ? LPERCTL : 1));
   4009
   4010		/* pages in last/incomplete child */
   4011		npages = (u32) npages % factor;
   4012		/* skip incomplete child's level control page */
   4013		npages--;
   4014	}
   4015
   4016	/* convert the number of dmaps into the number of blocks
   4017	 * which can be covered by the dmaps;
   4018	 */
   4019	nblocks = ndmaps << L2BPERDMAP;
   4020
   4021	return (nblocks);
   4022}