cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

gc.c (29024B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * This file is part of UBIFS.
      4 *
      5 * Copyright (C) 2006-2008 Nokia Corporation.
      6 *
      7 * Authors: Adrian Hunter
      8 *          Artem Bityutskiy (Битюцкий Артём)
      9 */
     10
     11/*
     12 * This file implements garbage collection. The procedure for garbage collection
     13 * is different depending on whether a LEB as an index LEB (contains index
     14 * nodes) or not. For non-index LEBs, garbage collection finds a LEB which
     15 * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
     16 * nodes to the journal, at which point the garbage-collected LEB is free to be
     17 * reused. For index LEBs, garbage collection marks the non-obsolete index nodes
     18 * dirty in the TNC, and after the next commit, the garbage-collected LEB is
     19 * to be reused. Garbage collection will cause the number of dirty index nodes
     20 * to grow, however sufficient space is reserved for the index to ensure the
     21 * commit will never run out of space.
     22 *
     23 * Notes about dead watermark. At current UBIFS implementation we assume that
     24 * LEBs which have less than @c->dead_wm bytes of free + dirty space are full
     25 * and not worth garbage-collecting. The dead watermark is one min. I/O unit
     26 * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
     27 * Garbage Collector has to synchronize the GC head's write buffer before
     28 * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
     29 * actually reclaim even very small pieces of dirty space by garbage collecting
     30 * enough dirty LEBs, but we do not bother doing this at this implementation.
     31 *
     32 * Notes about dark watermark. The results of GC work depends on how big are
     33 * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
     34 * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would
     35 * have to waste large pieces of free space at the end of LEB B, because nodes
     36 * from LEB A would not fit. And the worst situation is when all nodes are of
     37 * maximum size. So dark watermark is the amount of free + dirty space in LEB
     38 * which are guaranteed to be reclaimable. If LEB has less space, the GC might
     39 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
     40 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
     41 * good, and GC takes extra care when moving them.
     42 */
     43
     44#include <linux/slab.h>
     45#include <linux/pagemap.h>
     46#include <linux/list_sort.h>
     47#include "ubifs.h"
     48
     49/*
     50 * GC may need to move more than one LEB to make progress. The below constants
     51 * define "soft" and "hard" limits on the number of LEBs the garbage collector
     52 * may move.
     53 */
     54#define SOFT_LEBS_LIMIT 4
     55#define HARD_LEBS_LIMIT 32
     56
     57/**
     58 * switch_gc_head - switch the garbage collection journal head.
     59 * @c: UBIFS file-system description object
     60 *
     61 * This function switch the GC head to the next LEB which is reserved in
     62 * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
     63 * and other negative error code in case of failures.
     64 */
     65static int switch_gc_head(struct ubifs_info *c)
     66{
     67	int err, gc_lnum = c->gc_lnum;
     68	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
     69
     70	ubifs_assert(c, gc_lnum != -1);
     71	dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
     72	       wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
     73	       c->leb_size - wbuf->offs - wbuf->used);
     74
     75	err = ubifs_wbuf_sync_nolock(wbuf);
     76	if (err)
     77		return err;
     78
     79	/*
     80	 * The GC write-buffer was synchronized, we may safely unmap
     81	 * 'c->gc_lnum'.
     82	 */
     83	err = ubifs_leb_unmap(c, gc_lnum);
     84	if (err)
     85		return err;
     86
     87	err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
     88	if (err)
     89		return err;
     90
     91	c->gc_lnum = -1;
     92	err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0);
     93	return err;
     94}
     95
     96/**
     97 * data_nodes_cmp - compare 2 data nodes.
     98 * @priv: UBIFS file-system description object
     99 * @a: first data node
    100 * @b: second data node
    101 *
    102 * This function compares data nodes @a and @b. Returns %1 if @a has greater
    103 * inode or block number, and %-1 otherwise.
    104 */
    105static int data_nodes_cmp(void *priv, const struct list_head *a,
    106			  const struct list_head *b)
    107{
    108	ino_t inuma, inumb;
    109	struct ubifs_info *c = priv;
    110	struct ubifs_scan_node *sa, *sb;
    111
    112	cond_resched();
    113	if (a == b)
    114		return 0;
    115
    116	sa = list_entry(a, struct ubifs_scan_node, list);
    117	sb = list_entry(b, struct ubifs_scan_node, list);
    118
    119	ubifs_assert(c, key_type(c, &sa->key) == UBIFS_DATA_KEY);
    120	ubifs_assert(c, key_type(c, &sb->key) == UBIFS_DATA_KEY);
    121	ubifs_assert(c, sa->type == UBIFS_DATA_NODE);
    122	ubifs_assert(c, sb->type == UBIFS_DATA_NODE);
    123
    124	inuma = key_inum(c, &sa->key);
    125	inumb = key_inum(c, &sb->key);
    126
    127	if (inuma == inumb) {
    128		unsigned int blka = key_block(c, &sa->key);
    129		unsigned int blkb = key_block(c, &sb->key);
    130
    131		if (blka <= blkb)
    132			return -1;
    133	} else if (inuma <= inumb)
    134		return -1;
    135
    136	return 1;
    137}
    138
    139/*
    140 * nondata_nodes_cmp - compare 2 non-data nodes.
    141 * @priv: UBIFS file-system description object
    142 * @a: first node
    143 * @a: second node
    144 *
    145 * This function compares nodes @a and @b. It makes sure that inode nodes go
    146 * first and sorted by length in descending order. Directory entry nodes go
    147 * after inode nodes and are sorted in ascending hash valuer order.
    148 */
    149static int nondata_nodes_cmp(void *priv, const struct list_head *a,
    150			     const struct list_head *b)
    151{
    152	ino_t inuma, inumb;
    153	struct ubifs_info *c = priv;
    154	struct ubifs_scan_node *sa, *sb;
    155
    156	cond_resched();
    157	if (a == b)
    158		return 0;
    159
    160	sa = list_entry(a, struct ubifs_scan_node, list);
    161	sb = list_entry(b, struct ubifs_scan_node, list);
    162
    163	ubifs_assert(c, key_type(c, &sa->key) != UBIFS_DATA_KEY &&
    164		     key_type(c, &sb->key) != UBIFS_DATA_KEY);
    165	ubifs_assert(c, sa->type != UBIFS_DATA_NODE &&
    166		     sb->type != UBIFS_DATA_NODE);
    167
    168	/* Inodes go before directory entries */
    169	if (sa->type == UBIFS_INO_NODE) {
    170		if (sb->type == UBIFS_INO_NODE)
    171			return sb->len - sa->len;
    172		return -1;
    173	}
    174	if (sb->type == UBIFS_INO_NODE)
    175		return 1;
    176
    177	ubifs_assert(c, key_type(c, &sa->key) == UBIFS_DENT_KEY ||
    178		     key_type(c, &sa->key) == UBIFS_XENT_KEY);
    179	ubifs_assert(c, key_type(c, &sb->key) == UBIFS_DENT_KEY ||
    180		     key_type(c, &sb->key) == UBIFS_XENT_KEY);
    181	ubifs_assert(c, sa->type == UBIFS_DENT_NODE ||
    182		     sa->type == UBIFS_XENT_NODE);
    183	ubifs_assert(c, sb->type == UBIFS_DENT_NODE ||
    184		     sb->type == UBIFS_XENT_NODE);
    185
    186	inuma = key_inum(c, &sa->key);
    187	inumb = key_inum(c, &sb->key);
    188
    189	if (inuma == inumb) {
    190		uint32_t hasha = key_hash(c, &sa->key);
    191		uint32_t hashb = key_hash(c, &sb->key);
    192
    193		if (hasha <= hashb)
    194			return -1;
    195	} else if (inuma <= inumb)
    196		return -1;
    197
    198	return 1;
    199}
    200
    201/**
    202 * sort_nodes - sort nodes for GC.
    203 * @c: UBIFS file-system description object
    204 * @sleb: describes nodes to sort and contains the result on exit
    205 * @nondata: contains non-data nodes on exit
    206 * @min: minimum node size is returned here
    207 *
    208 * This function sorts the list of inodes to garbage collect. First of all, it
    209 * kills obsolete nodes and separates data and non-data nodes to the
    210 * @sleb->nodes and @nondata lists correspondingly.
    211 *
    212 * Data nodes are then sorted in block number order - this is important for
    213 * bulk-read; data nodes with lower inode number go before data nodes with
    214 * higher inode number, and data nodes with lower block number go before data
    215 * nodes with higher block number;
    216 *
    217 * Non-data nodes are sorted as follows.
    218 *   o First go inode nodes - they are sorted in descending length order.
    219 *   o Then go directory entry nodes - they are sorted in hash order, which
    220 *     should supposedly optimize 'readdir()'. Direntry nodes with lower parent
    221 *     inode number go before direntry nodes with higher parent inode number,
    222 *     and direntry nodes with lower name hash values go before direntry nodes
    223 *     with higher name hash values.
    224 *
    225 * This function returns zero in case of success and a negative error code in
    226 * case of failure.
    227 */
    228static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
    229		      struct list_head *nondata, int *min)
    230{
    231	int err;
    232	struct ubifs_scan_node *snod, *tmp;
    233
    234	*min = INT_MAX;
    235
    236	/* Separate data nodes and non-data nodes */
    237	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
    238		ubifs_assert(c, snod->type == UBIFS_INO_NODE  ||
    239			     snod->type == UBIFS_DATA_NODE ||
    240			     snod->type == UBIFS_DENT_NODE ||
    241			     snod->type == UBIFS_XENT_NODE ||
    242			     snod->type == UBIFS_TRUN_NODE ||
    243			     snod->type == UBIFS_AUTH_NODE);
    244
    245		if (snod->type != UBIFS_INO_NODE  &&
    246		    snod->type != UBIFS_DATA_NODE &&
    247		    snod->type != UBIFS_DENT_NODE &&
    248		    snod->type != UBIFS_XENT_NODE) {
    249			/* Probably truncation node, zap it */
    250			list_del(&snod->list);
    251			kfree(snod);
    252			continue;
    253		}
    254
    255		ubifs_assert(c, key_type(c, &snod->key) == UBIFS_DATA_KEY ||
    256			     key_type(c, &snod->key) == UBIFS_INO_KEY  ||
    257			     key_type(c, &snod->key) == UBIFS_DENT_KEY ||
    258			     key_type(c, &snod->key) == UBIFS_XENT_KEY);
    259
    260		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
    261					 snod->offs, 0);
    262		if (err < 0)
    263			return err;
    264
    265		if (!err) {
    266			/* The node is obsolete, remove it from the list */
    267			list_del(&snod->list);
    268			kfree(snod);
    269			continue;
    270		}
    271
    272		if (snod->len < *min)
    273			*min = snod->len;
    274
    275		if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
    276			list_move_tail(&snod->list, nondata);
    277	}
    278
    279	/* Sort data and non-data nodes */
    280	list_sort(c, &sleb->nodes, &data_nodes_cmp);
    281	list_sort(c, nondata, &nondata_nodes_cmp);
    282
    283	err = dbg_check_data_nodes_order(c, &sleb->nodes);
    284	if (err)
    285		return err;
    286	err = dbg_check_nondata_nodes_order(c, nondata);
    287	if (err)
    288		return err;
    289	return 0;
    290}
    291
    292/**
    293 * move_node - move a node.
    294 * @c: UBIFS file-system description object
    295 * @sleb: describes the LEB to move nodes from
    296 * @snod: the mode to move
    297 * @wbuf: write-buffer to move node to
    298 *
    299 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
    300 * destroys @snod. Returns zero in case of success and a negative error code in
    301 * case of failure.
    302 */
    303static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
    304		     struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
    305{
    306	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
    307
    308	cond_resched();
    309	err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
    310	if (err)
    311		return err;
    312
    313	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
    314				snod->offs, new_lnum, new_offs,
    315				snod->len);
    316	list_del(&snod->list);
    317	kfree(snod);
    318	return err;
    319}
    320
    321/**
    322 * move_nodes - move nodes.
    323 * @c: UBIFS file-system description object
    324 * @sleb: describes the LEB to move nodes from
    325 *
    326 * This function moves valid nodes from data LEB described by @sleb to the GC
    327 * journal head. This function returns zero in case of success, %-EAGAIN if
    328 * commit is required, and other negative error codes in case of other
    329 * failures.
    330 */
    331static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
    332{
    333	int err, min;
    334	LIST_HEAD(nondata);
    335	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
    336
    337	if (wbuf->lnum == -1) {
    338		/*
    339		 * The GC journal head is not set, because it is the first GC
    340		 * invocation since mount.
    341		 */
    342		err = switch_gc_head(c);
    343		if (err)
    344			return err;
    345	}
    346
    347	err = sort_nodes(c, sleb, &nondata, &min);
    348	if (err)
    349		goto out;
    350
    351	/* Write nodes to their new location. Use the first-fit strategy */
    352	while (1) {
    353		int avail, moved = 0;
    354		struct ubifs_scan_node *snod, *tmp;
    355
    356		/* Move data nodes */
    357		list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
    358			avail = c->leb_size - wbuf->offs - wbuf->used -
    359					ubifs_auth_node_sz(c);
    360			if  (snod->len > avail)
    361				/*
    362				 * Do not skip data nodes in order to optimize
    363				 * bulk-read.
    364				 */
    365				break;
    366
    367			err = ubifs_shash_update(c, c->jheads[GCHD].log_hash,
    368						 snod->node, snod->len);
    369			if (err)
    370				goto out;
    371
    372			err = move_node(c, sleb, snod, wbuf);
    373			if (err)
    374				goto out;
    375			moved = 1;
    376		}
    377
    378		/* Move non-data nodes */
    379		list_for_each_entry_safe(snod, tmp, &nondata, list) {
    380			avail = c->leb_size - wbuf->offs - wbuf->used -
    381					ubifs_auth_node_sz(c);
    382			if (avail < min)
    383				break;
    384
    385			if  (snod->len > avail) {
    386				/*
    387				 * Keep going only if this is an inode with
    388				 * some data. Otherwise stop and switch the GC
    389				 * head. IOW, we assume that data-less inode
    390				 * nodes and direntry nodes are roughly of the
    391				 * same size.
    392				 */
    393				if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
    394				    snod->len == UBIFS_INO_NODE_SZ)
    395					break;
    396				continue;
    397			}
    398
    399			err = ubifs_shash_update(c, c->jheads[GCHD].log_hash,
    400						 snod->node, snod->len);
    401			if (err)
    402				goto out;
    403
    404			err = move_node(c, sleb, snod, wbuf);
    405			if (err)
    406				goto out;
    407			moved = 1;
    408		}
    409
    410		if (ubifs_authenticated(c) && moved) {
    411			struct ubifs_auth_node *auth;
    412
    413			auth = kmalloc(ubifs_auth_node_sz(c), GFP_NOFS);
    414			if (!auth) {
    415				err = -ENOMEM;
    416				goto out;
    417			}
    418
    419			err = ubifs_prepare_auth_node(c, auth,
    420						c->jheads[GCHD].log_hash);
    421			if (err) {
    422				kfree(auth);
    423				goto out;
    424			}
    425
    426			err = ubifs_wbuf_write_nolock(wbuf, auth,
    427						      ubifs_auth_node_sz(c));
    428			if (err) {
    429				kfree(auth);
    430				goto out;
    431			}
    432
    433			ubifs_add_dirt(c, wbuf->lnum, ubifs_auth_node_sz(c));
    434		}
    435
    436		if (list_empty(&sleb->nodes) && list_empty(&nondata))
    437			break;
    438
    439		/*
    440		 * Waste the rest of the space in the LEB and switch to the
    441		 * next LEB.
    442		 */
    443		err = switch_gc_head(c);
    444		if (err)
    445			goto out;
    446	}
    447
    448	return 0;
    449
    450out:
    451	list_splice_tail(&nondata, &sleb->nodes);
    452	return err;
    453}
    454
    455/**
    456 * gc_sync_wbufs - sync write-buffers for GC.
    457 * @c: UBIFS file-system description object
    458 *
    459 * We must guarantee that obsoleting nodes are on flash. Unfortunately they may
    460 * be in a write-buffer instead. That is, a node could be written to a
    461 * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
    462 * erased before the write-buffer is sync'd and then there is an unclean
    463 * unmount, then an existing node is lost. To avoid this, we sync all
    464 * write-buffers.
    465 *
    466 * This function returns %0 on success or a negative error code on failure.
    467 */
    468static int gc_sync_wbufs(struct ubifs_info *c)
    469{
    470	int err, i;
    471
    472	for (i = 0; i < c->jhead_cnt; i++) {
    473		if (i == GCHD)
    474			continue;
    475		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
    476		if (err)
    477			return err;
    478	}
    479	return 0;
    480}
    481
    482/**
    483 * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
    484 * @c: UBIFS file-system description object
    485 * @lp: describes the LEB to garbage collect
    486 *
    487 * This function garbage-collects an LEB and returns one of the @LEB_FREED,
    488 * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
    489 * required, and other negative error codes in case of failures.
    490 */
    491int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
    492{
    493	struct ubifs_scan_leb *sleb;
    494	struct ubifs_scan_node *snod;
    495	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
    496	int err = 0, lnum = lp->lnum;
    497
    498	ubifs_assert(c, c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 ||
    499		     c->need_recovery);
    500	ubifs_assert(c, c->gc_lnum != lnum);
    501	ubifs_assert(c, wbuf->lnum != lnum);
    502
    503	if (lp->free + lp->dirty == c->leb_size) {
    504		/* Special case - a free LEB  */
    505		dbg_gc("LEB %d is free, return it", lp->lnum);
    506		ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
    507
    508		if (lp->free != c->leb_size) {
    509			/*
    510			 * Write buffers must be sync'd before unmapping
    511			 * freeable LEBs, because one of them may contain data
    512			 * which obsoletes something in 'lp->lnum'.
    513			 */
    514			err = gc_sync_wbufs(c);
    515			if (err)
    516				return err;
    517			err = ubifs_change_one_lp(c, lp->lnum, c->leb_size,
    518						  0, 0, 0, 0);
    519			if (err)
    520				return err;
    521		}
    522		err = ubifs_leb_unmap(c, lp->lnum);
    523		if (err)
    524			return err;
    525
    526		if (c->gc_lnum == -1) {
    527			c->gc_lnum = lnum;
    528			return LEB_RETAINED;
    529		}
    530
    531		return LEB_FREED;
    532	}
    533
    534	/*
    535	 * We scan the entire LEB even though we only really need to scan up to
    536	 * (c->leb_size - lp->free).
    537	 */
    538	sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
    539	if (IS_ERR(sleb))
    540		return PTR_ERR(sleb);
    541
    542	ubifs_assert(c, !list_empty(&sleb->nodes));
    543	snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
    544
    545	if (snod->type == UBIFS_IDX_NODE) {
    546		struct ubifs_gced_idx_leb *idx_gc;
    547
    548		dbg_gc("indexing LEB %d (free %d, dirty %d)",
    549		       lnum, lp->free, lp->dirty);
    550		list_for_each_entry(snod, &sleb->nodes, list) {
    551			struct ubifs_idx_node *idx = snod->node;
    552			int level = le16_to_cpu(idx->level);
    553
    554			ubifs_assert(c, snod->type == UBIFS_IDX_NODE);
    555			key_read(c, ubifs_idx_key(c, idx), &snod->key);
    556			err = ubifs_dirty_idx_node(c, &snod->key, level, lnum,
    557						   snod->offs);
    558			if (err)
    559				goto out;
    560		}
    561
    562		idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
    563		if (!idx_gc) {
    564			err = -ENOMEM;
    565			goto out;
    566		}
    567
    568		idx_gc->lnum = lnum;
    569		idx_gc->unmap = 0;
    570		list_add(&idx_gc->list, &c->idx_gc);
    571
    572		/*
    573		 * Don't release the LEB until after the next commit, because
    574		 * it may contain data which is needed for recovery. So
    575		 * although we freed this LEB, it will become usable only after
    576		 * the commit.
    577		 */
    578		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0,
    579					  LPROPS_INDEX, 1);
    580		if (err)
    581			goto out;
    582		err = LEB_FREED_IDX;
    583	} else {
    584		dbg_gc("data LEB %d (free %d, dirty %d)",
    585		       lnum, lp->free, lp->dirty);
    586
    587		err = move_nodes(c, sleb);
    588		if (err)
    589			goto out_inc_seq;
    590
    591		err = gc_sync_wbufs(c);
    592		if (err)
    593			goto out_inc_seq;
    594
    595		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
    596		if (err)
    597			goto out_inc_seq;
    598
    599		/* Allow for races with TNC */
    600		c->gced_lnum = lnum;
    601		smp_wmb();
    602		c->gc_seq += 1;
    603		smp_wmb();
    604
    605		if (c->gc_lnum == -1) {
    606			c->gc_lnum = lnum;
    607			err = LEB_RETAINED;
    608		} else {
    609			err = ubifs_wbuf_sync_nolock(wbuf);
    610			if (err)
    611				goto out;
    612
    613			err = ubifs_leb_unmap(c, lnum);
    614			if (err)
    615				goto out;
    616
    617			err = LEB_FREED;
    618		}
    619	}
    620
    621out:
    622	ubifs_scan_destroy(sleb);
    623	return err;
    624
    625out_inc_seq:
    626	/* We may have moved at least some nodes so allow for races with TNC */
    627	c->gced_lnum = lnum;
    628	smp_wmb();
    629	c->gc_seq += 1;
    630	smp_wmb();
    631	goto out;
    632}
    633
    634/**
    635 * ubifs_garbage_collect - UBIFS garbage collector.
    636 * @c: UBIFS file-system description object
    637 * @anyway: do GC even if there are free LEBs
    638 *
    639 * This function does out-of-place garbage collection. The return codes are:
    640 *   o positive LEB number if the LEB has been freed and may be used;
    641 *   o %-EAGAIN if the caller has to run commit;
    642 *   o %-ENOSPC if GC failed to make any progress;
    643 *   o other negative error codes in case of other errors.
    644 *
    645 * Garbage collector writes data to the journal when GC'ing data LEBs, and just
    646 * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
    647 * commit may be required. But commit cannot be run from inside GC, because the
    648 * caller might be holding the commit lock, so %-EAGAIN is returned instead;
    649 * And this error code means that the caller has to run commit, and re-run GC
    650 * if there is still no free space.
    651 *
    652 * There are many reasons why this function may return %-EAGAIN:
    653 * o the log is full and there is no space to write an LEB reference for
    654 *   @c->gc_lnum;
    655 * o the journal is too large and exceeds size limitations;
    656 * o GC moved indexing LEBs, but they can be used only after the commit;
    657 * o the shrinker fails to find clean znodes to free and requests the commit;
    658 * o etc.
    659 *
    660 * Note, if the file-system is close to be full, this function may return
    661 * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
    662 * the function. E.g., this happens if the limits on the journal size are too
    663 * tough and GC writes too much to the journal before an LEB is freed. This
    664 * might also mean that the journal is too large, and the TNC becomes to big,
    665 * so that the shrinker is constantly called, finds not clean znodes to free,
    666 * and requests commit. Well, this may also happen if the journal is all right,
    667 * but another kernel process consumes too much memory. Anyway, infinite
    668 * %-EAGAIN may happen, but in some extreme/misconfiguration cases.
    669 */
    670int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
    671{
    672	int i, err, ret, min_space = c->dead_wm;
    673	struct ubifs_lprops lp;
    674	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
    675
    676	ubifs_assert_cmt_locked(c);
    677	ubifs_assert(c, !c->ro_media && !c->ro_mount);
    678
    679	if (ubifs_gc_should_commit(c))
    680		return -EAGAIN;
    681
    682	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
    683
    684	if (c->ro_error) {
    685		ret = -EROFS;
    686		goto out_unlock;
    687	}
    688
    689	/* We expect the write-buffer to be empty on entry */
    690	ubifs_assert(c, !wbuf->used);
    691
    692	for (i = 0; ; i++) {
    693		int space_before, space_after;
    694
    695		/* Maybe continue after find and break before find */
    696		lp.lnum = -1;
    697
    698		cond_resched();
    699
    700		/* Give the commit an opportunity to run */
    701		if (ubifs_gc_should_commit(c)) {
    702			ret = -EAGAIN;
    703			break;
    704		}
    705
    706		if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) {
    707			/*
    708			 * We've done enough iterations. Indexing LEBs were
    709			 * moved and will be available after the commit.
    710			 */
    711			dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN");
    712			ubifs_commit_required(c);
    713			ret = -EAGAIN;
    714			break;
    715		}
    716
    717		if (i > HARD_LEBS_LIMIT) {
    718			/*
    719			 * We've moved too many LEBs and have not made
    720			 * progress, give up.
    721			 */
    722			dbg_gc("hard limit, -ENOSPC");
    723			ret = -ENOSPC;
    724			break;
    725		}
    726
    727		/*
    728		 * Empty and freeable LEBs can turn up while we waited for
    729		 * the wbuf lock, or while we have been running GC. In that
    730		 * case, we should just return one of those instead of
    731		 * continuing to GC dirty LEBs. Hence we request
    732		 * 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
    733		 */
    734		ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1);
    735		if (ret) {
    736			if (ret == -ENOSPC)
    737				dbg_gc("no more dirty LEBs");
    738			break;
    739		}
    740
    741		dbg_gc("found LEB %d: free %d, dirty %d, sum %d (min. space %d)",
    742		       lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty,
    743		       min_space);
    744
    745		space_before = c->leb_size - wbuf->offs - wbuf->used;
    746		if (wbuf->lnum == -1)
    747			space_before = 0;
    748
    749		ret = ubifs_garbage_collect_leb(c, &lp);
    750		if (ret < 0) {
    751			if (ret == -EAGAIN) {
    752				/*
    753				 * This is not error, so we have to return the
    754				 * LEB to lprops. But if 'ubifs_return_leb()'
    755				 * fails, its failure code is propagated to the
    756				 * caller instead of the original '-EAGAIN'.
    757				 */
    758				err = ubifs_return_leb(c, lp.lnum);
    759				if (err) {
    760					ret = err;
    761					/*
    762					 * An LEB may always be "taken",
    763					 * so setting ubifs to read-only,
    764					 * and then executing sync wbuf will
    765					 * return -EROFS and enter the "out"
    766					 * error branch.
    767					 */
    768					ubifs_ro_mode(c, ret);
    769				}
    770				/*  Maybe double return LEB if goto out */
    771				lp.lnum = -1;
    772				break;
    773			}
    774			goto out;
    775		}
    776
    777		if (ret == LEB_FREED) {
    778			/* An LEB has been freed and is ready for use */
    779			dbg_gc("LEB %d freed, return", lp.lnum);
    780			ret = lp.lnum;
    781			break;
    782		}
    783
    784		if (ret == LEB_FREED_IDX) {
    785			/*
    786			 * This was an indexing LEB and it cannot be
    787			 * immediately used. And instead of requesting the
    788			 * commit straight away, we try to garbage collect some
    789			 * more.
    790			 */
    791			dbg_gc("indexing LEB %d freed, continue", lp.lnum);
    792			continue;
    793		}
    794
    795		ubifs_assert(c, ret == LEB_RETAINED);
    796		space_after = c->leb_size - wbuf->offs - wbuf->used;
    797		dbg_gc("LEB %d retained, freed %d bytes", lp.lnum,
    798		       space_after - space_before);
    799
    800		if (space_after > space_before) {
    801			/* GC makes progress, keep working */
    802			min_space >>= 1;
    803			if (min_space < c->dead_wm)
    804				min_space = c->dead_wm;
    805			continue;
    806		}
    807
    808		dbg_gc("did not make progress");
    809
    810		/*
    811		 * GC moved an LEB bud have not done any progress. This means
    812		 * that the previous GC head LEB contained too few free space
    813		 * and the LEB which was GC'ed contained only large nodes which
    814		 * did not fit that space.
    815		 *
    816		 * We can do 2 things:
    817		 * 1. pick another LEB in a hope it'll contain a small node
    818		 *    which will fit the space we have at the end of current GC
    819		 *    head LEB, but there is no guarantee, so we try this out
    820		 *    unless we have already been working for too long;
    821		 * 2. request an LEB with more dirty space, which will force
    822		 *    'ubifs_find_dirty_leb()' to start scanning the lprops
    823		 *    table, instead of just picking one from the heap
    824		 *    (previously it already picked the dirtiest LEB).
    825		 */
    826		if (i < SOFT_LEBS_LIMIT) {
    827			dbg_gc("try again");
    828			continue;
    829		}
    830
    831		min_space <<= 1;
    832		if (min_space > c->dark_wm)
    833			min_space = c->dark_wm;
    834		dbg_gc("set min. space to %d", min_space);
    835	}
    836
    837	if (ret == -ENOSPC && !list_empty(&c->idx_gc)) {
    838		dbg_gc("no space, some index LEBs GC'ed, -EAGAIN");
    839		ubifs_commit_required(c);
    840		ret = -EAGAIN;
    841	}
    842
    843	err = ubifs_wbuf_sync_nolock(wbuf);
    844	if (!err)
    845		err = ubifs_leb_unmap(c, c->gc_lnum);
    846	if (err) {
    847		ret = err;
    848		goto out;
    849	}
    850out_unlock:
    851	mutex_unlock(&wbuf->io_mutex);
    852	return ret;
    853
    854out:
    855	ubifs_assert(c, ret < 0);
    856	ubifs_assert(c, ret != -ENOSPC && ret != -EAGAIN);
    857	ubifs_wbuf_sync_nolock(wbuf);
    858	ubifs_ro_mode(c, ret);
    859	mutex_unlock(&wbuf->io_mutex);
    860	if (lp.lnum != -1)
    861		ubifs_return_leb(c, lp.lnum);
    862	return ret;
    863}
    864
    865/**
    866 * ubifs_gc_start_commit - garbage collection at start of commit.
    867 * @c: UBIFS file-system description object
    868 *
    869 * If a LEB has only dirty and free space, then we may safely unmap it and make
    870 * it free.  Note, we cannot do this with indexing LEBs because dirty space may
    871 * correspond index nodes that are required for recovery.  In that case, the
    872 * LEB cannot be unmapped until after the next commit.
    873 *
    874 * This function returns %0 upon success and a negative error code upon failure.
    875 */
    876int ubifs_gc_start_commit(struct ubifs_info *c)
    877{
    878	struct ubifs_gced_idx_leb *idx_gc;
    879	const struct ubifs_lprops *lp;
    880	int err = 0, flags;
    881
    882	ubifs_get_lprops(c);
    883
    884	/*
    885	 * Unmap (non-index) freeable LEBs. Note that recovery requires that all
    886	 * wbufs are sync'd before this, which is done in 'do_commit()'.
    887	 */
    888	while (1) {
    889		lp = ubifs_fast_find_freeable(c);
    890		if (!lp)
    891			break;
    892		ubifs_assert(c, !(lp->flags & LPROPS_TAKEN));
    893		ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
    894		err = ubifs_leb_unmap(c, lp->lnum);
    895		if (err)
    896			goto out;
    897		lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
    898		if (IS_ERR(lp)) {
    899			err = PTR_ERR(lp);
    900			goto out;
    901		}
    902		ubifs_assert(c, !(lp->flags & LPROPS_TAKEN));
    903		ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
    904	}
    905
    906	/* Mark GC'd index LEBs OK to unmap after this commit finishes */
    907	list_for_each_entry(idx_gc, &c->idx_gc, list)
    908		idx_gc->unmap = 1;
    909
    910	/* Record index freeable LEBs for unmapping after commit */
    911	while (1) {
    912		lp = ubifs_fast_find_frdi_idx(c);
    913		if (IS_ERR(lp)) {
    914			err = PTR_ERR(lp);
    915			goto out;
    916		}
    917		if (!lp)
    918			break;
    919		idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
    920		if (!idx_gc) {
    921			err = -ENOMEM;
    922			goto out;
    923		}
    924		ubifs_assert(c, !(lp->flags & LPROPS_TAKEN));
    925		ubifs_assert(c, lp->flags & LPROPS_INDEX);
    926		/* Don't release the LEB until after the next commit */
    927		flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
    928		lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
    929		if (IS_ERR(lp)) {
    930			err = PTR_ERR(lp);
    931			kfree(idx_gc);
    932			goto out;
    933		}
    934		ubifs_assert(c, lp->flags & LPROPS_TAKEN);
    935		ubifs_assert(c, !(lp->flags & LPROPS_INDEX));
    936		idx_gc->lnum = lp->lnum;
    937		idx_gc->unmap = 1;
    938		list_add(&idx_gc->list, &c->idx_gc);
    939	}
    940out:
    941	ubifs_release_lprops(c);
    942	return err;
    943}
    944
    945/**
    946 * ubifs_gc_end_commit - garbage collection at end of commit.
    947 * @c: UBIFS file-system description object
    948 *
    949 * This function completes out-of-place garbage collection of index LEBs.
    950 */
    951int ubifs_gc_end_commit(struct ubifs_info *c)
    952{
    953	struct ubifs_gced_idx_leb *idx_gc, *tmp;
    954	struct ubifs_wbuf *wbuf;
    955	int err = 0;
    956
    957	wbuf = &c->jheads[GCHD].wbuf;
    958	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
    959	list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list)
    960		if (idx_gc->unmap) {
    961			dbg_gc("LEB %d", idx_gc->lnum);
    962			err = ubifs_leb_unmap(c, idx_gc->lnum);
    963			if (err)
    964				goto out;
    965			err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
    966					  LPROPS_NC, 0, LPROPS_TAKEN, -1);
    967			if (err)
    968				goto out;
    969			list_del(&idx_gc->list);
    970			kfree(idx_gc);
    971		}
    972out:
    973	mutex_unlock(&wbuf->io_mutex);
    974	return err;
    975}
    976
    977/**
    978 * ubifs_destroy_idx_gc - destroy idx_gc list.
    979 * @c: UBIFS file-system description object
    980 *
    981 * This function destroys the @c->idx_gc list. It is called when unmounting
    982 * so locks are not needed. Returns zero in case of success and a negative
    983 * error code in case of failure.
    984 */
    985void ubifs_destroy_idx_gc(struct ubifs_info *c)
    986{
    987	while (!list_empty(&c->idx_gc)) {
    988		struct ubifs_gced_idx_leb *idx_gc;
    989
    990		idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
    991				    list);
    992		c->idx_gc_cnt -= 1;
    993		list_del(&idx_gc->list);
    994		kfree(idx_gc);
    995	}
    996}
    997
    998/**
    999 * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
   1000 * @c: UBIFS file-system description object
   1001 *
   1002 * Called during start commit so locks are not needed.
   1003 */
   1004int ubifs_get_idx_gc_leb(struct ubifs_info *c)
   1005{
   1006	struct ubifs_gced_idx_leb *idx_gc;
   1007	int lnum;
   1008
   1009	if (list_empty(&c->idx_gc))
   1010		return -ENOSPC;
   1011	idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list);
   1012	lnum = idx_gc->lnum;
   1013	/* c->idx_gc_cnt is updated by the caller when lprops are updated */
   1014	list_del(&idx_gc->list);
   1015	kfree(idx_gc);
   1016	return lnum;
   1017}