cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

node.c (83395B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * fs/f2fs/node.c
      4 *
      5 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
      6 *             http://www.samsung.com/
      7 */
      8#include <linux/fs.h>
      9#include <linux/f2fs_fs.h>
     10#include <linux/mpage.h>
     11#include <linux/sched/mm.h>
     12#include <linux/blkdev.h>
     13#include <linux/pagevec.h>
     14#include <linux/swap.h>
     15
     16#include "f2fs.h"
     17#include "node.h"
     18#include "segment.h"
     19#include "xattr.h"
     20#include "iostat.h"
     21#include <trace/events/f2fs.h>
     22
      23#define on_f2fs_build_free_nids(nm_i) mutex_is_locked(&(nm_i)->build_lock)
     24
     25static struct kmem_cache *nat_entry_slab;
     26static struct kmem_cache *free_nid_slab;
     27static struct kmem_cache *nat_entry_set_slab;
     28static struct kmem_cache *fsync_node_entry_slab;
     29
     30/*
     31 * Check whether the given nid is within node id range.
     32 */
     33int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
     34{
     35	if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) {
     36		set_sbi_flag(sbi, SBI_NEED_FSCK);
     37		f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.",
     38			  __func__, nid);
     39		return -EFSCORRUPTED;
     40	}
     41	return 0;
     42}
     43
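        /*
         * Check whether there is enough memory headroom for the in-memory
         * cache of the given type to keep growing, based on its current
         * footprint and its share of low memory scaled by ram_thresh.
         */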
     44bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
     45{
     46	struct f2fs_nm_info *nm_i = NM_I(sbi);
     47	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
     48	struct sysinfo val;
     49	unsigned long avail_ram;
     50	unsigned long mem_size = 0;
     51	bool res = false;
     52
     53	if (!nm_i)
     54		return true;
     55
     56	si_meminfo(&val);
     57
     58	/* only uses low memory */
     59	avail_ram = val.totalram - val.totalhigh;
     60
     61	/*
      62	 * give 25%, 25%, 50%, 50%, 50% memory for each component respectively
     63	 */
     64	if (type == FREE_NIDS) {
     65		mem_size = (nm_i->nid_cnt[FREE_NID] *
     66				sizeof(struct free_nid)) >> PAGE_SHIFT;
     67		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
     68	} else if (type == NAT_ENTRIES) {
     69		mem_size = (nm_i->nat_cnt[TOTAL_NAT] *
     70				sizeof(struct nat_entry)) >> PAGE_SHIFT;
     71		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
     72		if (excess_cached_nats(sbi))
     73			res = false;
     74	} else if (type == DIRTY_DENTS) {
     75		if (sbi->sb->s_bdi->wb.dirty_exceeded)
     76			return false;
     77		mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
     78		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
     79	} else if (type == INO_ENTRIES) {
     80		int i;
     81
     82		for (i = 0; i < MAX_INO_ENTRY; i++)
     83			mem_size += sbi->im[i].ino_num *
     84						sizeof(struct ino_entry);
     85		mem_size >>= PAGE_SHIFT;
     86		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
     87	} else if (type == EXTENT_CACHE) {
     88		mem_size = (atomic_read(&sbi->total_ext_tree) *
     89				sizeof(struct extent_tree) +
     90				atomic_read(&sbi->total_ext_node) *
     91				sizeof(struct extent_node)) >> PAGE_SHIFT;
     92		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
     93	} else if (type == DISCARD_CACHE) {
     94		mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
     95				sizeof(struct discard_cmd)) >> PAGE_SHIFT;
     96		res = mem_size < (avail_ram * nm_i->ram_thresh / 100);
     97	} else if (type == COMPRESS_PAGE) {
     98#ifdef CONFIG_F2FS_FS_COMPRESSION
     99		unsigned long free_ram = val.freeram;
    100
    101		/*
     102		 * If free memory is lower than the watermark, or the cached page
     103		 * count exceeds the threshold, deny caching compressed pages.
    104		 */
    105		res = (free_ram > avail_ram * sbi->compress_watermark / 100) &&
    106			(COMPRESS_MAPPING(sbi)->nrpages <
    107			 free_ram * sbi->compress_percent / 100);
    108#else
    109		res = false;
    110#endif
    111	} else {
    112		if (!sbi->sb->s_bdi->wb.dirty_exceeded)
    113			return true;
    114	}
    115	return res;
    116}
    117
    118static void clear_node_page_dirty(struct page *page)
    119{
    120	if (PageDirty(page)) {
    121		f2fs_clear_page_cache_dirty_tag(page);
    122		clear_page_dirty_for_io(page);
    123		dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
    124	}
    125	ClearPageUptodate(page);
    126}
    127
    128static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
    129{
    130	return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid));
    131}
    132
    133static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
    134{
    135	struct page *src_page;
    136	struct page *dst_page;
    137	pgoff_t dst_off;
    138	void *src_addr;
    139	void *dst_addr;
    140	struct f2fs_nm_info *nm_i = NM_I(sbi);
    141
    142	dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid));
    143
    144	/* get current nat block page with lock */
    145	src_page = get_current_nat_page(sbi, nid);
    146	if (IS_ERR(src_page))
    147		return src_page;
    148	dst_page = f2fs_grab_meta_page(sbi, dst_off);
    149	f2fs_bug_on(sbi, PageDirty(src_page));
    150
    151	src_addr = page_address(src_page);
    152	dst_addr = page_address(dst_page);
    153	memcpy(dst_addr, src_addr, PAGE_SIZE);
    154	set_page_dirty(dst_page);
    155	f2fs_put_page(src_page, 1);
    156
    157	set_to_next_nat(nm_i, nid);
    158
    159	return dst_page;
    160}
    161
    162static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi,
    163						nid_t nid, bool no_fail)
    164{
    165	struct nat_entry *new;
    166
    167	new = f2fs_kmem_cache_alloc(nat_entry_slab,
    168					GFP_F2FS_ZERO, no_fail, sbi);
    169	if (new) {
    170		nat_set_nid(new, nid);
    171		nat_reset_flag(new);
    172	}
    173	return new;
    174}
    175
    176static void __free_nat_entry(struct nat_entry *e)
    177{
    178	kmem_cache_free(nat_entry_slab, e);
    179}
    180
    181/* must be locked by nat_tree_lock */
    182static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
    183	struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail)
    184{
    185	if (no_fail)
    186		f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne);
    187	else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne))
    188		return NULL;
    189
    190	if (raw_ne)
    191		node_info_from_raw_nat(&ne->ni, raw_ne);
    192
    193	spin_lock(&nm_i->nat_list_lock);
    194	list_add_tail(&ne->list, &nm_i->nat_entries);
    195	spin_unlock(&nm_i->nat_list_lock);
    196
    197	nm_i->nat_cnt[TOTAL_NAT]++;
    198	nm_i->nat_cnt[RECLAIMABLE_NAT]++;
    199	return ne;
    200}
    201
    202static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
    203{
    204	struct nat_entry *ne;
    205
    206	ne = radix_tree_lookup(&nm_i->nat_root, n);
    207
     208	/* for a recently accessed nat entry, move it to the tail of the lru list */
    209	if (ne && !get_nat_flag(ne, IS_DIRTY)) {
    210		spin_lock(&nm_i->nat_list_lock);
    211		if (!list_empty(&ne->list))
    212			list_move_tail(&ne->list, &nm_i->nat_entries);
    213		spin_unlock(&nm_i->nat_list_lock);
    214	}
    215
    216	return ne;
    217}
    218
    219static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
    220		nid_t start, unsigned int nr, struct nat_entry **ep)
    221{
    222	return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
    223}
    224
    225static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
    226{
    227	radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
    228	nm_i->nat_cnt[TOTAL_NAT]--;
    229	nm_i->nat_cnt[RECLAIMABLE_NAT]--;
    230	__free_nat_entry(e);
    231}
    232
    233static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
    234							struct nat_entry *ne)
    235{
    236	nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
    237	struct nat_entry_set *head;
    238
    239	head = radix_tree_lookup(&nm_i->nat_set_root, set);
    240	if (!head) {
    241		head = f2fs_kmem_cache_alloc(nat_entry_set_slab,
    242						GFP_NOFS, true, NULL);
    243
    244		INIT_LIST_HEAD(&head->entry_list);
    245		INIT_LIST_HEAD(&head->set_list);
    246		head->set = set;
    247		head->entry_cnt = 0;
    248		f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
    249	}
    250	return head;
    251}
    252
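        /*
         * Mark a nat entry dirty: account it in the dirty/reclaimable counters
         * and, unless it still points at NEW_ADDR, link it into its per-NAT-block
         * set so it is written back with that block at checkpoint time.
         */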
    253static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
    254						struct nat_entry *ne)
    255{
    256	struct nat_entry_set *head;
    257	bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR;
    258
    259	if (!new_ne)
    260		head = __grab_nat_entry_set(nm_i, ne);
    261
    262	/*
     263	 * update entry_cnt in the following cases:
     264	 * 1. a NEW_ADDR entry is updated to a valid block address;
     265	 * 2. an old block address is updated to a new one;
    266	 */
    267	if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
    268				!get_nat_flag(ne, IS_DIRTY)))
    269		head->entry_cnt++;
    270
    271	set_nat_flag(ne, IS_PREALLOC, new_ne);
    272
    273	if (get_nat_flag(ne, IS_DIRTY))
    274		goto refresh_list;
    275
    276	nm_i->nat_cnt[DIRTY_NAT]++;
    277	nm_i->nat_cnt[RECLAIMABLE_NAT]--;
    278	set_nat_flag(ne, IS_DIRTY, true);
    279refresh_list:
    280	spin_lock(&nm_i->nat_list_lock);
    281	if (new_ne)
    282		list_del_init(&ne->list);
    283	else
    284		list_move_tail(&ne->list, &head->entry_list);
    285	spin_unlock(&nm_i->nat_list_lock);
    286}
    287
    288static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
    289		struct nat_entry_set *set, struct nat_entry *ne)
    290{
    291	spin_lock(&nm_i->nat_list_lock);
    292	list_move_tail(&ne->list, &nm_i->nat_entries);
    293	spin_unlock(&nm_i->nat_list_lock);
    294
    295	set_nat_flag(ne, IS_DIRTY, false);
    296	set->entry_cnt--;
    297	nm_i->nat_cnt[DIRTY_NAT]--;
    298	nm_i->nat_cnt[RECLAIMABLE_NAT]++;
    299}
    300
    301static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
    302		nid_t start, unsigned int nr, struct nat_entry_set **ep)
    303{
    304	return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
    305							start, nr);
    306}
    307
    308bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
    309{
    310	return NODE_MAPPING(sbi) == page->mapping &&
    311			IS_DNODE(page) && is_cold_node(page);
    312}
    313
    314void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
    315{
    316	spin_lock_init(&sbi->fsync_node_lock);
    317	INIT_LIST_HEAD(&sbi->fsync_node_list);
    318	sbi->fsync_seg_id = 0;
    319	sbi->fsync_node_num = 0;
    320}
    321
    322static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
    323							struct page *page)
    324{
    325	struct fsync_node_entry *fn;
    326	unsigned long flags;
    327	unsigned int seq_id;
    328
    329	fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab,
    330					GFP_NOFS, true, NULL);
    331
    332	get_page(page);
    333	fn->page = page;
    334	INIT_LIST_HEAD(&fn->list);
    335
    336	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
    337	list_add_tail(&fn->list, &sbi->fsync_node_list);
    338	fn->seq_id = sbi->fsync_seg_id++;
    339	seq_id = fn->seq_id;
    340	sbi->fsync_node_num++;
    341	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
    342
    343	return seq_id;
    344}
    345
    346void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
    347{
    348	struct fsync_node_entry *fn;
    349	unsigned long flags;
    350
    351	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
    352	list_for_each_entry(fn, &sbi->fsync_node_list, list) {
    353		if (fn->page == page) {
    354			list_del(&fn->list);
    355			sbi->fsync_node_num--;
    356			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
    357			kmem_cache_free(fsync_node_entry_slab, fn);
    358			put_page(page);
    359			return;
    360		}
    361	}
    362	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
    363	f2fs_bug_on(sbi, 1);
    364}
    365
    366void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi)
    367{
    368	unsigned long flags;
    369
    370	spin_lock_irqsave(&sbi->fsync_node_lock, flags);
    371	sbi->fsync_seg_id = 0;
    372	spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
    373}
    374
    375int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
    376{
    377	struct f2fs_nm_info *nm_i = NM_I(sbi);
    378	struct nat_entry *e;
    379	bool need = false;
    380
    381	f2fs_down_read(&nm_i->nat_tree_lock);
    382	e = __lookup_nat_cache(nm_i, nid);
    383	if (e) {
    384		if (!get_nat_flag(e, IS_CHECKPOINTED) &&
    385				!get_nat_flag(e, HAS_FSYNCED_INODE))
    386			need = true;
    387	}
    388	f2fs_up_read(&nm_i->nat_tree_lock);
    389	return need;
    390}
    391
    392bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
    393{
    394	struct f2fs_nm_info *nm_i = NM_I(sbi);
    395	struct nat_entry *e;
    396	bool is_cp = true;
    397
    398	f2fs_down_read(&nm_i->nat_tree_lock);
    399	e = __lookup_nat_cache(nm_i, nid);
    400	if (e && !get_nat_flag(e, IS_CHECKPOINTED))
    401		is_cp = false;
    402	f2fs_up_read(&nm_i->nat_tree_lock);
    403	return is_cp;
    404}
    405
    406bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
    407{
    408	struct f2fs_nm_info *nm_i = NM_I(sbi);
    409	struct nat_entry *e;
    410	bool need_update = true;
    411
    412	f2fs_down_read(&nm_i->nat_tree_lock);
    413	e = __lookup_nat_cache(nm_i, ino);
    414	if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
    415			(get_nat_flag(e, IS_CHECKPOINTED) ||
    416			 get_nat_flag(e, HAS_FSYNCED_INODE)))
    417		need_update = false;
    418	f2fs_up_read(&nm_i->nat_tree_lock);
    419	return need_update;
    420}
    421
    422/* must be locked by nat_tree_lock */
    423static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
    424						struct f2fs_nat_entry *ne)
    425{
    426	struct f2fs_nm_info *nm_i = NM_I(sbi);
    427	struct nat_entry *new, *e;
    428
    429	/* Let's mitigate lock contention of nat_tree_lock during checkpoint */
    430	if (f2fs_rwsem_is_locked(&sbi->cp_global_sem))
    431		return;
    432
    433	new = __alloc_nat_entry(sbi, nid, false);
    434	if (!new)
    435		return;
    436
    437	f2fs_down_write(&nm_i->nat_tree_lock);
    438	e = __lookup_nat_cache(nm_i, nid);
    439	if (!e)
    440		e = __init_nat_entry(nm_i, new, ne, false);
    441	else
    442		f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
    443				nat_get_blkaddr(e) !=
    444					le32_to_cpu(ne->block_addr) ||
    445				nat_get_version(e) != ne->version);
    446	f2fs_up_write(&nm_i->nat_tree_lock);
    447	if (e != new)
    448		__free_nat_entry(new);
    449}
    450
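        /*
         * Record the new block address of a node in the nat cache and mark the
         * entry dirty so the update reaches the NAT area at the next checkpoint;
         * fsync-related flags on the owning inode's entry are refreshed as well.
         */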
    451static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
    452			block_t new_blkaddr, bool fsync_done)
    453{
    454	struct f2fs_nm_info *nm_i = NM_I(sbi);
    455	struct nat_entry *e;
    456	struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
    457
    458	f2fs_down_write(&nm_i->nat_tree_lock);
    459	e = __lookup_nat_cache(nm_i, ni->nid);
    460	if (!e) {
    461		e = __init_nat_entry(nm_i, new, NULL, true);
    462		copy_node_info(&e->ni, ni);
    463		f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
    464	} else if (new_blkaddr == NEW_ADDR) {
    465		/*
     466		 * when a nid is reallocated, the previous nat entry can
     467		 * remain in the nat cache.
     468		 * So, reinitialize it with the new information.
    469		 */
    470		copy_node_info(&e->ni, ni);
    471		f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
    472	}
    473	/* let's free early to reduce memory consumption */
    474	if (e != new)
    475		__free_nat_entry(new);
    476
    477	/* sanity check */
    478	f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
    479	f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
    480			new_blkaddr == NULL_ADDR);
    481	f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
    482			new_blkaddr == NEW_ADDR);
    483	f2fs_bug_on(sbi, __is_valid_data_blkaddr(nat_get_blkaddr(e)) &&
    484			new_blkaddr == NEW_ADDR);
    485
    486	/* increment version no as node is removed */
    487	if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
    488		unsigned char version = nat_get_version(e);
    489
    490		nat_set_version(e, inc_node_version(version));
    491	}
    492
    493	/* change address */
    494	nat_set_blkaddr(e, new_blkaddr);
    495	if (!__is_valid_data_blkaddr(new_blkaddr))
    496		set_nat_flag(e, IS_CHECKPOINTED, false);
    497	__set_nat_cache_dirty(nm_i, e);
    498
    499	/* update fsync_mark if its inode nat entry is still alive */
    500	if (ni->nid != ni->ino)
    501		e = __lookup_nat_cache(nm_i, ni->ino);
    502	if (e) {
    503		if (fsync_done && ni->nid == ni->ino)
    504			set_nat_flag(e, HAS_FSYNCED_INODE, true);
    505		set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
    506	}
    507	f2fs_up_write(&nm_i->nat_tree_lock);
    508}
    509
    510int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
    511{
    512	struct f2fs_nm_info *nm_i = NM_I(sbi);
    513	int nr = nr_shrink;
    514
    515	if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock))
    516		return 0;
    517
    518	spin_lock(&nm_i->nat_list_lock);
    519	while (nr_shrink) {
    520		struct nat_entry *ne;
    521
    522		if (list_empty(&nm_i->nat_entries))
    523			break;
    524
    525		ne = list_first_entry(&nm_i->nat_entries,
    526					struct nat_entry, list);
    527		list_del(&ne->list);
    528		spin_unlock(&nm_i->nat_list_lock);
    529
    530		__del_from_nat_cache(nm_i, ne);
    531		nr_shrink--;
    532
    533		spin_lock(&nm_i->nat_list_lock);
    534	}
    535	spin_unlock(&nm_i->nat_list_lock);
    536
    537	f2fs_up_write(&nm_i->nat_tree_lock);
    538	return nr - nr_shrink;
    539}
    540
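        /*
         * Fill *ni for the given nid: check the nat cache first, then the NAT
         * journal of the current segment summary, and finally the on-disk NAT
         * block; the result is cached for later lookups.
         */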
    541int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
    542				struct node_info *ni, bool checkpoint_context)
    543{
    544	struct f2fs_nm_info *nm_i = NM_I(sbi);
    545	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
    546	struct f2fs_journal *journal = curseg->journal;
    547	nid_t start_nid = START_NID(nid);
    548	struct f2fs_nat_block *nat_blk;
    549	struct page *page = NULL;
    550	struct f2fs_nat_entry ne;
    551	struct nat_entry *e;
    552	pgoff_t index;
    553	block_t blkaddr;
    554	int i;
    555
    556	ni->nid = nid;
    557retry:
    558	/* Check nat cache */
    559	f2fs_down_read(&nm_i->nat_tree_lock);
    560	e = __lookup_nat_cache(nm_i, nid);
    561	if (e) {
    562		ni->ino = nat_get_ino(e);
    563		ni->blk_addr = nat_get_blkaddr(e);
    564		ni->version = nat_get_version(e);
    565		f2fs_up_read(&nm_i->nat_tree_lock);
    566		return 0;
    567	}
    568
    569	/*
     570	 * Check the current segment summary by trying to grab journal_rwsem first.
     571	 * This rwsem is on the critical path of checkpoint, which also takes the
     572	 * above nat_tree_lock. Therefore, if we fail to grab it here, drop the
     573	 * lock and retry rather than stalling the checkpoint.
    574	 */
    575	if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
    576		down_read(&curseg->journal_rwsem);
    577	} else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) ||
    578				!down_read_trylock(&curseg->journal_rwsem)) {
    579		f2fs_up_read(&nm_i->nat_tree_lock);
    580		goto retry;
    581	}
    582
    583	i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
    584	if (i >= 0) {
    585		ne = nat_in_journal(journal, i);
    586		node_info_from_raw_nat(ni, &ne);
    587	}
     588	up_read(&curseg->journal_rwsem);
    589	if (i >= 0) {
    590		f2fs_up_read(&nm_i->nat_tree_lock);
    591		goto cache;
    592	}
    593
    594	/* Fill node_info from nat page */
    595	index = current_nat_addr(sbi, nid);
    596	f2fs_up_read(&nm_i->nat_tree_lock);
    597
    598	page = f2fs_get_meta_page(sbi, index);
    599	if (IS_ERR(page))
    600		return PTR_ERR(page);
    601
    602	nat_blk = (struct f2fs_nat_block *)page_address(page);
    603	ne = nat_blk->entries[nid - start_nid];
    604	node_info_from_raw_nat(ni, &ne);
    605	f2fs_put_page(page, 1);
    606cache:
    607	blkaddr = le32_to_cpu(ne.block_addr);
    608	if (__is_valid_data_blkaddr(blkaddr) &&
    609		!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE))
    610		return -EFAULT;
    611
    612	/* cache nat entry */
    613	cache_nat_entry(sbi, nid, &ne);
    614	return 0;
    615}
    616
    617/*
    618 * readahead MAX_RA_NODE number of node pages.
    619 */
    620static void f2fs_ra_node_pages(struct page *parent, int start, int n)
    621{
    622	struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
    623	struct blk_plug plug;
    624	int i, end;
    625	nid_t nid;
    626
    627	blk_start_plug(&plug);
    628
    629	/* Then, try readahead for siblings of the desired node */
    630	end = start + n;
    631	end = min(end, NIDS_PER_BLOCK);
    632	for (i = start; i < end; i++) {
    633		nid = get_nid(parent, i, false);
    634		f2fs_ra_node_page(sbi, nid);
    635	}
    636
    637	blk_finish_plug(&plug);
    638}
    639
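        /*
         * After a dnode lookup stopped at a missing node (-ENOENT), return the
         * page offset just past the range covered by that node so the caller
         * can skip over the hole.
         */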
    640pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
    641{
    642	const long direct_index = ADDRS_PER_INODE(dn->inode);
    643	const long direct_blks = ADDRS_PER_BLOCK(dn->inode);
    644	const long indirect_blks = ADDRS_PER_BLOCK(dn->inode) * NIDS_PER_BLOCK;
    645	unsigned int skipped_unit = ADDRS_PER_BLOCK(dn->inode);
    646	int cur_level = dn->cur_level;
    647	int max_level = dn->max_level;
    648	pgoff_t base = 0;
    649
    650	if (!dn->max_level)
    651		return pgofs + 1;
    652
    653	while (max_level-- > cur_level)
    654		skipped_unit *= NIDS_PER_BLOCK;
    655
    656	switch (dn->max_level) {
    657	case 3:
    658		base += 2 * indirect_blks;
    659		fallthrough;
    660	case 2:
    661		base += 2 * direct_blks;
    662		fallthrough;
    663	case 1:
    664		base += direct_index;
    665		break;
    666	default:
    667		f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
    668	}
    669
    670	return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
    671}
    672
    673/*
    674 * The maximum depth is four.
    675 * Offset[0] will have raw inode offset.
    676 */
    677static int get_node_path(struct inode *inode, long block,
    678				int offset[4], unsigned int noffset[4])
    679{
    680	const long direct_index = ADDRS_PER_INODE(inode);
    681	const long direct_blks = ADDRS_PER_BLOCK(inode);
    682	const long dptrs_per_blk = NIDS_PER_BLOCK;
    683	const long indirect_blks = ADDRS_PER_BLOCK(inode) * NIDS_PER_BLOCK;
    684	const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
    685	int n = 0;
    686	int level = 0;
    687
    688	noffset[0] = 0;
    689
    690	if (block < direct_index) {
    691		offset[n] = block;
    692		goto got;
    693	}
    694	block -= direct_index;
    695	if (block < direct_blks) {
    696		offset[n++] = NODE_DIR1_BLOCK;
    697		noffset[n] = 1;
    698		offset[n] = block;
    699		level = 1;
    700		goto got;
    701	}
    702	block -= direct_blks;
    703	if (block < direct_blks) {
    704		offset[n++] = NODE_DIR2_BLOCK;
    705		noffset[n] = 2;
    706		offset[n] = block;
    707		level = 1;
    708		goto got;
    709	}
    710	block -= direct_blks;
    711	if (block < indirect_blks) {
    712		offset[n++] = NODE_IND1_BLOCK;
    713		noffset[n] = 3;
    714		offset[n++] = block / direct_blks;
    715		noffset[n] = 4 + offset[n - 1];
    716		offset[n] = block % direct_blks;
    717		level = 2;
    718		goto got;
    719	}
    720	block -= indirect_blks;
    721	if (block < indirect_blks) {
    722		offset[n++] = NODE_IND2_BLOCK;
    723		noffset[n] = 4 + dptrs_per_blk;
    724		offset[n++] = block / direct_blks;
    725		noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
    726		offset[n] = block % direct_blks;
    727		level = 2;
    728		goto got;
    729	}
    730	block -= indirect_blks;
    731	if (block < dindirect_blks) {
    732		offset[n++] = NODE_DIND_BLOCK;
    733		noffset[n] = 5 + (dptrs_per_blk * 2);
    734		offset[n++] = block / indirect_blks;
    735		noffset[n] = 6 + (dptrs_per_blk * 2) +
    736			      offset[n - 1] * (dptrs_per_blk + 1);
    737		offset[n++] = (block / direct_blks) % dptrs_per_blk;
    738		noffset[n] = 7 + (dptrs_per_blk * 2) +
    739			      offset[n - 2] * (dptrs_per_blk + 1) +
    740			      offset[n - 1];
    741		offset[n] = block % direct_blks;
    742		level = 3;
    743		goto got;
    744	} else {
    745		return -E2BIG;
    746	}
    747got:
    748	return level;
    749}
    750
    751/*
    752 * Caller should call f2fs_put_dnode(dn).
    753 * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
    754 * f2fs_unlock_op() only if mode is set with ALLOC_NODE.
    755 */
    756int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
    757{
    758	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
    759	struct page *npage[4];
    760	struct page *parent = NULL;
    761	int offset[4];
    762	unsigned int noffset[4];
    763	nid_t nids[4];
    764	int level, i = 0;
    765	int err = 0;
    766
    767	level = get_node_path(dn->inode, index, offset, noffset);
    768	if (level < 0)
    769		return level;
    770
    771	nids[0] = dn->inode->i_ino;
    772	npage[0] = dn->inode_page;
    773
    774	if (!npage[0]) {
    775		npage[0] = f2fs_get_node_page(sbi, nids[0]);
    776		if (IS_ERR(npage[0]))
    777			return PTR_ERR(npage[0]);
    778	}
    779
    780	/* if inline_data is set, should not report any block indices */
    781	if (f2fs_has_inline_data(dn->inode) && index) {
    782		err = -ENOENT;
    783		f2fs_put_page(npage[0], 1);
    784		goto release_out;
    785	}
    786
    787	parent = npage[0];
    788	if (level != 0)
    789		nids[1] = get_nid(parent, offset[0], true);
    790	dn->inode_page = npage[0];
    791	dn->inode_page_locked = true;
    792
    793	/* get indirect or direct nodes */
    794	for (i = 1; i <= level; i++) {
    795		bool done = false;
    796
    797		if (!nids[i] && mode == ALLOC_NODE) {
    798			/* alloc new node */
    799			if (!f2fs_alloc_nid(sbi, &(nids[i]))) {
    800				err = -ENOSPC;
    801				goto release_pages;
    802			}
    803
    804			dn->nid = nids[i];
    805			npage[i] = f2fs_new_node_page(dn, noffset[i]);
    806			if (IS_ERR(npage[i])) {
    807				f2fs_alloc_nid_failed(sbi, nids[i]);
    808				err = PTR_ERR(npage[i]);
    809				goto release_pages;
    810			}
    811
    812			set_nid(parent, offset[i - 1], nids[i], i == 1);
    813			f2fs_alloc_nid_done(sbi, nids[i]);
    814			done = true;
    815		} else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
    816			npage[i] = f2fs_get_node_page_ra(parent, offset[i - 1]);
    817			if (IS_ERR(npage[i])) {
    818				err = PTR_ERR(npage[i]);
    819				goto release_pages;
    820			}
    821			done = true;
    822		}
    823		if (i == 1) {
    824			dn->inode_page_locked = false;
    825			unlock_page(parent);
    826		} else {
    827			f2fs_put_page(parent, 1);
    828		}
    829
    830		if (!done) {
    831			npage[i] = f2fs_get_node_page(sbi, nids[i]);
    832			if (IS_ERR(npage[i])) {
    833				err = PTR_ERR(npage[i]);
    834				f2fs_put_page(npage[0], 0);
    835				goto release_out;
    836			}
    837		}
    838		if (i < level) {
    839			parent = npage[i];
    840			nids[i + 1] = get_nid(parent, offset[i], false);
    841		}
    842	}
    843	dn->nid = nids[level];
    844	dn->ofs_in_node = offset[level];
    845	dn->node_page = npage[level];
    846	dn->data_blkaddr = f2fs_data_blkaddr(dn);
    847
    848	if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) &&
    849					f2fs_sb_has_readonly(sbi)) {
    850		unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn);
    851		block_t blkaddr;
    852
    853		if (!c_len)
    854			goto out;
    855
    856		blkaddr = f2fs_data_blkaddr(dn);
    857		if (blkaddr == COMPRESS_ADDR)
    858			blkaddr = data_blkaddr(dn->inode, dn->node_page,
    859						dn->ofs_in_node + 1);
    860
    861		f2fs_update_extent_tree_range_compressed(dn->inode,
    862					index, blkaddr,
    863					F2FS_I(dn->inode)->i_cluster_size,
    864					c_len);
    865	}
    866out:
    867	return 0;
    868
    869release_pages:
    870	f2fs_put_page(parent, 1);
    871	if (i > 1)
    872		f2fs_put_page(npage[0], 0);
    873release_out:
    874	dn->inode_page = NULL;
    875	dn->node_page = NULL;
    876	if (err == -ENOENT) {
    877		dn->cur_level = i;
    878		dn->max_level = level;
    879		dn->ofs_in_node = offset[level];
    880	}
    881	return err;
    882}
    883
    884static int truncate_node(struct dnode_of_data *dn)
    885{
    886	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
    887	struct node_info ni;
    888	int err;
    889	pgoff_t index;
    890
    891	err = f2fs_get_node_info(sbi, dn->nid, &ni, false);
    892	if (err)
    893		return err;
    894
    895	/* Deallocate node address */
    896	f2fs_invalidate_blocks(sbi, ni.blk_addr);
    897	dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino);
    898	set_node_addr(sbi, &ni, NULL_ADDR, false);
    899
    900	if (dn->nid == dn->inode->i_ino) {
    901		f2fs_remove_orphan_inode(sbi, dn->nid);
    902		dec_valid_inode_count(sbi);
    903		f2fs_inode_synced(dn->inode);
    904	}
    905
    906	clear_node_page_dirty(dn->node_page);
    907	set_sbi_flag(sbi, SBI_IS_DIRTY);
    908
    909	index = dn->node_page->index;
    910	f2fs_put_page(dn->node_page, 1);
    911
    912	invalidate_mapping_pages(NODE_MAPPING(sbi),
    913			index, index);
    914
    915	dn->node_page = NULL;
    916	trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
    917
    918	return 0;
    919}
    920
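        /*
         * Truncate a direct node: free all of its data blocks and then the node
         * itself. Returns 1 on success (so callers can count freed nodes), or a
         * negative error code.
         */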
    921static int truncate_dnode(struct dnode_of_data *dn)
    922{
    923	struct page *page;
    924	int err;
    925
    926	if (dn->nid == 0)
    927		return 1;
    928
    929	/* get direct node */
    930	page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid);
    931	if (PTR_ERR(page) == -ENOENT)
    932		return 1;
    933	else if (IS_ERR(page))
    934		return PTR_ERR(page);
    935
    936	/* Make dnode_of_data for parameter */
    937	dn->node_page = page;
    938	dn->ofs_in_node = 0;
    939	f2fs_truncate_data_blocks(dn);
    940	err = truncate_node(dn);
    941	if (err)
    942		return err;
    943
    944	return 1;
    945}
    946
    947static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
    948						int ofs, int depth)
    949{
    950	struct dnode_of_data rdn = *dn;
    951	struct page *page;
    952	struct f2fs_node *rn;
    953	nid_t child_nid;
    954	unsigned int child_nofs;
    955	int freed = 0;
    956	int i, ret;
    957
    958	if (dn->nid == 0)
    959		return NIDS_PER_BLOCK + 1;
    960
    961	trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
    962
    963	page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid);
    964	if (IS_ERR(page)) {
    965		trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
    966		return PTR_ERR(page);
    967	}
    968
    969	f2fs_ra_node_pages(page, ofs, NIDS_PER_BLOCK);
    970
    971	rn = F2FS_NODE(page);
    972	if (depth < 3) {
    973		for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
    974			child_nid = le32_to_cpu(rn->in.nid[i]);
    975			if (child_nid == 0)
    976				continue;
    977			rdn.nid = child_nid;
    978			ret = truncate_dnode(&rdn);
    979			if (ret < 0)
    980				goto out_err;
    981			if (set_nid(page, i, 0, false))
    982				dn->node_changed = true;
    983		}
    984	} else {
    985		child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
    986		for (i = ofs; i < NIDS_PER_BLOCK; i++) {
    987			child_nid = le32_to_cpu(rn->in.nid[i]);
    988			if (child_nid == 0) {
    989				child_nofs += NIDS_PER_BLOCK + 1;
    990				continue;
    991			}
    992			rdn.nid = child_nid;
    993			ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
    994			if (ret == (NIDS_PER_BLOCK + 1)) {
    995				if (set_nid(page, i, 0, false))
    996					dn->node_changed = true;
    997				child_nofs += ret;
    998			} else if (ret < 0 && ret != -ENOENT) {
    999				goto out_err;
   1000			}
   1001		}
   1002		freed = child_nofs;
   1003	}
   1004
   1005	if (!ofs) {
   1006		/* remove current indirect node */
   1007		dn->node_page = page;
   1008		ret = truncate_node(dn);
   1009		if (ret)
   1010			goto out_err;
   1011		freed++;
   1012	} else {
   1013		f2fs_put_page(page, 1);
   1014	}
   1015	trace_f2fs_truncate_nodes_exit(dn->inode, freed);
   1016	return freed;
   1017
   1018out_err:
   1019	f2fs_put_page(page, 1);
   1020	trace_f2fs_truncate_nodes_exit(dn->inode, ret);
   1021	return ret;
   1022}
   1023
   1024static int truncate_partial_nodes(struct dnode_of_data *dn,
   1025			struct f2fs_inode *ri, int *offset, int depth)
   1026{
   1027	struct page *pages[2];
   1028	nid_t nid[3];
   1029	nid_t child_nid;
   1030	int err = 0;
   1031	int i;
   1032	int idx = depth - 2;
   1033
   1034	nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
   1035	if (!nid[0])
   1036		return 0;
   1037
   1038	/* get indirect nodes in the path */
   1039	for (i = 0; i < idx + 1; i++) {
    1040		/* the reference count will be increased */
   1041		pages[i] = f2fs_get_node_page(F2FS_I_SB(dn->inode), nid[i]);
   1042		if (IS_ERR(pages[i])) {
   1043			err = PTR_ERR(pages[i]);
   1044			idx = i - 1;
   1045			goto fail;
   1046		}
   1047		nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
   1048	}
   1049
   1050	f2fs_ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
   1051
   1052	/* free direct nodes linked to a partial indirect node */
   1053	for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
   1054		child_nid = get_nid(pages[idx], i, false);
   1055		if (!child_nid)
   1056			continue;
   1057		dn->nid = child_nid;
   1058		err = truncate_dnode(dn);
   1059		if (err < 0)
   1060			goto fail;
   1061		if (set_nid(pages[idx], i, 0, false))
   1062			dn->node_changed = true;
   1063	}
   1064
   1065	if (offset[idx + 1] == 0) {
   1066		dn->node_page = pages[idx];
   1067		dn->nid = nid[idx];
   1068		err = truncate_node(dn);
   1069		if (err)
   1070			goto fail;
   1071	} else {
   1072		f2fs_put_page(pages[idx], 1);
   1073	}
   1074	offset[idx]++;
   1075	offset[idx + 1] = 0;
   1076	idx--;
   1077fail:
   1078	for (i = idx; i >= 0; i--)
   1079		f2fs_put_page(pages[i], 1);
   1080
   1081	trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
   1082
   1083	return err;
   1084}
   1085
   1086/*
   1087 * All the block addresses of data and nodes should be nullified.
   1088 */
   1089int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
   1090{
   1091	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
   1092	int err = 0, cont = 1;
   1093	int level, offset[4], noffset[4];
   1094	unsigned int nofs = 0;
   1095	struct f2fs_inode *ri;
   1096	struct dnode_of_data dn;
   1097	struct page *page;
   1098
   1099	trace_f2fs_truncate_inode_blocks_enter(inode, from);
   1100
   1101	level = get_node_path(inode, from, offset, noffset);
   1102	if (level < 0) {
   1103		trace_f2fs_truncate_inode_blocks_exit(inode, level);
   1104		return level;
   1105	}
   1106
   1107	page = f2fs_get_node_page(sbi, inode->i_ino);
   1108	if (IS_ERR(page)) {
   1109		trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
   1110		return PTR_ERR(page);
   1111	}
   1112
   1113	set_new_dnode(&dn, inode, page, NULL, 0);
   1114	unlock_page(page);
   1115
   1116	ri = F2FS_INODE(page);
   1117	switch (level) {
   1118	case 0:
   1119	case 1:
   1120		nofs = noffset[1];
   1121		break;
   1122	case 2:
   1123		nofs = noffset[1];
   1124		if (!offset[level - 1])
   1125			goto skip_partial;
   1126		err = truncate_partial_nodes(&dn, ri, offset, level);
   1127		if (err < 0 && err != -ENOENT)
   1128			goto fail;
   1129		nofs += 1 + NIDS_PER_BLOCK;
   1130		break;
   1131	case 3:
   1132		nofs = 5 + 2 * NIDS_PER_BLOCK;
   1133		if (!offset[level - 1])
   1134			goto skip_partial;
   1135		err = truncate_partial_nodes(&dn, ri, offset, level);
   1136		if (err < 0 && err != -ENOENT)
   1137			goto fail;
   1138		break;
   1139	default:
   1140		BUG();
   1141	}
   1142
   1143skip_partial:
   1144	while (cont) {
   1145		dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
   1146		switch (offset[0]) {
   1147		case NODE_DIR1_BLOCK:
   1148		case NODE_DIR2_BLOCK:
   1149			err = truncate_dnode(&dn);
   1150			break;
   1151
   1152		case NODE_IND1_BLOCK:
   1153		case NODE_IND2_BLOCK:
   1154			err = truncate_nodes(&dn, nofs, offset[1], 2);
   1155			break;
   1156
   1157		case NODE_DIND_BLOCK:
   1158			err = truncate_nodes(&dn, nofs, offset[1], 3);
   1159			cont = 0;
   1160			break;
   1161
   1162		default:
   1163			BUG();
   1164		}
   1165		if (err < 0 && err != -ENOENT)
   1166			goto fail;
   1167		if (offset[1] == 0 &&
   1168				ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
   1169			lock_page(page);
   1170			BUG_ON(page->mapping != NODE_MAPPING(sbi));
   1171			f2fs_wait_on_page_writeback(page, NODE, true, true);
   1172			ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
   1173			set_page_dirty(page);
   1174			unlock_page(page);
   1175		}
   1176		offset[1] = 0;
   1177		offset[0]++;
   1178		nofs += err;
   1179	}
   1180fail:
   1181	f2fs_put_page(page, 0);
   1182	trace_f2fs_truncate_inode_blocks_exit(inode, err);
   1183	return err > 0 ? 0 : err;
   1184}
   1185
   1186/* caller must lock inode page */
   1187int f2fs_truncate_xattr_node(struct inode *inode)
   1188{
   1189	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
   1190	nid_t nid = F2FS_I(inode)->i_xattr_nid;
   1191	struct dnode_of_data dn;
   1192	struct page *npage;
   1193	int err;
   1194
   1195	if (!nid)
   1196		return 0;
   1197
   1198	npage = f2fs_get_node_page(sbi, nid);
   1199	if (IS_ERR(npage))
   1200		return PTR_ERR(npage);
   1201
   1202	set_new_dnode(&dn, inode, NULL, npage, nid);
   1203	err = truncate_node(&dn);
   1204	if (err) {
   1205		f2fs_put_page(npage, 1);
   1206		return err;
   1207	}
   1208
   1209	f2fs_i_xnid_write(inode, 0);
   1210
   1211	return 0;
   1212}
   1213
   1214/*
   1215 * Caller should grab and release a rwsem by calling f2fs_lock_op() and
   1216 * f2fs_unlock_op().
   1217 */
   1218int f2fs_remove_inode_page(struct inode *inode)
   1219{
   1220	struct dnode_of_data dn;
   1221	int err;
   1222
   1223	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
   1224	err = f2fs_get_dnode_of_data(&dn, 0, LOOKUP_NODE);
   1225	if (err)
   1226		return err;
   1227
   1228	err = f2fs_truncate_xattr_node(inode);
   1229	if (err) {
   1230		f2fs_put_dnode(&dn);
   1231		return err;
   1232	}
   1233
   1234	/* remove potential inline_data blocks */
   1235	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
   1236				S_ISLNK(inode->i_mode))
   1237		f2fs_truncate_data_blocks_range(&dn, 1);
   1238
   1239	/* 0 is possible, after f2fs_new_inode() has failed */
   1240	if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
   1241		f2fs_put_dnode(&dn);
   1242		return -EIO;
   1243	}
   1244
   1245	if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) {
   1246		f2fs_warn(F2FS_I_SB(inode),
   1247			"f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu",
   1248			inode->i_ino, (unsigned long long)inode->i_blocks);
   1249		set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
   1250	}
   1251
   1252	/* will put inode & node pages */
   1253	err = truncate_node(&dn);
   1254	if (err) {
   1255		f2fs_put_dnode(&dn);
   1256		return err;
   1257	}
   1258	return 0;
   1259}
   1260
   1261struct page *f2fs_new_inode_page(struct inode *inode)
   1262{
   1263	struct dnode_of_data dn;
   1264
   1265	/* allocate inode page for new inode */
   1266	set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
   1267
   1268	/* caller should f2fs_put_page(page, 1); */
   1269	return f2fs_new_node_page(&dn, 0);
   1270}
   1271
   1272struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
   1273{
   1274	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
   1275	struct node_info new_ni;
   1276	struct page *page;
   1277	int err;
   1278
   1279	if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
   1280		return ERR_PTR(-EPERM);
   1281
   1282	page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
   1283	if (!page)
   1284		return ERR_PTR(-ENOMEM);
   1285
   1286	if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs))))
   1287		goto fail;
   1288
   1289#ifdef CONFIG_F2FS_CHECK_FS
   1290	err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false);
   1291	if (err) {
   1292		dec_valid_node_count(sbi, dn->inode, !ofs);
   1293		goto fail;
   1294	}
   1295	f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
   1296#endif
   1297	new_ni.nid = dn->nid;
   1298	new_ni.ino = dn->inode->i_ino;
   1299	new_ni.blk_addr = NULL_ADDR;
   1300	new_ni.flag = 0;
   1301	new_ni.version = 0;
   1302	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
   1303
   1304	f2fs_wait_on_page_writeback(page, NODE, true, true);
   1305	fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
   1306	set_cold_node(page, S_ISDIR(dn->inode->i_mode));
   1307	if (!PageUptodate(page))
   1308		SetPageUptodate(page);
   1309	if (set_page_dirty(page))
   1310		dn->node_changed = true;
   1311
   1312	if (f2fs_has_xattr_block(ofs))
   1313		f2fs_i_xnid_write(dn->inode, dn->nid);
   1314
   1315	if (ofs == 0)
   1316		inc_valid_inode_count(sbi);
   1317	return page;
   1318
   1319fail:
   1320	clear_node_page_dirty(page);
   1321	f2fs_put_page(page, 1);
   1322	return ERR_PTR(err);
   1323}
   1324
   1325/*
    1326 * The caller must act on the return value as follows:
   1327 * 0: f2fs_put_page(page, 0)
   1328 * LOCKED_PAGE or error: f2fs_put_page(page, 1)
   1329 */
   1330static int read_node_page(struct page *page, int op_flags)
   1331{
   1332	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
   1333	struct node_info ni;
   1334	struct f2fs_io_info fio = {
   1335		.sbi = sbi,
   1336		.type = NODE,
   1337		.op = REQ_OP_READ,
   1338		.op_flags = op_flags,
   1339		.page = page,
   1340		.encrypted_page = NULL,
   1341	};
   1342	int err;
   1343
   1344	if (PageUptodate(page)) {
   1345		if (!f2fs_inode_chksum_verify(sbi, page)) {
   1346			ClearPageUptodate(page);
   1347			return -EFSBADCRC;
   1348		}
   1349		return LOCKED_PAGE;
   1350	}
   1351
   1352	err = f2fs_get_node_info(sbi, page->index, &ni, false);
   1353	if (err)
   1354		return err;
   1355
   1356	/* NEW_ADDR can be seen, after cp_error drops some dirty node pages */
   1357	if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) ||
   1358			is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
   1359		ClearPageUptodate(page);
   1360		return -ENOENT;
   1361	}
   1362
   1363	fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
   1364
   1365	err = f2fs_submit_page_bio(&fio);
   1366
   1367	if (!err)
   1368		f2fs_update_iostat(sbi, FS_NODE_READ_IO, F2FS_BLKSIZE);
   1369
   1370	return err;
   1371}
   1372
   1373/*
   1374 * Readahead a node page
   1375 */
   1376void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
   1377{
   1378	struct page *apage;
   1379	int err;
   1380
   1381	if (!nid)
   1382		return;
   1383	if (f2fs_check_nid_range(sbi, nid))
   1384		return;
   1385
   1386	apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
   1387	if (apage)
   1388		return;
   1389
   1390	apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
   1391	if (!apage)
   1392		return;
   1393
   1394	err = read_node_page(apage, REQ_RAHEAD);
   1395	f2fs_put_page(apage, err ? 1 : 0);
   1396}
   1397
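        /*
         * Grab the node page for @nid, reading it from disk if necessary and
         * optionally issuing readahead on sibling nodes of @parent, then verify
         * its checksum and that the footer nid matches.
         */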
   1398static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
   1399					struct page *parent, int start)
   1400{
   1401	struct page *page;
   1402	int err;
   1403
   1404	if (!nid)
   1405		return ERR_PTR(-ENOENT);
   1406	if (f2fs_check_nid_range(sbi, nid))
   1407		return ERR_PTR(-EINVAL);
   1408repeat:
   1409	page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
   1410	if (!page)
   1411		return ERR_PTR(-ENOMEM);
   1412
   1413	err = read_node_page(page, 0);
   1414	if (err < 0) {
   1415		goto out_put_err;
   1416	} else if (err == LOCKED_PAGE) {
   1417		err = 0;
   1418		goto page_hit;
   1419	}
   1420
   1421	if (parent)
   1422		f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE);
   1423
   1424	lock_page(page);
   1425
   1426	if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
   1427		f2fs_put_page(page, 1);
   1428		goto repeat;
   1429	}
   1430
   1431	if (unlikely(!PageUptodate(page))) {
   1432		err = -EIO;
   1433		goto out_err;
   1434	}
   1435
   1436	if (!f2fs_inode_chksum_verify(sbi, page)) {
   1437		err = -EFSBADCRC;
   1438		goto out_err;
   1439	}
   1440page_hit:
   1441	if (likely(nid == nid_of_node(page)))
   1442		return page;
   1443
   1444	f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
   1445			  nid, nid_of_node(page), ino_of_node(page),
   1446			  ofs_of_node(page), cpver_of_node(page),
   1447			  next_blkaddr_of_node(page));
   1448	set_sbi_flag(sbi, SBI_NEED_FSCK);
   1449	err = -EINVAL;
   1450out_err:
   1451	ClearPageUptodate(page);
   1452out_put_err:
    1453	/* -ENOENT from read_node_page is not an error, so skip EIO handling. */
   1454	if (err != -ENOENT)
   1455		f2fs_handle_page_eio(sbi, page->index, NODE);
   1456	f2fs_put_page(page, 1);
   1457	return ERR_PTR(err);
   1458}
   1459
   1460struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
   1461{
   1462	return __get_node_page(sbi, nid, NULL, 0);
   1463}
   1464
   1465struct page *f2fs_get_node_page_ra(struct page *parent, int start)
   1466{
   1467	struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
   1468	nid_t nid = get_nid(parent, start, false);
   1469
   1470	return __get_node_page(sbi, nid, parent, start);
   1471}
   1472
   1473static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
   1474{
   1475	struct inode *inode;
   1476	struct page *page;
   1477	int ret;
   1478
   1479	/* should flush inline_data before evict_inode */
   1480	inode = ilookup(sbi->sb, ino);
   1481	if (!inode)
   1482		return;
   1483
   1484	page = f2fs_pagecache_get_page(inode->i_mapping, 0,
   1485					FGP_LOCK|FGP_NOWAIT, 0);
   1486	if (!page)
   1487		goto iput_out;
   1488
   1489	if (!PageUptodate(page))
   1490		goto page_out;
   1491
   1492	if (!PageDirty(page))
   1493		goto page_out;
   1494
   1495	if (!clear_page_dirty_for_io(page))
   1496		goto page_out;
   1497
   1498	ret = f2fs_write_inline_data(inode, page);
   1499	inode_dec_dirty_pages(inode);
   1500	f2fs_remove_dirty_inode(inode);
   1501	if (ret)
   1502		set_page_dirty(page);
   1503page_out:
   1504	f2fs_put_page(page, 1);
   1505iput_out:
   1506	iput(inode);
   1507}
   1508
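        /*
         * Scan the dirty node pages belonging to the given inode and return a
         * referenced pointer to the last dirty dnode, which is where the fsync
         * mark will be written.
         */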
   1509static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
   1510{
   1511	pgoff_t index;
   1512	struct pagevec pvec;
   1513	struct page *last_page = NULL;
   1514	int nr_pages;
   1515
   1516	pagevec_init(&pvec);
   1517	index = 0;
   1518
   1519	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
   1520				PAGECACHE_TAG_DIRTY))) {
   1521		int i;
   1522
   1523		for (i = 0; i < nr_pages; i++) {
   1524			struct page *page = pvec.pages[i];
   1525
   1526			if (unlikely(f2fs_cp_error(sbi))) {
   1527				f2fs_put_page(last_page, 0);
   1528				pagevec_release(&pvec);
   1529				return ERR_PTR(-EIO);
   1530			}
   1531
   1532			if (!IS_DNODE(page) || !is_cold_node(page))
   1533				continue;
   1534			if (ino_of_node(page) != ino)
   1535				continue;
   1536
   1537			lock_page(page);
   1538
   1539			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
   1540continue_unlock:
   1541				unlock_page(page);
   1542				continue;
   1543			}
   1544			if (ino_of_node(page) != ino)
   1545				goto continue_unlock;
   1546
   1547			if (!PageDirty(page)) {
   1548				/* someone wrote it for us */
   1549				goto continue_unlock;
   1550			}
   1551
   1552			if (last_page)
   1553				f2fs_put_page(last_page, 0);
   1554
   1555			get_page(page);
   1556			last_page = page;
   1557			unlock_page(page);
   1558		}
   1559		pagevec_release(&pvec);
   1560		cond_resched();
   1561	}
   1562	return last_page;
   1563}
   1564
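        /*
         * Write back a single dirty node page: look up its old block address,
         * let f2fs_do_write_node_page() allocate and submit the new block, and
         * record the new address in the nat cache.
         */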
   1565static int __write_node_page(struct page *page, bool atomic, bool *submitted,
   1566				struct writeback_control *wbc, bool do_balance,
   1567				enum iostat_type io_type, unsigned int *seq_id)
   1568{
   1569	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
   1570	nid_t nid;
   1571	struct node_info ni;
   1572	struct f2fs_io_info fio = {
   1573		.sbi = sbi,
   1574		.ino = ino_of_node(page),
   1575		.type = NODE,
   1576		.op = REQ_OP_WRITE,
   1577		.op_flags = wbc_to_write_flags(wbc),
   1578		.page = page,
   1579		.encrypted_page = NULL,
   1580		.submitted = false,
   1581		.io_type = io_type,
   1582		.io_wbc = wbc,
   1583	};
   1584	unsigned int seq;
   1585
   1586	trace_f2fs_writepage(page, NODE);
   1587
   1588	if (unlikely(f2fs_cp_error(sbi))) {
   1589		ClearPageUptodate(page);
   1590		dec_page_count(sbi, F2FS_DIRTY_NODES);
   1591		unlock_page(page);
   1592		return 0;
   1593	}
   1594
   1595	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
   1596		goto redirty_out;
   1597
   1598	if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
   1599			wbc->sync_mode == WB_SYNC_NONE &&
   1600			IS_DNODE(page) && is_cold_node(page))
   1601		goto redirty_out;
   1602
   1603	/* get old block addr of this node page */
   1604	nid = nid_of_node(page);
   1605	f2fs_bug_on(sbi, page->index != nid);
   1606
   1607	if (f2fs_get_node_info(sbi, nid, &ni, !do_balance))
   1608		goto redirty_out;
   1609
   1610	if (wbc->for_reclaim) {
   1611		if (!f2fs_down_read_trylock(&sbi->node_write))
   1612			goto redirty_out;
   1613	} else {
   1614		f2fs_down_read(&sbi->node_write);
   1615	}
   1616
   1617	/* This page is already truncated */
   1618	if (unlikely(ni.blk_addr == NULL_ADDR)) {
   1619		ClearPageUptodate(page);
   1620		dec_page_count(sbi, F2FS_DIRTY_NODES);
   1621		f2fs_up_read(&sbi->node_write);
   1622		unlock_page(page);
   1623		return 0;
   1624	}
   1625
   1626	if (__is_valid_data_blkaddr(ni.blk_addr) &&
   1627		!f2fs_is_valid_blkaddr(sbi, ni.blk_addr,
   1628					DATA_GENERIC_ENHANCE)) {
   1629		f2fs_up_read(&sbi->node_write);
   1630		goto redirty_out;
   1631	}
   1632
   1633	if (atomic && !test_opt(sbi, NOBARRIER) && !f2fs_sb_has_blkzoned(sbi))
   1634		fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
   1635
   1636	/* should add to global list before clearing PAGECACHE status */
   1637	if (f2fs_in_warm_node_list(sbi, page)) {
   1638		seq = f2fs_add_fsync_node_entry(sbi, page);
   1639		if (seq_id)
   1640			*seq_id = seq;
   1641	}
   1642
   1643	set_page_writeback(page);
   1644	ClearPageError(page);
   1645
   1646	fio.old_blkaddr = ni.blk_addr;
   1647	f2fs_do_write_node_page(nid, &fio);
   1648	set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
   1649	dec_page_count(sbi, F2FS_DIRTY_NODES);
   1650	f2fs_up_read(&sbi->node_write);
   1651
   1652	if (wbc->for_reclaim) {
   1653		f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
   1654		submitted = NULL;
   1655	}
   1656
   1657	unlock_page(page);
   1658
   1659	if (unlikely(f2fs_cp_error(sbi))) {
   1660		f2fs_submit_merged_write(sbi, NODE);
   1661		submitted = NULL;
   1662	}
   1663	if (submitted)
   1664		*submitted = fio.submitted;
   1665
   1666	if (do_balance)
   1667		f2fs_balance_fs(sbi, false);
   1668	return 0;
   1669
   1670redirty_out:
   1671	redirty_page_for_writepage(wbc, page);
   1672	return AOP_WRITEPAGE_ACTIVATE;
   1673}
   1674
   1675int f2fs_move_node_page(struct page *node_page, int gc_type)
   1676{
   1677	int err = 0;
   1678
   1679	if (gc_type == FG_GC) {
   1680		struct writeback_control wbc = {
   1681			.sync_mode = WB_SYNC_ALL,
   1682			.nr_to_write = 1,
   1683			.for_reclaim = 0,
   1684		};
   1685
   1686		f2fs_wait_on_page_writeback(node_page, NODE, true, true);
   1687
   1688		set_page_dirty(node_page);
   1689
   1690		if (!clear_page_dirty_for_io(node_page)) {
   1691			err = -EAGAIN;
   1692			goto out_page;
   1693		}
   1694
   1695		if (__write_node_page(node_page, false, NULL,
   1696					&wbc, false, FS_GC_NODE_IO, NULL)) {
   1697			err = -EAGAIN;
   1698			unlock_page(node_page);
   1699		}
   1700		goto release_page;
   1701	} else {
   1702		/* set page dirty and write it */
   1703		if (!PageWriteback(node_page))
   1704			set_page_dirty(node_page);
   1705	}
   1706out_page:
   1707	unlock_page(node_page);
   1708release_page:
   1709	f2fs_put_page(node_page, 0);
   1710	return err;
   1711}
   1712
   1713static int f2fs_write_node_page(struct page *page,
   1714				struct writeback_control *wbc)
   1715{
   1716	return __write_node_page(page, false, NULL, wbc, false,
   1717						FS_NODE_IO, NULL);
   1718}
   1719
   1720int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
   1721			struct writeback_control *wbc, bool atomic,
   1722			unsigned int *seq_id)
   1723{
   1724	pgoff_t index;
   1725	struct pagevec pvec;
   1726	int ret = 0;
   1727	struct page *last_page = NULL;
   1728	bool marked = false;
   1729	nid_t ino = inode->i_ino;
   1730	int nr_pages;
   1731	int nwritten = 0;
   1732
   1733	if (atomic) {
   1734		last_page = last_fsync_dnode(sbi, ino);
   1735		if (IS_ERR_OR_NULL(last_page))
   1736			return PTR_ERR_OR_ZERO(last_page);
   1737	}
   1738retry:
   1739	pagevec_init(&pvec);
   1740	index = 0;
   1741
   1742	while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
   1743				PAGECACHE_TAG_DIRTY))) {
   1744		int i;
   1745
   1746		for (i = 0; i < nr_pages; i++) {
   1747			struct page *page = pvec.pages[i];
   1748			bool submitted = false;
   1749
   1750			if (unlikely(f2fs_cp_error(sbi))) {
   1751				f2fs_put_page(last_page, 0);
   1752				pagevec_release(&pvec);
   1753				ret = -EIO;
   1754				goto out;
   1755			}
   1756
   1757			if (!IS_DNODE(page) || !is_cold_node(page))
   1758				continue;
   1759			if (ino_of_node(page) != ino)
   1760				continue;
   1761
   1762			lock_page(page);
   1763
   1764			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
   1765continue_unlock:
   1766				unlock_page(page);
   1767				continue;
   1768			}
   1769			if (ino_of_node(page) != ino)
   1770				goto continue_unlock;
   1771
   1772			if (!PageDirty(page) && page != last_page) {
   1773				/* someone wrote it for us */
   1774				goto continue_unlock;
   1775			}
   1776
   1777			f2fs_wait_on_page_writeback(page, NODE, true, true);
   1778
   1779			set_fsync_mark(page, 0);
   1780			set_dentry_mark(page, 0);
   1781
   1782			if (!atomic || page == last_page) {
   1783				set_fsync_mark(page, 1);
   1784				percpu_counter_inc(&sbi->rf_node_block_count);
   1785				if (IS_INODE(page)) {
   1786					if (is_inode_flag_set(inode,
   1787								FI_DIRTY_INODE))
   1788						f2fs_update_inode(inode, page);
   1789					set_dentry_mark(page,
   1790						f2fs_need_dentry_mark(sbi, ino));
   1791				}
   1792				/* may be written by other thread */
   1793				if (!PageDirty(page))
   1794					set_page_dirty(page);
   1795			}
   1796
   1797			if (!clear_page_dirty_for_io(page))
   1798				goto continue_unlock;
   1799
   1800			ret = __write_node_page(page, atomic &&
   1801						page == last_page,
   1802						&submitted, wbc, true,
   1803						FS_NODE_IO, seq_id);
   1804			if (ret) {
   1805				unlock_page(page);
   1806				f2fs_put_page(last_page, 0);
   1807				break;
   1808			} else if (submitted) {
   1809				nwritten++;
   1810			}
   1811
   1812			if (page == last_page) {
   1813				f2fs_put_page(page, 0);
   1814				marked = true;
   1815				break;
   1816			}
   1817		}
   1818		pagevec_release(&pvec);
   1819		cond_resched();
   1820
   1821		if (ret || marked)
   1822			break;
   1823	}
   1824	if (!ret && atomic && !marked) {
   1825		f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx",
   1826			   ino, last_page->index);
   1827		lock_page(last_page);
   1828		f2fs_wait_on_page_writeback(last_page, NODE, true, true);
   1829		set_page_dirty(last_page);
   1830		unlock_page(last_page);
   1831		goto retry;
   1832	}
   1833out:
   1834	if (nwritten)
   1835		f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE);
   1836	return ret ? -EIO : 0;
   1837}
   1838
   1839static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data)
   1840{
   1841	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
   1842	bool clean;
   1843
   1844	if (inode->i_ino != ino)
   1845		return 0;
   1846
   1847	if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
   1848		return 0;
   1849
   1850	spin_lock(&sbi->inode_lock[DIRTY_META]);
   1851	clean = list_empty(&F2FS_I(inode)->gdirty_list);
   1852	spin_unlock(&sbi->inode_lock[DIRTY_META]);
   1853
   1854	if (clean)
   1855		return 0;
   1856
   1857	inode = igrab(inode);
   1858	if (!inode)
   1859		return 0;
   1860	return 1;
   1861}
   1862
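        /*
         * If the inode that owns @page is still dirty in memory, copy its
         * in-memory state into the node page, unlock the page and return
         * true; return false if no such dirty inode is found.
         */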
   1863static bool flush_dirty_inode(struct page *page)
   1864{
   1865	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
   1866	struct inode *inode;
   1867	nid_t ino = ino_of_node(page);
   1868
   1869	inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL);
   1870	if (!inode)
   1871		return false;
   1872
   1873	f2fs_update_inode(inode, page);
   1874	unlock_page(page);
   1875
   1876	iput(inode);
   1877	return true;
   1878}
   1879
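        /*
         * Scan the dirty node pages and flush the inline data of every inode
         * whose dnode page still carries the inline flag.
         */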
   1880void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
   1881{
   1882	pgoff_t index = 0;
   1883	struct pagevec pvec;
   1884	int nr_pages;
   1885
   1886	pagevec_init(&pvec);
   1887
   1888	while ((nr_pages = pagevec_lookup_tag(&pvec,
   1889			NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) {
   1890		int i;
   1891
   1892		for (i = 0; i < nr_pages; i++) {
   1893			struct page *page = pvec.pages[i];
   1894
   1895			if (!IS_DNODE(page))
   1896				continue;
   1897
   1898			lock_page(page);
   1899
   1900			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
   1901continue_unlock:
   1902				unlock_page(page);
   1903				continue;
   1904			}
   1905
   1906			if (!PageDirty(page)) {
   1907				/* someone wrote it for us */
   1908				goto continue_unlock;
   1909			}
   1910
    1911			/* flush inline_data, if this is an async context. */
   1912			if (page_private_inline(page)) {
   1913				clear_page_private_inline(page);
   1914				unlock_page(page);
   1915				flush_inline_data(sbi, ino_of_node(page));
   1916				continue;
   1917			}
   1918			unlock_page(page);
   1919		}
   1920		pagevec_release(&pvec);
   1921		cond_resched();
   1922	}
   1923}
   1924
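        /*
         * Flush dirty node pages in three steps: indirect nodes first, then
         * dentry dnodes, then file dnodes.  Background (WB_SYNC_NONE)
         * writeback backs off while a WB_SYNC_ALL writer is in flight.
         */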
   1925int f2fs_sync_node_pages(struct f2fs_sb_info *sbi,
   1926				struct writeback_control *wbc,
   1927				bool do_balance, enum iostat_type io_type)
   1928{
   1929	pgoff_t index;
   1930	struct pagevec pvec;
   1931	int step = 0;
   1932	int nwritten = 0;
   1933	int ret = 0;
   1934	int nr_pages, done = 0;
   1935
   1936	pagevec_init(&pvec);
   1937
   1938next_step:
   1939	index = 0;
   1940
   1941	while (!done && (nr_pages = pagevec_lookup_tag(&pvec,
   1942			NODE_MAPPING(sbi), &index, PAGECACHE_TAG_DIRTY))) {
   1943		int i;
   1944
   1945		for (i = 0; i < nr_pages; i++) {
   1946			struct page *page = pvec.pages[i];
   1947			bool submitted = false;
   1948			bool may_dirty = true;
   1949
    1950			/* give priority to WB_SYNC threads */
   1951			if (atomic_read(&sbi->wb_sync_req[NODE]) &&
   1952					wbc->sync_mode == WB_SYNC_NONE) {
   1953				done = 1;
   1954				break;
   1955			}
   1956
   1957			/*
   1958			 * flushing sequence with step:
   1959			 * 0. indirect nodes
   1960			 * 1. dentry dnodes
   1961			 * 2. file dnodes
   1962			 */
   1963			if (step == 0 && IS_DNODE(page))
   1964				continue;
   1965			if (step == 1 && (!IS_DNODE(page) ||
   1966						is_cold_node(page)))
   1967				continue;
   1968			if (step == 2 && (!IS_DNODE(page) ||
   1969						!is_cold_node(page)))
   1970				continue;
   1971lock_node:
   1972			if (wbc->sync_mode == WB_SYNC_ALL)
   1973				lock_page(page);
   1974			else if (!trylock_page(page))
   1975				continue;
   1976
   1977			if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
   1978continue_unlock:
   1979				unlock_page(page);
   1980				continue;
   1981			}
   1982
   1983			if (!PageDirty(page)) {
   1984				/* someone wrote it for us */
   1985				goto continue_unlock;
   1986			}
   1987
    1988			/* flush inline_data/inode, if this is an async context. */
   1989			if (!do_balance)
   1990				goto write_node;
   1991
   1992			/* flush inline_data */
   1993			if (page_private_inline(page)) {
   1994				clear_page_private_inline(page);
   1995				unlock_page(page);
   1996				flush_inline_data(sbi, ino_of_node(page));
   1997				goto lock_node;
   1998			}
   1999
   2000			/* flush dirty inode */
   2001			if (IS_INODE(page) && may_dirty) {
   2002				may_dirty = false;
   2003				if (flush_dirty_inode(page))
   2004					goto lock_node;
   2005			}
   2006write_node:
   2007			f2fs_wait_on_page_writeback(page, NODE, true, true);
   2008
   2009			if (!clear_page_dirty_for_io(page))
   2010				goto continue_unlock;
   2011
   2012			set_fsync_mark(page, 0);
   2013			set_dentry_mark(page, 0);
   2014
   2015			ret = __write_node_page(page, false, &submitted,
   2016						wbc, do_balance, io_type, NULL);
   2017			if (ret)
   2018				unlock_page(page);
   2019			else if (submitted)
   2020				nwritten++;
   2021
   2022			if (--wbc->nr_to_write == 0)
   2023				break;
   2024		}
   2025		pagevec_release(&pvec);
   2026		cond_resched();
   2027
   2028		if (wbc->nr_to_write == 0) {
   2029			step = 2;
   2030			break;
   2031		}
   2032	}
   2033
   2034	if (step < 2) {
   2035		if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
   2036				wbc->sync_mode == WB_SYNC_NONE && step == 1)
   2037			goto out;
   2038		step++;
   2039		goto next_step;
   2040	}
   2041out:
   2042	if (nwritten)
   2043		f2fs_submit_merged_write(sbi, NODE);
   2044
   2045	if (unlikely(f2fs_cp_error(sbi)))
   2046		return -EIO;
   2047	return ret;
   2048}
   2049
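        /*
         * Wait for writeback of all fsync node pages whose sequence id does
         * not exceed @seq_id, and report any writeback error seen on the
         * node mapping.
         */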
   2050int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
   2051						unsigned int seq_id)
   2052{
   2053	struct fsync_node_entry *fn;
   2054	struct page *page;
   2055	struct list_head *head = &sbi->fsync_node_list;
   2056	unsigned long flags;
   2057	unsigned int cur_seq_id = 0;
   2058	int ret2, ret = 0;
   2059
   2060	while (seq_id && cur_seq_id < seq_id) {
   2061		spin_lock_irqsave(&sbi->fsync_node_lock, flags);
   2062		if (list_empty(head)) {
   2063			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
   2064			break;
   2065		}
   2066		fn = list_first_entry(head, struct fsync_node_entry, list);
   2067		if (fn->seq_id > seq_id) {
   2068			spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
   2069			break;
   2070		}
   2071		cur_seq_id = fn->seq_id;
   2072		page = fn->page;
   2073		get_page(page);
   2074		spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
   2075
   2076		f2fs_wait_on_page_writeback(page, NODE, true, false);
   2077		if (TestClearPageError(page))
   2078			ret = -EIO;
   2079
   2080		put_page(page);
   2081
   2082		if (ret)
   2083			break;
   2084	}
   2085
   2086	ret2 = filemap_check_errors(NODE_MAPPING(sbi));
   2087	if (!ret)
   2088		ret = ret2;
   2089
   2090	return ret;
   2091}
   2092
   2093static int f2fs_write_node_pages(struct address_space *mapping,
   2094			    struct writeback_control *wbc)
   2095{
   2096	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
   2097	struct blk_plug plug;
   2098	long diff;
   2099
   2100	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
   2101		goto skip_write;
   2102
   2103	/* balancing f2fs's metadata in background */
   2104	f2fs_balance_fs_bg(sbi, true);
   2105
    2106	/* collect a number of dirty node pages and write them together */
   2107	if (wbc->sync_mode != WB_SYNC_ALL &&
   2108			get_pages(sbi, F2FS_DIRTY_NODES) <
   2109					nr_pages_to_skip(sbi, NODE))
   2110		goto skip_write;
   2111
   2112	if (wbc->sync_mode == WB_SYNC_ALL)
   2113		atomic_inc(&sbi->wb_sync_req[NODE]);
   2114	else if (atomic_read(&sbi->wb_sync_req[NODE])) {
   2115		/* to avoid potential deadlock */
   2116		if (current->plug)
   2117			blk_finish_plug(current->plug);
   2118		goto skip_write;
   2119	}
   2120
   2121	trace_f2fs_writepages(mapping->host, wbc, NODE);
   2122
   2123	diff = nr_pages_to_write(sbi, NODE, wbc);
   2124	blk_start_plug(&plug);
   2125	f2fs_sync_node_pages(sbi, wbc, true, FS_NODE_IO);
   2126	blk_finish_plug(&plug);
   2127	wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
   2128
   2129	if (wbc->sync_mode == WB_SYNC_ALL)
   2130		atomic_dec(&sbi->wb_sync_req[NODE]);
   2131	return 0;
   2132
   2133skip_write:
   2134	wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
   2135	trace_f2fs_writepages(mapping->host, wbc, NODE);
   2136	return 0;
   2137}
   2138
   2139static bool f2fs_dirty_node_folio(struct address_space *mapping,
   2140		struct folio *folio)
   2141{
   2142	trace_f2fs_set_page_dirty(&folio->page, NODE);
   2143
   2144	if (!folio_test_uptodate(folio))
   2145		folio_mark_uptodate(folio);
   2146#ifdef CONFIG_F2FS_CHECK_FS
   2147	if (IS_INODE(&folio->page))
   2148		f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page);
   2149#endif
   2150	if (!folio_test_dirty(folio)) {
   2151		filemap_dirty_folio(mapping, folio);
   2152		inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
   2153		set_page_private_reference(&folio->page);
   2154		return true;
   2155	}
   2156	return false;
   2157}
   2158
   2159/*
   2160 * Structure of the f2fs node operations
   2161 */
   2162const struct address_space_operations f2fs_node_aops = {
   2163	.writepage	= f2fs_write_node_page,
   2164	.writepages	= f2fs_write_node_pages,
   2165	.dirty_folio	= f2fs_dirty_node_folio,
   2166	.invalidate_folio = f2fs_invalidate_folio,
   2167	.release_folio	= f2fs_release_folio,
   2168#ifdef CONFIG_MIGRATION
   2169	.migratepage	= f2fs_migrate_page,
   2170#endif
   2171};
   2172
   2173static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
   2174						nid_t n)
   2175{
   2176	return radix_tree_lookup(&nm_i->free_nid_root, n);
   2177}
   2178
   2179static int __insert_free_nid(struct f2fs_sb_info *sbi,
   2180				struct free_nid *i)
   2181{
   2182	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2183	int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
   2184
   2185	if (err)
   2186		return err;
   2187
   2188	nm_i->nid_cnt[FREE_NID]++;
   2189	list_add_tail(&i->list, &nm_i->free_nid_list);
   2190	return 0;
   2191}
   2192
   2193static void __remove_free_nid(struct f2fs_sb_info *sbi,
   2194			struct free_nid *i, enum nid_state state)
   2195{
   2196	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2197
   2198	f2fs_bug_on(sbi, state != i->state);
   2199	nm_i->nid_cnt[state]--;
   2200	if (state == FREE_NID)
   2201		list_del(&i->list);
   2202	radix_tree_delete(&nm_i->free_nid_root, i->nid);
   2203}
   2204
   2205static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
   2206			enum nid_state org_state, enum nid_state dst_state)
   2207{
   2208	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2209
   2210	f2fs_bug_on(sbi, org_state != i->state);
   2211	i->state = dst_state;
   2212	nm_i->nid_cnt[org_state]--;
   2213	nm_i->nid_cnt[dst_state]++;
   2214
   2215	switch (dst_state) {
   2216	case PREALLOC_NID:
   2217		list_del(&i->list);
   2218		break;
   2219	case FREE_NID:
   2220		list_add_tail(&i->list, &nm_i->free_nid_list);
   2221		break;
   2222	default:
   2223		BUG_ON(1);
   2224	}
   2225}
   2226
   2227bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
   2228{
   2229	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2230	unsigned int i;
   2231	bool ret = true;
   2232
   2233	f2fs_down_read(&nm_i->nat_tree_lock);
   2234	for (i = 0; i < nm_i->nat_blocks; i++) {
   2235		if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
   2236			ret = false;
   2237			break;
   2238		}
   2239	}
   2240	f2fs_up_read(&nm_i->nat_tree_lock);
   2241
   2242	return ret;
   2243}
   2244
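        /*
         * Set or clear @nid in the per-NAT-block free nid bitmap and adjust
         * free_nid_count accordingly.  Does nothing if the NAT block has not
         * been scanned yet.  Callers hold nid_list_lock.
         */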
   2245static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
   2246							bool set, bool build)
   2247{
   2248	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2249	unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
   2250	unsigned int nid_ofs = nid - START_NID(nid);
   2251
   2252	if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
   2253		return;
   2254
   2255	if (set) {
   2256		if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
   2257			return;
   2258		__set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
   2259		nm_i->free_nid_count[nat_ofs]++;
   2260	} else {
   2261		if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
   2262			return;
   2263		__clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
   2264		if (!build)
   2265			nm_i->free_nid_count[nat_ofs]--;
   2266	}
   2267}
   2268
    2269	/* return whether the nid is recognized as free */
   2270static bool add_free_nid(struct f2fs_sb_info *sbi,
   2271				nid_t nid, bool build, bool update)
   2272{
   2273	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2274	struct free_nid *i, *e;
   2275	struct nat_entry *ne;
   2276	int err = -EINVAL;
   2277	bool ret = false;
   2278
   2279	/* 0 nid should not be used */
   2280	if (unlikely(nid == 0))
   2281		return false;
   2282
   2283	if (unlikely(f2fs_check_nid_range(sbi, nid)))
   2284		return false;
   2285
   2286	i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL);
   2287	i->nid = nid;
   2288	i->state = FREE_NID;
   2289
   2290	radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
   2291
   2292	spin_lock(&nm_i->nid_list_lock);
   2293
   2294	if (build) {
   2295		/*
   2296		 *   Thread A             Thread B
   2297		 *  - f2fs_create
   2298		 *   - f2fs_new_inode
   2299		 *    - f2fs_alloc_nid
   2300		 *     - __insert_nid_to_list(PREALLOC_NID)
   2301		 *                     - f2fs_balance_fs_bg
   2302		 *                      - f2fs_build_free_nids
   2303		 *                       - __f2fs_build_free_nids
   2304		 *                        - scan_nat_page
   2305		 *                         - add_free_nid
   2306		 *                          - __lookup_nat_cache
   2307		 *  - f2fs_add_link
   2308		 *   - f2fs_init_inode_metadata
   2309		 *    - f2fs_new_inode_page
   2310		 *     - f2fs_new_node_page
   2311		 *      - set_node_addr
   2312		 *  - f2fs_alloc_nid_done
   2313		 *   - __remove_nid_from_list(PREALLOC_NID)
   2314		 *                         - __insert_nid_to_list(FREE_NID)
   2315		 */
   2316		ne = __lookup_nat_cache(nm_i, nid);
   2317		if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
   2318				nat_get_blkaddr(ne) != NULL_ADDR))
   2319			goto err_out;
   2320
   2321		e = __lookup_free_nid_list(nm_i, nid);
   2322		if (e) {
   2323			if (e->state == FREE_NID)
   2324				ret = true;
   2325			goto err_out;
   2326		}
   2327	}
   2328	ret = true;
   2329	err = __insert_free_nid(sbi, i);
   2330err_out:
   2331	if (update) {
   2332		update_free_nid_bitmap(sbi, nid, ret, build);
   2333		if (!build)
   2334			nm_i->available_nids++;
   2335	}
   2336	spin_unlock(&nm_i->nid_list_lock);
   2337	radix_tree_preload_end();
   2338
   2339	if (err)
   2340		kmem_cache_free(free_nid_slab, i);
   2341	return ret;
   2342}
   2343
   2344static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
   2345{
   2346	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2347	struct free_nid *i;
   2348	bool need_free = false;
   2349
   2350	spin_lock(&nm_i->nid_list_lock);
   2351	i = __lookup_free_nid_list(nm_i, nid);
   2352	if (i && i->state == FREE_NID) {
   2353		__remove_free_nid(sbi, i, FREE_NID);
   2354		need_free = true;
   2355	}
   2356	spin_unlock(&nm_i->nid_list_lock);
   2357
   2358	if (need_free)
   2359		kmem_cache_free(free_nid_slab, i);
   2360}
   2361
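        /*
         * Scan one NAT block page starting at @start_nid and add every nid
         * whose block address is NULL_ADDR to the free nid list.  A NEW_ADDR
         * entry in the on-disk NAT indicates corruption and fails the scan.
         */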
   2362static int scan_nat_page(struct f2fs_sb_info *sbi,
   2363			struct page *nat_page, nid_t start_nid)
   2364{
   2365	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2366	struct f2fs_nat_block *nat_blk = page_address(nat_page);
   2367	block_t blk_addr;
   2368	unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid);
   2369	int i;
   2370
   2371	__set_bit_le(nat_ofs, nm_i->nat_block_bitmap);
   2372
   2373	i = start_nid % NAT_ENTRY_PER_BLOCK;
   2374
   2375	for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
   2376		if (unlikely(start_nid >= nm_i->max_nid))
   2377			break;
   2378
   2379		blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
   2380
   2381		if (blk_addr == NEW_ADDR)
   2382			return -EINVAL;
   2383
   2384		if (blk_addr == NULL_ADDR) {
   2385			add_free_nid(sbi, start_nid, true, true);
   2386		} else {
   2387			spin_lock(&NM_I(sbi)->nid_list_lock);
   2388			update_free_nid_bitmap(sbi, start_nid, false, true);
   2389			spin_unlock(&NM_I(sbi)->nid_list_lock);
   2390		}
   2391	}
   2392
   2393	return 0;
   2394}
   2395
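        /*
         * Pick up free nids from the NAT journal of the current hot data
         * segment: entries with NULL_ADDR become free nids, the rest are
         * removed from the free nid list.
         */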
   2396static void scan_curseg_cache(struct f2fs_sb_info *sbi)
   2397{
   2398	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
   2399	struct f2fs_journal *journal = curseg->journal;
   2400	int i;
   2401
   2402	down_read(&curseg->journal_rwsem);
   2403	for (i = 0; i < nats_in_cursum(journal); i++) {
   2404		block_t addr;
   2405		nid_t nid;
   2406
   2407		addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
   2408		nid = le32_to_cpu(nid_in_journal(journal, i));
   2409		if (addr == NULL_ADDR)
   2410			add_free_nid(sbi, nid, true, false);
   2411		else
   2412			remove_free_nid(sbi, nid);
   2413	}
   2414	up_read(&curseg->journal_rwsem);
   2415}
   2416
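        /*
         * Collect free nids from the free nid bitmaps of already scanned NAT
         * blocks (up to MAX_FREE_NIDS), then from the current NAT journal.
         */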
   2417static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
   2418{
   2419	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2420	unsigned int i, idx;
   2421	nid_t nid;
   2422
   2423	f2fs_down_read(&nm_i->nat_tree_lock);
   2424
   2425	for (i = 0; i < nm_i->nat_blocks; i++) {
   2426		if (!test_bit_le(i, nm_i->nat_block_bitmap))
   2427			continue;
   2428		if (!nm_i->free_nid_count[i])
   2429			continue;
   2430		for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) {
   2431			idx = find_next_bit_le(nm_i->free_nid_bitmap[i],
   2432						NAT_ENTRY_PER_BLOCK, idx);
   2433			if (idx >= NAT_ENTRY_PER_BLOCK)
   2434				break;
   2435
   2436			nid = i * NAT_ENTRY_PER_BLOCK + idx;
   2437			add_free_nid(sbi, nid, true, false);
   2438
   2439			if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS)
   2440				goto out;
   2441		}
   2442	}
   2443out:
   2444	scan_curseg_cache(sbi);
   2445
   2446	f2fs_up_read(&nm_i->nat_tree_lock);
   2447}
   2448
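        /*
         * Refill the free nid cache: first from the free nid bitmaps (except
         * at mount time), then by scanning up to FREE_NID_PAGES NAT pages
         * starting at next_scan_nid, and finally from the NAT journal.
         */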
   2449static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
   2450						bool sync, bool mount)
   2451{
   2452	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2453	int i = 0, ret;
   2454	nid_t nid = nm_i->next_scan_nid;
   2455
   2456	if (unlikely(nid >= nm_i->max_nid))
   2457		nid = 0;
   2458
   2459	if (unlikely(nid % NAT_ENTRY_PER_BLOCK))
   2460		nid = NAT_BLOCK_OFFSET(nid) * NAT_ENTRY_PER_BLOCK;
   2461
   2462	/* Enough entries */
   2463	if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
   2464		return 0;
   2465
   2466	if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS))
   2467		return 0;
   2468
   2469	if (!mount) {
   2470		/* try to find free nids in free_nid_bitmap */
   2471		scan_free_nid_bits(sbi);
   2472
   2473		if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
   2474			return 0;
   2475	}
   2476
   2477	/* readahead nat pages to be scanned */
   2478	f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
   2479							META_NAT, true);
   2480
   2481	f2fs_down_read(&nm_i->nat_tree_lock);
   2482
   2483	while (1) {
   2484		if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
   2485						nm_i->nat_block_bitmap)) {
   2486			struct page *page = get_current_nat_page(sbi, nid);
   2487
   2488			if (IS_ERR(page)) {
   2489				ret = PTR_ERR(page);
   2490			} else {
   2491				ret = scan_nat_page(sbi, page, nid);
   2492				f2fs_put_page(page, 1);
   2493			}
   2494
   2495			if (ret) {
   2496				f2fs_up_read(&nm_i->nat_tree_lock);
   2497				f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
   2498				return ret;
   2499			}
   2500		}
   2501
   2502		nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
   2503		if (unlikely(nid >= nm_i->max_nid))
   2504			nid = 0;
   2505
   2506		if (++i >= FREE_NID_PAGES)
   2507			break;
   2508	}
   2509
    2510	/* go to the next free nat pages to find more free nids */
   2511	nm_i->next_scan_nid = nid;
   2512
   2513	/* find free nids from current sum_pages */
   2514	scan_curseg_cache(sbi);
   2515
   2516	f2fs_up_read(&nm_i->nat_tree_lock);
   2517
   2518	f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
   2519					nm_i->ra_nid_pages, META_NAT, false);
   2520
   2521	return 0;
   2522}
   2523
   2524int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
   2525{
   2526	int ret;
   2527
   2528	mutex_lock(&NM_I(sbi)->build_lock);
   2529	ret = __f2fs_build_free_nids(sbi, sync, mount);
   2530	mutex_unlock(&NM_I(sbi)->build_lock);
   2531
   2532	return ret;
   2533}
   2534
    2535	/*
    2536	 * If this function returns success, the caller can obtain a new nid
    2537	 * from the second parameter of this function.
    2538	 * The returned nid can be used as an ino as well as a nid when an inode is created.
    2539	 */
   2540bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
   2541{
   2542	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2543	struct free_nid *i = NULL;
   2544retry:
   2545	if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
   2546		f2fs_show_injection_info(sbi, FAULT_ALLOC_NID);
   2547		return false;
   2548	}
   2549
   2550	spin_lock(&nm_i->nid_list_lock);
   2551
   2552	if (unlikely(nm_i->available_nids == 0)) {
   2553		spin_unlock(&nm_i->nid_list_lock);
   2554		return false;
   2555	}
   2556
   2557	/* We should not use stale free nids created by f2fs_build_free_nids */
   2558	if (nm_i->nid_cnt[FREE_NID] && !on_f2fs_build_free_nids(nm_i)) {
   2559		f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
   2560		i = list_first_entry(&nm_i->free_nid_list,
   2561					struct free_nid, list);
   2562		*nid = i->nid;
   2563
   2564		__move_free_nid(sbi, i, FREE_NID, PREALLOC_NID);
   2565		nm_i->available_nids--;
   2566
   2567		update_free_nid_bitmap(sbi, *nid, false, false);
   2568
   2569		spin_unlock(&nm_i->nid_list_lock);
   2570		return true;
   2571	}
   2572	spin_unlock(&nm_i->nid_list_lock);
   2573
    2574	/* Let's scan nat pages and their caches to get free nids */
   2575	if (!f2fs_build_free_nids(sbi, true, false))
   2576		goto retry;
   2577	return false;
   2578}
   2579
   2580/*
   2581 * f2fs_alloc_nid() should be called prior to this function.
   2582 */
   2583void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
   2584{
   2585	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2586	struct free_nid *i;
   2587
   2588	spin_lock(&nm_i->nid_list_lock);
   2589	i = __lookup_free_nid_list(nm_i, nid);
   2590	f2fs_bug_on(sbi, !i);
   2591	__remove_free_nid(sbi, i, PREALLOC_NID);
   2592	spin_unlock(&nm_i->nid_list_lock);
   2593
   2594	kmem_cache_free(free_nid_slab, i);
   2595}
   2596
   2597/*
   2598 * f2fs_alloc_nid() should be called prior to this function.
   2599 */
   2600void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
   2601{
   2602	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2603	struct free_nid *i;
   2604	bool need_free = false;
   2605
   2606	if (!nid)
   2607		return;
   2608
   2609	spin_lock(&nm_i->nid_list_lock);
   2610	i = __lookup_free_nid_list(nm_i, nid);
   2611	f2fs_bug_on(sbi, !i);
   2612
   2613	if (!f2fs_available_free_memory(sbi, FREE_NIDS)) {
   2614		__remove_free_nid(sbi, i, PREALLOC_NID);
   2615		need_free = true;
   2616	} else {
   2617		__move_free_nid(sbi, i, PREALLOC_NID, FREE_NID);
   2618	}
   2619
   2620	nm_i->available_nids++;
   2621
   2622	update_free_nid_bitmap(sbi, nid, true, false);
   2623
   2624	spin_unlock(&nm_i->nid_list_lock);
   2625
   2626	if (need_free)
   2627		kmem_cache_free(free_nid_slab, i);
   2628}
   2629
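        /*
         * Shrink the cached free nid list back toward MAX_FREE_NIDS, removing
         * at most @nr_shrink entries in SHRINK_NID_BATCH_SIZE chunks.
         * Returns the number of nids actually freed.
         */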
   2630int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
   2631{
   2632	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2633	int nr = nr_shrink;
   2634
   2635	if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
   2636		return 0;
   2637
   2638	if (!mutex_trylock(&nm_i->build_lock))
   2639		return 0;
   2640
   2641	while (nr_shrink && nm_i->nid_cnt[FREE_NID] > MAX_FREE_NIDS) {
   2642		struct free_nid *i, *next;
   2643		unsigned int batch = SHRINK_NID_BATCH_SIZE;
   2644
   2645		spin_lock(&nm_i->nid_list_lock);
   2646		list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
   2647			if (!nr_shrink || !batch ||
   2648				nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
   2649				break;
   2650			__remove_free_nid(sbi, i, FREE_NID);
   2651			kmem_cache_free(free_nid_slab, i);
   2652			nr_shrink--;
   2653			batch--;
   2654		}
   2655		spin_unlock(&nm_i->nid_list_lock);
   2656	}
   2657
   2658	mutex_unlock(&nm_i->build_lock);
   2659
   2660	return nr - nr_shrink;
   2661}
   2662
   2663int f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
   2664{
   2665	void *src_addr, *dst_addr;
   2666	size_t inline_size;
   2667	struct page *ipage;
   2668	struct f2fs_inode *ri;
   2669
   2670	ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
   2671	if (IS_ERR(ipage))
   2672		return PTR_ERR(ipage);
   2673
   2674	ri = F2FS_INODE(page);
   2675	if (ri->i_inline & F2FS_INLINE_XATTR) {
   2676		if (!f2fs_has_inline_xattr(inode)) {
   2677			set_inode_flag(inode, FI_INLINE_XATTR);
   2678			stat_inc_inline_xattr(inode);
   2679		}
   2680	} else {
   2681		if (f2fs_has_inline_xattr(inode)) {
   2682			stat_dec_inline_xattr(inode);
   2683			clear_inode_flag(inode, FI_INLINE_XATTR);
   2684		}
   2685		goto update_inode;
   2686	}
   2687
   2688	dst_addr = inline_xattr_addr(inode, ipage);
   2689	src_addr = inline_xattr_addr(inode, page);
   2690	inline_size = inline_xattr_size(inode);
   2691
   2692	f2fs_wait_on_page_writeback(ipage, NODE, true, true);
   2693	memcpy(dst_addr, src_addr, inline_size);
   2694update_inode:
   2695	f2fs_update_inode(inode, ipage);
   2696	f2fs_put_page(ipage, 1);
   2697	return 0;
   2698}
   2699
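        /*
         * Recover the xattr block of @inode from the recovered node @page:
         * invalidate the old xattr node, allocate a new xattr nid and node
         * page, and copy the xattr contents over.
         */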
   2700int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
   2701{
   2702	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
   2703	nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
   2704	nid_t new_xnid;
   2705	struct dnode_of_data dn;
   2706	struct node_info ni;
   2707	struct page *xpage;
   2708	int err;
   2709
   2710	if (!prev_xnid)
   2711		goto recover_xnid;
   2712
   2713	/* 1: invalidate the previous xattr nid */
   2714	err = f2fs_get_node_info(sbi, prev_xnid, &ni, false);
   2715	if (err)
   2716		return err;
   2717
   2718	f2fs_invalidate_blocks(sbi, ni.blk_addr);
   2719	dec_valid_node_count(sbi, inode, false);
   2720	set_node_addr(sbi, &ni, NULL_ADDR, false);
   2721
   2722recover_xnid:
   2723	/* 2: update xattr nid in inode */
   2724	if (!f2fs_alloc_nid(sbi, &new_xnid))
   2725		return -ENOSPC;
   2726
   2727	set_new_dnode(&dn, inode, NULL, NULL, new_xnid);
   2728	xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET);
   2729	if (IS_ERR(xpage)) {
   2730		f2fs_alloc_nid_failed(sbi, new_xnid);
   2731		return PTR_ERR(xpage);
   2732	}
   2733
   2734	f2fs_alloc_nid_done(sbi, new_xnid);
   2735	f2fs_update_inode_page(inode);
   2736
   2737	/* 3: update and set xattr node page dirty */
   2738	memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE);
   2739
   2740	set_page_dirty(xpage);
   2741	f2fs_put_page(xpage, 1);
   2742
   2743	return 0;
   2744}
   2745
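        /*
         * Used during recovery: rebuild the cached inode page for the inode
         * described by @page, copying the base fields and supported extra
         * attributes from the on-disk node and marking the new page dirty.
         */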
   2746int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
   2747{
   2748	struct f2fs_inode *src, *dst;
   2749	nid_t ino = ino_of_node(page);
   2750	struct node_info old_ni, new_ni;
   2751	struct page *ipage;
   2752	int err;
   2753
   2754	err = f2fs_get_node_info(sbi, ino, &old_ni, false);
   2755	if (err)
   2756		return err;
   2757
   2758	if (unlikely(old_ni.blk_addr != NULL_ADDR))
   2759		return -EINVAL;
   2760retry:
   2761	ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
   2762	if (!ipage) {
   2763		memalloc_retry_wait(GFP_NOFS);
   2764		goto retry;
   2765	}
   2766
   2767	/* Should not use this inode from free nid list */
   2768	remove_free_nid(sbi, ino);
   2769
   2770	if (!PageUptodate(ipage))
   2771		SetPageUptodate(ipage);
   2772	fill_node_footer(ipage, ino, ino, 0, true);
   2773	set_cold_node(ipage, false);
   2774
   2775	src = F2FS_INODE(page);
   2776	dst = F2FS_INODE(ipage);
   2777
   2778	memcpy(dst, src, offsetof(struct f2fs_inode, i_ext));
   2779	dst->i_size = 0;
   2780	dst->i_blocks = cpu_to_le64(1);
   2781	dst->i_links = cpu_to_le32(1);
   2782	dst->i_xattr_nid = 0;
   2783	dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR);
   2784	if (dst->i_inline & F2FS_EXTRA_ATTR) {
   2785		dst->i_extra_isize = src->i_extra_isize;
   2786
   2787		if (f2fs_sb_has_flexible_inline_xattr(sbi) &&
   2788			F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
   2789							i_inline_xattr_size))
   2790			dst->i_inline_xattr_size = src->i_inline_xattr_size;
   2791
   2792		if (f2fs_sb_has_project_quota(sbi) &&
   2793			F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
   2794								i_projid))
   2795			dst->i_projid = src->i_projid;
   2796
   2797		if (f2fs_sb_has_inode_crtime(sbi) &&
   2798			F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
   2799							i_crtime_nsec)) {
   2800			dst->i_crtime = src->i_crtime;
   2801			dst->i_crtime_nsec = src->i_crtime_nsec;
   2802		}
   2803	}
   2804
   2805	new_ni = old_ni;
   2806	new_ni.ino = ino;
   2807
   2808	if (unlikely(inc_valid_node_count(sbi, NULL, true)))
   2809		WARN_ON(1);
   2810	set_node_addr(sbi, &new_ni, NEW_ADDR, false);
   2811	inc_valid_inode_count(sbi);
   2812	set_page_dirty(ipage);
   2813	f2fs_put_page(ipage, 1);
   2814	return 0;
   2815}
   2816
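        /*
         * Rebuild the node summary for segment @segno by reading each node
         * block in the segment and recording its nid in @sum.
         */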
   2817int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
   2818			unsigned int segno, struct f2fs_summary_block *sum)
   2819{
   2820	struct f2fs_node *rn;
   2821	struct f2fs_summary *sum_entry;
   2822	block_t addr;
   2823	int i, idx, last_offset, nrpages;
   2824
   2825	/* scan the node segment */
   2826	last_offset = sbi->blocks_per_seg;
   2827	addr = START_BLOCK(sbi, segno);
   2828	sum_entry = &sum->entries[0];
   2829
   2830	for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
   2831		nrpages = bio_max_segs(last_offset - i);
   2832
   2833		/* readahead node pages */
   2834		f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true);
   2835
   2836		for (idx = addr; idx < addr + nrpages; idx++) {
   2837			struct page *page = f2fs_get_tmp_page(sbi, idx);
   2838
   2839			if (IS_ERR(page))
   2840				return PTR_ERR(page);
   2841
   2842			rn = F2FS_NODE(page);
   2843			sum_entry->nid = rn->footer.nid;
   2844			sum_entry->version = 0;
   2845			sum_entry->ofs_in_node = 0;
   2846			sum_entry++;
   2847			f2fs_put_page(page, 1);
   2848		}
   2849
   2850		invalidate_mapping_pages(META_MAPPING(sbi), addr,
   2851							addr + nrpages);
   2852	}
   2853	return 0;
   2854}
   2855
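        /*
         * Drain the NAT journal of the current hot data segment into the NAT
         * cache as dirty entries, so they are flushed through nat entry sets
         * rather than through the journal.
         */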
   2856static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
   2857{
   2858	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2859	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
   2860	struct f2fs_journal *journal = curseg->journal;
   2861	int i;
   2862
   2863	down_write(&curseg->journal_rwsem);
   2864	for (i = 0; i < nats_in_cursum(journal); i++) {
   2865		struct nat_entry *ne;
   2866		struct f2fs_nat_entry raw_ne;
   2867		nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
   2868
   2869		if (f2fs_check_nid_range(sbi, nid))
   2870			continue;
   2871
   2872		raw_ne = nat_in_journal(journal, i);
   2873
   2874		ne = __lookup_nat_cache(nm_i, nid);
   2875		if (!ne) {
   2876			ne = __alloc_nat_entry(sbi, nid, true);
   2877			__init_nat_entry(nm_i, ne, &raw_ne, true);
   2878		}
   2879
    2880		/*
    2881		 * if a free nat in the journal has not been used since the last
    2882		 * checkpoint, we should remove it from the available nids,
    2883		 * since we will add it again later.
    2884		 */
   2885		if (!get_nat_flag(ne, IS_DIRTY) &&
   2886				le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {
   2887			spin_lock(&nm_i->nid_list_lock);
   2888			nm_i->available_nids--;
   2889			spin_unlock(&nm_i->nid_list_lock);
   2890		}
   2891
   2892		__set_nat_cache_dirty(nm_i, ne);
   2893	}
   2894	update_nats_in_cursum(journal, -i);
   2895	up_write(&curseg->journal_rwsem);
   2896}
   2897
   2898static void __adjust_nat_entry_set(struct nat_entry_set *nes,
   2899						struct list_head *head, int max)
   2900{
   2901	struct nat_entry_set *cur;
   2902
   2903	if (nes->entry_cnt >= max)
   2904		goto add_out;
   2905
   2906	list_for_each_entry(cur, head, set_list) {
   2907		if (cur->entry_cnt >= nes->entry_cnt) {
   2908			list_add(&nes->set_list, cur->set_list.prev);
   2909			return;
   2910		}
   2911	}
   2912add_out:
   2913	list_add_tail(&nes->set_list, head);
   2914}
   2915
   2916static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs,
   2917							unsigned int valid)
   2918{
   2919	if (valid == 0) {
   2920		__set_bit_le(nat_ofs, nm_i->empty_nat_bits);
   2921		__clear_bit_le(nat_ofs, nm_i->full_nat_bits);
   2922		return;
   2923	}
   2924
   2925	__clear_bit_le(nat_ofs, nm_i->empty_nat_bits);
   2926	if (valid == NAT_ENTRY_PER_BLOCK)
   2927		__set_bit_le(nat_ofs, nm_i->full_nat_bits);
   2928	else
   2929		__clear_bit_le(nat_ofs, nm_i->full_nat_bits);
   2930}
   2931
   2932static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
   2933						struct page *page)
   2934{
   2935	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2936	unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
   2937	struct f2fs_nat_block *nat_blk = page_address(page);
   2938	int valid = 0;
   2939	int i = 0;
   2940
   2941	if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
   2942		return;
   2943
   2944	if (nat_index == 0) {
   2945		valid = 1;
   2946		i = 1;
   2947	}
   2948	for (; i < NAT_ENTRY_PER_BLOCK; i++) {
   2949		if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR)
   2950			valid++;
   2951	}
   2952
   2953	__update_nat_bits(nm_i, nat_index, valid);
   2954}
   2955
   2956void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
   2957{
   2958	struct f2fs_nm_info *nm_i = NM_I(sbi);
   2959	unsigned int nat_ofs;
   2960
   2961	f2fs_down_read(&nm_i->nat_tree_lock);
   2962
   2963	for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
   2964		unsigned int valid = 0, nid_ofs = 0;
   2965
    2966		/* handle nid zero, since it should never be used */
   2967		if (unlikely(nat_ofs == 0)) {
   2968			valid = 1;
   2969			nid_ofs = 1;
   2970		}
   2971
   2972		for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) {
   2973			if (!test_bit_le(nid_ofs,
   2974					nm_i->free_nid_bitmap[nat_ofs]))
   2975				valid++;
   2976		}
   2977
   2978		__update_nat_bits(nm_i, nat_ofs, valid);
   2979	}
   2980
   2981	f2fs_up_read(&nm_i->nat_tree_lock);
   2982}
   2983
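        /*
         * Flush the dirty NAT entries of @set either into the NAT journal of
         * the current hot data segment or, on unmount or when the journal is
         * full, into the corresponding NAT page, updating free nid state and
         * nat_bits on the way.
         */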
   2984static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
   2985		struct nat_entry_set *set, struct cp_control *cpc)
   2986{
   2987	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
   2988	struct f2fs_journal *journal = curseg->journal;
   2989	nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
   2990	bool to_journal = true;
   2991	struct f2fs_nat_block *nat_blk;
   2992	struct nat_entry *ne, *cur;
   2993	struct page *page = NULL;
   2994
   2995	/*
   2996	 * there are two steps to flush nat entries:
   2997	 * #1, flush nat entries to journal in current hot data summary block.
   2998	 * #2, flush nat entries to nat page.
   2999	 */
   3000	if ((cpc->reason & CP_UMOUNT) ||
   3001		!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
   3002		to_journal = false;
   3003
   3004	if (to_journal) {
   3005		down_write(&curseg->journal_rwsem);
   3006	} else {
   3007		page = get_next_nat_page(sbi, start_nid);
   3008		if (IS_ERR(page))
   3009			return PTR_ERR(page);
   3010
   3011		nat_blk = page_address(page);
   3012		f2fs_bug_on(sbi, !nat_blk);
   3013	}
   3014
   3015	/* flush dirty nats in nat entry set */
   3016	list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
   3017		struct f2fs_nat_entry *raw_ne;
   3018		nid_t nid = nat_get_nid(ne);
   3019		int offset;
   3020
   3021		f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR);
   3022
   3023		if (to_journal) {
   3024			offset = f2fs_lookup_journal_in_cursum(journal,
   3025							NAT_JOURNAL, nid, 1);
   3026			f2fs_bug_on(sbi, offset < 0);
   3027			raw_ne = &nat_in_journal(journal, offset);
   3028			nid_in_journal(journal, offset) = cpu_to_le32(nid);
   3029		} else {
   3030			raw_ne = &nat_blk->entries[nid - start_nid];
   3031		}
   3032		raw_nat_from_node_info(raw_ne, &ne->ni);
   3033		nat_reset_flag(ne);
   3034		__clear_nat_cache_dirty(NM_I(sbi), set, ne);
   3035		if (nat_get_blkaddr(ne) == NULL_ADDR) {
   3036			add_free_nid(sbi, nid, false, true);
   3037		} else {
   3038			spin_lock(&NM_I(sbi)->nid_list_lock);
   3039			update_free_nid_bitmap(sbi, nid, false, false);
   3040			spin_unlock(&NM_I(sbi)->nid_list_lock);
   3041		}
   3042	}
   3043
   3044	if (to_journal) {
   3045		up_write(&curseg->journal_rwsem);
   3046	} else {
   3047		update_nat_bits(sbi, start_nid, page);
   3048		f2fs_put_page(page, 1);
   3049	}
   3050
   3051	/* Allow dirty nats by node block allocation in write_begin */
   3052	if (!set->entry_cnt) {
   3053		radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
   3054		kmem_cache_free(nat_entry_set_slab, set);
   3055	}
   3056	return 0;
   3057}
   3058
   3059/*
   3060 * This function is called during the checkpointing process.
   3061 */
   3062int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
   3063{
   3064	struct f2fs_nm_info *nm_i = NM_I(sbi);
   3065	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
   3066	struct f2fs_journal *journal = curseg->journal;
   3067	struct nat_entry_set *setvec[SETVEC_SIZE];
   3068	struct nat_entry_set *set, *tmp;
   3069	unsigned int found;
   3070	nid_t set_idx = 0;
   3071	LIST_HEAD(sets);
   3072	int err = 0;
   3073
   3074	/*
   3075	 * during unmount, let's flush nat_bits before checking
   3076	 * nat_cnt[DIRTY_NAT].
   3077	 */
   3078	if (cpc->reason & CP_UMOUNT) {
   3079		f2fs_down_write(&nm_i->nat_tree_lock);
   3080		remove_nats_in_journal(sbi);
   3081		f2fs_up_write(&nm_i->nat_tree_lock);
   3082	}
   3083
   3084	if (!nm_i->nat_cnt[DIRTY_NAT])
   3085		return 0;
   3086
   3087	f2fs_down_write(&nm_i->nat_tree_lock);
   3088
    3089	/*
    3090	 * if there is not enough space in the journal to store dirty nat
    3091	 * entries, remove all entries from the journal and merge them
    3092	 * into the nat entry set.
    3093	 */
   3094	if (cpc->reason & CP_UMOUNT ||
   3095		!__has_cursum_space(journal,
   3096			nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
   3097		remove_nats_in_journal(sbi);
   3098
   3099	while ((found = __gang_lookup_nat_set(nm_i,
   3100					set_idx, SETVEC_SIZE, setvec))) {
   3101		unsigned idx;
   3102
   3103		set_idx = setvec[found - 1]->set + 1;
   3104		for (idx = 0; idx < found; idx++)
   3105			__adjust_nat_entry_set(setvec[idx], &sets,
   3106						MAX_NAT_JENTRIES(journal));
   3107	}
   3108
   3109	/* flush dirty nats in nat entry set */
   3110	list_for_each_entry_safe(set, tmp, &sets, set_list) {
   3111		err = __flush_nat_entry_set(sbi, set, cpc);
   3112		if (err)
   3113			break;
   3114	}
   3115
   3116	f2fs_up_write(&nm_i->nat_tree_lock);
   3117	/* Allow dirty nats by node block allocation in write_begin */
   3118
   3119	return err;
   3120}
   3121
   3122static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
   3123{
   3124	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
   3125	struct f2fs_nm_info *nm_i = NM_I(sbi);
   3126	unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE;
   3127	unsigned int i;
   3128	__u64 cp_ver = cur_cp_version(ckpt);
   3129	block_t nat_bits_addr;
   3130
   3131	nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
   3132	nm_i->nat_bits = f2fs_kvzalloc(sbi,
   3133			nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
   3134	if (!nm_i->nat_bits)
   3135		return -ENOMEM;
   3136
   3137	nm_i->full_nat_bits = nm_i->nat_bits + 8;
   3138	nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
   3139
   3140	if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
   3141		return 0;
   3142
   3143	nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
   3144						nm_i->nat_bits_blocks;
   3145	for (i = 0; i < nm_i->nat_bits_blocks; i++) {
   3146		struct page *page;
   3147
   3148		page = f2fs_get_meta_page(sbi, nat_bits_addr++);
   3149		if (IS_ERR(page))
   3150			return PTR_ERR(page);
   3151
   3152		memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
   3153					page_address(page), F2FS_BLKSIZE);
   3154		f2fs_put_page(page, 1);
   3155	}
   3156
   3157	cp_ver |= (cur_cp_crc(ckpt) << 32);
   3158	if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
   3159		clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
   3160		f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)",
   3161			cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits));
   3162		return 0;
   3163	}
   3164
   3165	f2fs_notice(sbi, "Found nat_bits in checkpoint");
   3166	return 0;
   3167}
   3168
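        /*
         * Seed the free nid bitmaps from the nat_bits saved in the
         * checkpoint: empty NAT blocks are marked with all nids free, full
         * blocks are marked scanned with no free nids.
         */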
   3169static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
   3170{
   3171	struct f2fs_nm_info *nm_i = NM_I(sbi);
   3172	unsigned int i = 0;
   3173	nid_t nid, last_nid;
   3174
   3175	if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
   3176		return;
   3177
   3178	for (i = 0; i < nm_i->nat_blocks; i++) {
   3179		i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i);
   3180		if (i >= nm_i->nat_blocks)
   3181			break;
   3182
   3183		__set_bit_le(i, nm_i->nat_block_bitmap);
   3184
   3185		nid = i * NAT_ENTRY_PER_BLOCK;
   3186		last_nid = nid + NAT_ENTRY_PER_BLOCK;
   3187
   3188		spin_lock(&NM_I(sbi)->nid_list_lock);
   3189		for (; nid < last_nid; nid++)
   3190			update_free_nid_bitmap(sbi, nid, true, true);
   3191		spin_unlock(&NM_I(sbi)->nid_list_lock);
   3192	}
   3193
   3194	for (i = 0; i < nm_i->nat_blocks; i++) {
   3195		i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i);
   3196		if (i >= nm_i->nat_blocks)
   3197			break;
   3198
   3199		__set_bit_le(i, nm_i->nat_block_bitmap);
   3200	}
   3201}
   3202
   3203static int init_node_manager(struct f2fs_sb_info *sbi)
   3204{
   3205	struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
   3206	struct f2fs_nm_info *nm_i = NM_I(sbi);
   3207	unsigned char *version_bitmap;
   3208	unsigned int nat_segs;
   3209	int err;
   3210
   3211	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
   3212
    3213	/* segment_count_nat includes the pair segment, so divide by 2. */
   3214	nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
   3215	nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
   3216	nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks;
   3217
    3218	/* unused nids: 0, node, meta (and root, which is counted as a valid node) */
   3219	nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
   3220						F2FS_RESERVED_NODE_NUM;
   3221	nm_i->nid_cnt[FREE_NID] = 0;
   3222	nm_i->nid_cnt[PREALLOC_NID] = 0;
   3223	nm_i->ram_thresh = DEF_RAM_THRESHOLD;
   3224	nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
   3225	nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
   3226	nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS;
   3227
   3228	INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
   3229	INIT_LIST_HEAD(&nm_i->free_nid_list);
   3230	INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
   3231	INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
   3232	INIT_LIST_HEAD(&nm_i->nat_entries);
   3233	spin_lock_init(&nm_i->nat_list_lock);
   3234
   3235	mutex_init(&nm_i->build_lock);
   3236	spin_lock_init(&nm_i->nid_list_lock);
   3237	init_f2fs_rwsem(&nm_i->nat_tree_lock);
   3238
   3239	nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
   3240	nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
   3241	version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
   3242	nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
   3243					GFP_KERNEL);
   3244	if (!nm_i->nat_bitmap)
   3245		return -ENOMEM;
   3246
   3247	err = __get_nat_bitmaps(sbi);
   3248	if (err)
   3249		return err;
   3250
   3251#ifdef CONFIG_F2FS_CHECK_FS
   3252	nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size,
   3253					GFP_KERNEL);
   3254	if (!nm_i->nat_bitmap_mir)
   3255		return -ENOMEM;
   3256#endif
   3257
   3258	return 0;
   3259}
   3260
   3261static int init_free_nid_cache(struct f2fs_sb_info *sbi)
   3262{
   3263	struct f2fs_nm_info *nm_i = NM_I(sbi);
   3264	int i;
   3265
   3266	nm_i->free_nid_bitmap =
   3267		f2fs_kvzalloc(sbi, array_size(sizeof(unsigned char *),
   3268					      nm_i->nat_blocks),
   3269			      GFP_KERNEL);
   3270	if (!nm_i->free_nid_bitmap)
   3271		return -ENOMEM;
   3272
   3273	for (i = 0; i < nm_i->nat_blocks; i++) {
   3274		nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi,
   3275			f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL);
   3276		if (!nm_i->free_nid_bitmap[i])
   3277			return -ENOMEM;
   3278	}
   3279
   3280	nm_i->nat_block_bitmap = f2fs_kvzalloc(sbi, nm_i->nat_blocks / 8,
   3281								GFP_KERNEL);
   3282	if (!nm_i->nat_block_bitmap)
   3283		return -ENOMEM;
   3284
   3285	nm_i->free_nid_count =
   3286		f2fs_kvzalloc(sbi, array_size(sizeof(unsigned short),
   3287					      nm_i->nat_blocks),
   3288			      GFP_KERNEL);
   3289	if (!nm_i->free_nid_count)
   3290		return -ENOMEM;
   3291	return 0;
   3292}
   3293
   3294int f2fs_build_node_manager(struct f2fs_sb_info *sbi)
   3295{
   3296	int err;
   3297
   3298	sbi->nm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_nm_info),
   3299							GFP_KERNEL);
   3300	if (!sbi->nm_info)
   3301		return -ENOMEM;
   3302
   3303	err = init_node_manager(sbi);
   3304	if (err)
   3305		return err;
   3306
   3307	err = init_free_nid_cache(sbi);
   3308	if (err)
   3309		return err;
   3310
   3311	/* load free nid status from nat_bits table */
   3312	load_free_nid_bitmap(sbi);
   3313
   3314	return f2fs_build_free_nids(sbi, true, true);
   3315}
   3316
   3317void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
   3318{
   3319	struct f2fs_nm_info *nm_i = NM_I(sbi);
   3320	struct free_nid *i, *next_i;
   3321	struct nat_entry *natvec[NATVEC_SIZE];
   3322	struct nat_entry_set *setvec[SETVEC_SIZE];
   3323	nid_t nid = 0;
   3324	unsigned int found;
   3325
   3326	if (!nm_i)
   3327		return;
   3328
   3329	/* destroy free nid list */
   3330	spin_lock(&nm_i->nid_list_lock);
   3331	list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
   3332		__remove_free_nid(sbi, i, FREE_NID);
   3333		spin_unlock(&nm_i->nid_list_lock);
   3334		kmem_cache_free(free_nid_slab, i);
   3335		spin_lock(&nm_i->nid_list_lock);
   3336	}
   3337	f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]);
   3338	f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]);
   3339	f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list));
   3340	spin_unlock(&nm_i->nid_list_lock);
   3341
   3342	/* destroy nat cache */
   3343	f2fs_down_write(&nm_i->nat_tree_lock);
   3344	while ((found = __gang_lookup_nat_cache(nm_i,
   3345					nid, NATVEC_SIZE, natvec))) {
   3346		unsigned idx;
   3347
   3348		nid = nat_get_nid(natvec[found - 1]) + 1;
   3349		for (idx = 0; idx < found; idx++) {
   3350			spin_lock(&nm_i->nat_list_lock);
   3351			list_del(&natvec[idx]->list);
   3352			spin_unlock(&nm_i->nat_list_lock);
   3353
   3354			__del_from_nat_cache(nm_i, natvec[idx]);
   3355		}
   3356	}
   3357	f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]);
   3358
   3359	/* destroy nat set cache */
   3360	nid = 0;
   3361	while ((found = __gang_lookup_nat_set(nm_i,
   3362					nid, SETVEC_SIZE, setvec))) {
   3363		unsigned idx;
   3364
   3365		nid = setvec[found - 1]->set + 1;
   3366		for (idx = 0; idx < found; idx++) {
    3367			/* entry_cnt is not zero when a cp_error has occurred */
   3368			f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
   3369			radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
   3370			kmem_cache_free(nat_entry_set_slab, setvec[idx]);
   3371		}
   3372	}
   3373	f2fs_up_write(&nm_i->nat_tree_lock);
   3374
   3375	kvfree(nm_i->nat_block_bitmap);
   3376	if (nm_i->free_nid_bitmap) {
   3377		int i;
   3378
   3379		for (i = 0; i < nm_i->nat_blocks; i++)
   3380			kvfree(nm_i->free_nid_bitmap[i]);
   3381		kvfree(nm_i->free_nid_bitmap);
   3382	}
   3383	kvfree(nm_i->free_nid_count);
   3384
   3385	kvfree(nm_i->nat_bitmap);
   3386	kvfree(nm_i->nat_bits);
   3387#ifdef CONFIG_F2FS_CHECK_FS
   3388	kvfree(nm_i->nat_bitmap_mir);
   3389#endif
   3390	sbi->nm_info = NULL;
   3391	kfree(nm_i);
   3392}
   3393
   3394int __init f2fs_create_node_manager_caches(void)
   3395{
   3396	nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry",
   3397			sizeof(struct nat_entry));
   3398	if (!nat_entry_slab)
   3399		goto fail;
   3400
   3401	free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid",
   3402			sizeof(struct free_nid));
   3403	if (!free_nid_slab)
   3404		goto destroy_nat_entry;
   3405
   3406	nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set",
   3407			sizeof(struct nat_entry_set));
   3408	if (!nat_entry_set_slab)
   3409		goto destroy_free_nid;
   3410
   3411	fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry",
   3412			sizeof(struct fsync_node_entry));
   3413	if (!fsync_node_entry_slab)
   3414		goto destroy_nat_entry_set;
   3415	return 0;
   3416
   3417destroy_nat_entry_set:
   3418	kmem_cache_destroy(nat_entry_set_slab);
   3419destroy_free_nid:
   3420	kmem_cache_destroy(free_nid_slab);
   3421destroy_nat_entry:
   3422	kmem_cache_destroy(nat_entry_slab);
   3423fail:
   3424	return -ENOMEM;
   3425}
   3426
   3427void f2fs_destroy_node_manager_caches(void)
   3428{
   3429	kmem_cache_destroy(fsync_node_entry_slab);
   3430	kmem_cache_destroy(nat_entry_set_slab);
   3431	kmem_cache_destroy(free_nid_slab);
   3432	kmem_cache_destroy(nat_entry_slab);
   3433}