From 16c4042f08919f447d6b2a55679546c9b97c7264 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 11 Aug 2010 14:17:39 -0700 Subject: writeback: avoid unnecessary calculation of bdi dirty thresholds Split get_dirty_limits() into global_dirty_limits()+bdi_dirty_limit(), so that the latter can be avoided when under global dirty background threshold (which is the normal state for most systems). Signed-off-by: Wu Fengguang Cc: Peter Zijlstra Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index c24eca71e80c..72a5d647a5f2 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -124,8 +124,9 @@ struct ctl_table; int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, - unsigned long *pbdi_dirty, struct backing_dev_info *bdi); +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); +unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, + unsigned long dirty); void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, -- cgit v1.2.3-71-gd317 From 9e38d86ff2d8a8db99570e982230861046df32b5 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 23 Oct 2010 06:55:17 -0400 Subject: fs: Implement lazy LRU updates for inodes Convert the inode LRU to use lazy updates to reduce lock and cacheline traffic. We avoid moving inodes around in the LRU list during iget/iput operations so these frequent operations don't need to access the LRUs. Instead, we defer the refcount checks to reclaim-time and use a per-inode state flag, I_REFERENCED, to tell reclaim that iget has touched the inode in the past. This means that only reclaim should be touching the LRU with any frequency, hence significantly reducing lock acquisitions and the amount contention on LRU updates. This also removes the inode_in_use list, which means we now only have one list for tracking the inode LRU status. This makes it much simpler to split out the LRU list operations under it's own lock. Signed-off-by: Nick Piggin Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/fs-writeback.c | 11 +++--- fs/inode.c | 86 +++++++++++++++++++++++++++++++++-------------- include/linux/fs.h | 13 +++---- include/linux/writeback.h | 2 -- 4 files changed, 71 insertions(+), 41 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f04d04af84f2..e8f65290e836 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -408,16 +408,13 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completion. */ redirty_tail(inode); - } else if (atomic_read(&inode->i_count)) { - /* - * The inode is clean, inuse - */ - list_move(&inode->i_list, &inode_in_use); } else { /* - * The inode is clean, unused + * The inode is clean. At this point we either have + * a reference to the inode or it's on it's way out. + * No need to add it back to the LRU. */ - list_move(&inode->i_list, &inode_unused); + list_del_init(&inode->i_list); } } inode_sync_complete(inode); diff --git a/fs/inode.c b/fs/inode.c index 0d5aeccbdd90..3bdc76f1653a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -72,8 +72,7 @@ static unsigned int i_hash_shift __read_mostly; * allowing for low-overhead inode sync() operations. */ -LIST_HEAD(inode_in_use); -LIST_HEAD(inode_unused); +static LIST_HEAD(inode_unused); static struct hlist_head *inode_hashtable __read_mostly; /* @@ -291,6 +290,7 @@ void inode_init_once(struct inode *inode) INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_list); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); spin_lock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); @@ -317,12 +317,23 @@ static void init_once(void *foo) */ void __iget(struct inode *inode) { - if (atomic_inc_return(&inode->i_count) != 1) - return; + atomic_inc(&inode->i_count); +} - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_in_use); - percpu_counter_dec(&nr_inodes_unused); +static void inode_lru_list_add(struct inode *inode) +{ + if (list_empty(&inode->i_list)) { + list_add(&inode->i_list, &inode_unused); + percpu_counter_inc(&nr_inodes_unused); + } +} + +static void inode_lru_list_del(struct inode *inode) +{ + if (!list_empty(&inode->i_list)) { + list_del_init(&inode->i_list); + percpu_counter_dec(&nr_inodes_unused); + } } void end_writeback(struct inode *inode) @@ -367,7 +378,7 @@ static void dispose_list(struct list_head *head) struct inode *inode; inode = list_first_entry(head, struct inode, i_list); - list_del(&inode->i_list); + list_del_init(&inode->i_list); evict(inode); @@ -413,7 +424,8 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) list_move(&inode->i_list, dispose); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - percpu_counter_dec(&nr_inodes_unused); + if (!(inode->i_state & (I_DIRTY | I_SYNC))) + percpu_counter_dec(&nr_inodes_unused); continue; } busy = 1; @@ -448,7 +460,7 @@ int invalidate_inodes(struct super_block *sb) static int can_unuse(struct inode *inode) { - if (inode->i_state) + if (inode->i_state & ~I_REFERENCED) return 0; if (inode_has_buffers(inode)) return 0; @@ -460,17 +472,20 @@ static int can_unuse(struct inode *inode) } /* - * Scan `goal' inodes on the unused list for freeable ones. They are moved to - * a temporary list and then are freed outside inode_lock by dispose_list(). + * Scan `goal' inodes on the unused list for freeable ones. They are moved to a + * temporary list and then are freed outside inode_lock by dispose_list(). * * Any inodes which are pinned purely because of attached pagecache have their - * pagecache removed. We expect the final iput() on that inode to add it to - * the front of the inode_unused list. So look for it there and if the - * inode is still freeable, proceed. The right inode is found 99.9% of the - * time in testing on a 4-way. + * pagecache removed. If the inode has metadata buffers attached to + * mapping->private_list then try to remove them. * - * If the inode has metadata buffers attached to mapping->private_list then - * try to remove them. + * If the inode has the I_REFERENCED flag set, then it means that it has been + * used recently - the flag is set in iput_final(). When we encounter such an + * inode, clear the flag and move it to the back of the LRU so it gets another + * pass through the LRU before it gets reclaimed. This is necessary because of + * the fact we are doing lazy LRU updates to minimise lock contention so the + * LRU does not have strict ordering. Hence we don't want to reclaim inodes + * with this flag set because they are the inodes that are out of order. */ static void prune_icache(int nr_to_scan) { @@ -488,8 +503,21 @@ static void prune_icache(int nr_to_scan) inode = list_entry(inode_unused.prev, struct inode, i_list); - if (inode->i_state || atomic_read(&inode->i_count)) { + /* + * Referenced or dirty inodes are still in use. Give them + * another pass through the LRU as we canot reclaim them now. + */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_del_init(&inode->i_list); + percpu_counter_dec(&nr_inodes_unused); + continue; + } + + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { list_move(&inode->i_list, &inode_unused); + inode->i_state &= ~I_REFERENCED; continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { @@ -620,7 +648,6 @@ static inline void __inode_add_to_lists(struct super_block *sb, struct hlist_head *head, struct inode *inode) { - list_add(&inode->i_list, &inode_in_use); list_add(&inode->i_sb_list, &sb->s_inodes); if (head) hlist_add_head(&inode->i_hash, head); @@ -1237,10 +1264,11 @@ static void iput_final(struct inode *inode) drop = generic_drop_inode(inode); if (!drop) { - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_unused); - percpu_counter_inc(&nr_inodes_unused); if (sb->s_flags & MS_ACTIVE) { + inode->i_state |= I_REFERENCED; + if (!(inode->i_state & (I_DIRTY|I_SYNC))) { + inode_lru_list_add(inode); + } spin_unlock(&inode_lock); return; } @@ -1251,13 +1279,19 @@ static void iput_final(struct inode *inode) spin_lock(&inode_lock); WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; - percpu_counter_dec(&nr_inodes_unused); hlist_del_init(&inode->i_hash); } - list_del_init(&inode->i_list); - list_del_init(&inode->i_sb_list); WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; + + /* + * After we delete the inode from the LRU here, we avoid moving dirty + * inodes back onto the LRU now because I_FREEING is set and hence + * writeback_single_inode() won't move the inode around. + */ + inode_lru_list_del(inode); + + list_del_init(&inode->i_sb_list); spin_unlock(&inode_lock); evict(inode); spin_lock(&inode_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index a3937a8ee95e..876275fc0638 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1641,16 +1641,17 @@ struct super_operations { * * Q: What is the difference between I_WILL_FREE and I_FREEING? */ -#define I_DIRTY_SYNC 1 -#define I_DIRTY_DATASYNC 2 -#define I_DIRTY_PAGES 4 +#define I_DIRTY_SYNC (1 << 0) +#define I_DIRTY_DATASYNC (1 << 1) +#define I_DIRTY_PAGES (1 << 2) #define __I_NEW 3 #define I_NEW (1 << __I_NEW) -#define I_WILL_FREE 16 -#define I_FREEING 32 -#define I_CLEAR 64 +#define I_WILL_FREE (1 << 4) +#define I_FREEING (1 << 5) +#define I_CLEAR (1 << 6) #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) +#define I_REFERENCED (1 << 8) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 72a5d647a5f2..242b6f812ba6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -10,8 +10,6 @@ struct backing_dev_info; extern spinlock_t inode_lock; -extern struct list_head inode_in_use; -extern struct list_head inode_unused; /* * fs/fs-writeback.c -- cgit v1.2.3-71-gd317 From 92c09c041f15fc88b35f8628e07639f52e1fbb38 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 26 Oct 2010 14:22:03 -0700 Subject: mm: declare some external symbols Declare 'bdi_pending_list' and 'tag_pages_for_writeback()' to remove following sparse warnings: mm/backing-dev.c:46:1: warning: symbol 'bdi_pending_list' was not declared. Should it be static? mm/page-writeback.c:825:6: warning: symbol 'tag_pages_for_writeback' was not declared. Should it be static? Signed-off-by: Namhyung Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 1 + include/linux/writeback.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux/writeback.h') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f1b402a50679..4ce34fa937d4 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -111,6 +111,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); extern spinlock_t bdi_lock; extern struct list_head bdi_list; +extern struct list_head bdi_pending_list; static inline int wb_has_dirty_io(struct bdi_writeback *wb) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 72a5d647a5f2..c7299d2ace6b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -149,6 +149,8 @@ int write_cache_pages(struct address_space *mapping, int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void set_page_dirty_balance(struct page *page, int page_mkwrite); void writeback_set_ratelimit(void); +void tag_pages_for_writeback(struct address_space *mapping, + pgoff_t start, pgoff_t end); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl -- cgit v1.2.3-71-gd317 From 5b41d92437f1ae19b3f3ffa3b16589fd5df50ac0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:13 -0400 Subject: ext4: implement writeback livelock avoidance using page tagging This is analogous to Jan Kara's commit, f446daaea9d4a420d16c606f755f3689dcb2d0ce mm: implement writeback livelock avoidance using page tagging but since we forked write_cache_pages, we need to reimplement it there (and in ext4_da_writepages, since range_cyclic handling was moved to there) If you start a large buffered IO to a file, and then set fsync after it, you'll find that fsync does not complete until the other IO stops. If you continue re-dirtying the file (say, putting dd with conv=notrunc in a loop), when fsync finally completes (after all IO is done), it reports via tracing that it has written many more pages than the file contains; in other words it has synced and re-synced pages in the file multiple times. This then leads to problems with our writeback_index update, since it advances it by pages written, and essentially sets writeback_index off the end of the file... With the following patch, we only sync as much as was dirty at the time of the sync. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 18 +++++++++++++++--- include/linux/writeback.h | 2 ++ 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6671fcbb5293..c9ea95ba5fde 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2809,16 +2809,21 @@ static int write_cache_pages_da(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ long nr_to_write = wbc->nr_to_write; + int tag; pagevec_init(&pvec, 0); index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; @@ -2923,6 +2928,7 @@ static int ext4_da_writepages(struct address_space *mapping, long desired_nr_to_write, nr_to_writebump = 0; loff_t range_start = wbc->range_start; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + pgoff_t end; trace_ext4_da_writepages(inode, wbc); @@ -2958,8 +2964,11 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->range_start = index << PAGE_CACHE_SHIFT; wbc->range_end = LLONG_MAX; wbc->range_cyclic = 0; - } else + end = -1; + } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + } /* * This works around two forms of stupidity. The first is in @@ -3000,6 +3009,9 @@ static int ext4_da_writepages(struct address_space *mapping, pages_skipped = wbc->pages_skipped; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); + while (!ret && wbc->nr_to_write > 0) { /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 72a5d647a5f2..3d132bfb4f3d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -143,6 +143,8 @@ typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, int generic_writepages(struct address_space *mapping, struct writeback_control *wbc); +void tag_pages_for_writeback(struct address_space *mapping, + pgoff_t start, pgoff_t end); int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); -- cgit v1.2.3-71-gd317 From 3259f8bed2f0f57c2fdcdac1b510c3fa319ef97e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 29 Oct 2010 11:16:17 -0400 Subject: Add new functions for triggering inode writeback When btrfs is running low on metadata space, it needs to force delayed allocation pages to disk. It currently does this with a suboptimal walk of a private list of inodes with delayed allocation, and it would be much better if we used the generic flusher threads. writeback_inodes_sb_if_idle would be ideal, but it waits for the flusher thread to start IO on all the dirty pages in the FS before it returns. This adds variants of writeback_inodes_sb* that allow the caller to control how many pages get sent down. Signed-off-by: Chris Mason --- fs/fs-writeback.c | 52 ++++++++++++++++++++++++++++++++++++++--------- include/linux/writeback.h | 2 ++ 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'include/linux/writeback.h') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ab38fef1c9a1..1e23c33ea5cf 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1069,33 +1069,44 @@ static void wait_sb_inodes(struct super_block *sb) } /** - * writeback_inodes_sb - writeback dirty inodes from given super_block + * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock + * @nr: the number of pages to write * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait - * for IO completion of submitted IO. The number of pages submitted is - * returned. + * for IO completion of submitted IO. */ -void writeback_inodes_sb(struct super_block *sb) +void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, .done = &done, + .nr_pages = nr, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - - work.nr_pages = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - bdi_queue_work(sb->s_bdi, &work); wait_for_completion(&done); } +EXPORT_SYMBOL(writeback_inodes_sb_nr); + +/** + * writeback_inodes_sb - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. + */ +void writeback_inodes_sb(struct super_block *sb) +{ + return writeback_inodes_sb_nr(sb, global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + (inodes_stat.nr_inodes - inodes_stat.nr_unused)); +} EXPORT_SYMBOL(writeback_inodes_sb); /** @@ -1117,6 +1128,27 @@ int writeback_inodes_sb_if_idle(struct super_block *sb) } EXPORT_SYMBOL(writeback_inodes_sb_if_idle); +/** + * writeback_inodes_sb_if_idle - start writeback if none underway + * @sb: the superblock + * @nr: the number of pages to write + * + * Invoke writeback_inodes_sb if no writeback is currently underway. + * Returns 1 if writeback was started, 0 if not. + */ +int writeback_inodes_sb_nr_if_idle(struct super_block *sb, + unsigned long nr) +{ + if (!writeback_in_progress(sb->s_bdi)) { + down_read(&sb->s_umount); + writeback_inodes_sb_nr(sb, nr); + up_read(&sb->s_umount); + return 1; + } else + return 0; +} +EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); + /** * sync_inodes_sb - sync sb inode pages * @sb: the superblock diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 72a5d647a5f2..a4cf84511e79 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -60,7 +60,9 @@ struct writeback_control { struct bdi_writeback; int inode_wait(void *); void writeback_inodes_sb(struct super_block *); +void writeback_inodes_sb_nr(struct super_block *, unsigned long nr); int writeback_inodes_sb_if_idle(struct super_block *); +int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr); void sync_inodes_sb(struct super_block *); void writeback_inodes_wb(struct bdi_writeback *wb, struct writeback_control *wbc); -- cgit v1.2.3-71-gd317