cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

buffer.c (90332B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 *  linux/fs/buffer.c
      4 *
      5 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
      6 */
      7
      8/*
      9 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
     10 *
     11 * Removed a lot of unnecessary code and simplified things now that
     12 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
     13 *
     14 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
     15 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
     16 *
     17 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
     18 *
     19 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
     20 */
     21
     22#include <linux/kernel.h>
     23#include <linux/sched/signal.h>
     24#include <linux/syscalls.h>
     25#include <linux/fs.h>
     26#include <linux/iomap.h>
     27#include <linux/mm.h>
     28#include <linux/percpu.h>
     29#include <linux/slab.h>
     30#include <linux/capability.h>
     31#include <linux/blkdev.h>
     32#include <linux/file.h>
     33#include <linux/quotaops.h>
     34#include <linux/highmem.h>
     35#include <linux/export.h>
     36#include <linux/backing-dev.h>
     37#include <linux/writeback.h>
     38#include <linux/hash.h>
     39#include <linux/suspend.h>
     40#include <linux/buffer_head.h>
     41#include <linux/task_io_accounting_ops.h>
     42#include <linux/bio.h>
     43#include <linux/cpu.h>
     44#include <linux/bitops.h>
     45#include <linux/mpage.h>
     46#include <linux/bit_spinlock.h>
     47#include <linux/pagevec.h>
     48#include <linux/sched/mm.h>
     49#include <trace/events/block.h>
     50#include <linux/fscrypt.h>
     51
     52#include "internal.h"
     53
     54static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
     55static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
     56			 struct writeback_control *wbc);
     57
     58#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
     59
     60inline void touch_buffer(struct buffer_head *bh)
     61{
     62	trace_block_touch_buffer(bh);
     63	mark_page_accessed(bh->b_page);
     64}
     65EXPORT_SYMBOL(touch_buffer);
     66
     67void __lock_buffer(struct buffer_head *bh)
     68{
     69	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
     70}
     71EXPORT_SYMBOL(__lock_buffer);
     72
     73void unlock_buffer(struct buffer_head *bh)
     74{
     75	clear_bit_unlock(BH_Lock, &bh->b_state);
     76	smp_mb__after_atomic();
     77	wake_up_bit(&bh->b_state, BH_Lock);
     78}
     79EXPORT_SYMBOL(unlock_buffer);
     80
     81/*
     82 * Returns if the folio has dirty or writeback buffers. If all the buffers
     83 * are unlocked and clean then the folio_test_dirty information is stale. If
     84 * any of the buffers are locked, it is assumed they are locked for IO.
     85 */
     86void buffer_check_dirty_writeback(struct folio *folio,
     87				     bool *dirty, bool *writeback)
     88{
     89	struct buffer_head *head, *bh;
     90	*dirty = false;
     91	*writeback = false;
     92
     93	BUG_ON(!folio_test_locked(folio));
     94
     95	head = folio_buffers(folio);
     96	if (!head)
     97		return;
     98
     99	if (folio_test_writeback(folio))
    100		*writeback = true;
    101
    102	bh = head;
    103	do {
    104		if (buffer_locked(bh))
    105			*writeback = true;
    106
    107		if (buffer_dirty(bh))
    108			*dirty = true;
    109
    110		bh = bh->b_this_page;
    111	} while (bh != head);
    112}
    113EXPORT_SYMBOL(buffer_check_dirty_writeback);
    114
    115/*
    116 * Block until a buffer comes unlocked.  This doesn't stop it
    117 * from becoming locked again - you have to lock it yourself
    118 * if you want to preserve its state.
    119 */
    120void __wait_on_buffer(struct buffer_head * bh)
    121{
    122	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
    123}
    124EXPORT_SYMBOL(__wait_on_buffer);
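
/*
 * Illustrative sketch, not part of the original file: the practical
 * difference between the two primitives above.  wait_on_buffer() only waits
 * for the current I/O holder and gives no exclusion afterwards, so a caller
 * that needs the buffer to stay stable must take the lock itself.  The
 * example_* name is hypothetical.
 */
static void example_wait_vs_lock(struct buffer_head *bh)
{
	/* Returns once the current holder unlocks; bh may be locked again. */
	wait_on_buffer(bh);

	/* Take the lock ourselves to keep the buffer state stable. */
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		/* ... bh->b_data can be inspected safely here ... */
	}
	unlock_buffer(bh);
}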
    125
    126static void buffer_io_error(struct buffer_head *bh, char *msg)
    127{
    128	if (!test_bit(BH_Quiet, &bh->b_state))
    129		printk_ratelimited(KERN_ERR
    130			"Buffer I/O error on dev %pg, logical block %llu%s\n",
    131			bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
    132}
    133
    134/*
    135 * End-of-IO handler helper function which does not touch the bh after
    136 * unlocking it.
    137 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
    139 * a race there is benign: unlock_buffer() only uses the bh's address for
    139 * hashing after unlocking the buffer, so it doesn't actually touch the bh
    140 * itself.
    141 */
    142static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
    143{
    144	if (uptodate) {
    145		set_buffer_uptodate(bh);
    146	} else {
    147		/* This happens, due to failed read-ahead attempts. */
    148		clear_buffer_uptodate(bh);
    149	}
    150	unlock_buffer(bh);
    151}
    152
    153/*
    154 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
    155 * unlock the buffer. This is what ll_rw_block uses too.
    156 */
    157void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
    158{
    159	__end_buffer_read_notouch(bh, uptodate);
    160	put_bh(bh);
    161}
    162EXPORT_SYMBOL(end_buffer_read_sync);
    163
    164void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
    165{
    166	if (uptodate) {
    167		set_buffer_uptodate(bh);
    168	} else {
    169		buffer_io_error(bh, ", lost sync page write");
    170		mark_buffer_write_io_error(bh);
    171		clear_buffer_uptodate(bh);
    172	}
    173	unlock_buffer(bh);
    174	put_bh(bh);
    175}
    176EXPORT_SYMBOL(end_buffer_write_sync);
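
/*
 * Illustrative sketch, not part of the original file: the synchronous write
 * pattern end_buffer_write_sync() is meant for, modelled on sync_dirty_buffer().
 * The handler drops the reference taken with get_bh() when the I/O completes.
 * The example_* name is hypothetical.
 */
static int example_write_buffer_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (!test_clear_buffer_dirty(bh)) {
		unlock_buffer(bh);		/* nothing to write */
		return 0;
	}
	get_bh(bh);				/* reference owned by the I/O */
	bh->b_end_io = end_buffer_write_sync;
	submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}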
    177
    178/*
    179 * Various filesystems appear to want __find_get_block to be non-blocking.
    180 * But it's the page lock which protects the buffers.  To get around this,
    181 * we get exclusion from try_to_free_buffers with the blockdev mapping's
    182 * private_lock.
    183 *
    184 * Hack idea: for the blockdev mapping, private_lock contention
    185 * may be quite high.  This code could TryLock the page, and if that
    186 * succeeds, there is no need to take private_lock.
    187 */
    188static struct buffer_head *
    189__find_get_block_slow(struct block_device *bdev, sector_t block)
    190{
    191	struct inode *bd_inode = bdev->bd_inode;
    192	struct address_space *bd_mapping = bd_inode->i_mapping;
    193	struct buffer_head *ret = NULL;
    194	pgoff_t index;
    195	struct buffer_head *bh;
    196	struct buffer_head *head;
    197	struct page *page;
    198	int all_mapped = 1;
    199	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
    200
    201	index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
    202	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
    203	if (!page)
    204		goto out;
    205
    206	spin_lock(&bd_mapping->private_lock);
    207	if (!page_has_buffers(page))
    208		goto out_unlock;
    209	head = page_buffers(page);
    210	bh = head;
    211	do {
    212		if (!buffer_mapped(bh))
    213			all_mapped = 0;
    214		else if (bh->b_blocknr == block) {
    215			ret = bh;
    216			get_bh(bh);
    217			goto out_unlock;
    218		}
    219		bh = bh->b_this_page;
    220	} while (bh != head);
    221
    222	/* we might be here because some of the buffers on this page are
    223	 * not mapped.  This is due to various races between
    224	 * file io on the block device and getblk.  It gets dealt with
    225	 * elsewhere, don't buffer_error if we had some unmapped buffers
    226	 */
    227	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
    228	if (all_mapped && __ratelimit(&last_warned)) {
    229		printk("__find_get_block_slow() failed. block=%llu, "
    230		       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
    231		       "device %pg blocksize: %d\n",
    232		       (unsigned long long)block,
    233		       (unsigned long long)bh->b_blocknr,
    234		       bh->b_state, bh->b_size, bdev,
    235		       1 << bd_inode->i_blkbits);
    236	}
    237out_unlock:
    238	spin_unlock(&bd_mapping->private_lock);
    239	put_page(page);
    240out:
    241	return ret;
    242}
    243
    244static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
    245{
    246	unsigned long flags;
    247	struct buffer_head *first;
    248	struct buffer_head *tmp;
    249	struct page *page;
    250	int page_uptodate = 1;
    251
    252	BUG_ON(!buffer_async_read(bh));
    253
    254	page = bh->b_page;
    255	if (uptodate) {
    256		set_buffer_uptodate(bh);
    257	} else {
    258		clear_buffer_uptodate(bh);
    259		buffer_io_error(bh, ", async page read");
    260		SetPageError(page);
    261	}
    262
    263	/*
    264	 * Be _very_ careful from here on. Bad things can happen if
    265	 * two buffer heads end IO at almost the same time and both
    266	 * decide that the page is now completely done.
    267	 */
    268	first = page_buffers(page);
    269	spin_lock_irqsave(&first->b_uptodate_lock, flags);
    270	clear_buffer_async_read(bh);
    271	unlock_buffer(bh);
    272	tmp = bh;
    273	do {
    274		if (!buffer_uptodate(tmp))
    275			page_uptodate = 0;
    276		if (buffer_async_read(tmp)) {
    277			BUG_ON(!buffer_locked(tmp));
    278			goto still_busy;
    279		}
    280		tmp = tmp->b_this_page;
    281	} while (tmp != bh);
    282	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
    283
    284	/*
    285	 * If none of the buffers had errors and they are all
    286	 * uptodate then we can set the page uptodate.
    287	 */
    288	if (page_uptodate && !PageError(page))
    289		SetPageUptodate(page);
    290	unlock_page(page);
    291	return;
    292
    293still_busy:
    294	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
    295	return;
    296}
    297
    298struct decrypt_bh_ctx {
    299	struct work_struct work;
    300	struct buffer_head *bh;
    301};
    302
    303static void decrypt_bh(struct work_struct *work)
    304{
    305	struct decrypt_bh_ctx *ctx =
    306		container_of(work, struct decrypt_bh_ctx, work);
    307	struct buffer_head *bh = ctx->bh;
    308	int err;
    309
    310	err = fscrypt_decrypt_pagecache_blocks(bh->b_page, bh->b_size,
    311					       bh_offset(bh));
    312	end_buffer_async_read(bh, err == 0);
    313	kfree(ctx);
    314}
    315
    316/*
    317 * I/O completion handler for block_read_full_folio() - pages
    318 * which come unlocked at the end of I/O.
    319 */
    320static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
    321{
    322	/* Decrypt if needed */
    323	if (uptodate &&
    324	    fscrypt_inode_uses_fs_layer_crypto(bh->b_page->mapping->host)) {
    325		struct decrypt_bh_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
    326
    327		if (ctx) {
    328			INIT_WORK(&ctx->work, decrypt_bh);
    329			ctx->bh = bh;
    330			fscrypt_enqueue_decrypt_work(&ctx->work);
    331			return;
    332		}
    333		uptodate = 0;
    334	}
    335	end_buffer_async_read(bh, uptodate);
    336}
    337
    338/*
    339 * Completion handler for block_write_full_page() - pages which are unlocked
    340 * during I/O, and which have PageWriteback cleared upon I/O completion.
    341 */
    342void end_buffer_async_write(struct buffer_head *bh, int uptodate)
    343{
    344	unsigned long flags;
    345	struct buffer_head *first;
    346	struct buffer_head *tmp;
    347	struct page *page;
    348
    349	BUG_ON(!buffer_async_write(bh));
    350
    351	page = bh->b_page;
    352	if (uptodate) {
    353		set_buffer_uptodate(bh);
    354	} else {
    355		buffer_io_error(bh, ", lost async page write");
    356		mark_buffer_write_io_error(bh);
    357		clear_buffer_uptodate(bh);
    358		SetPageError(page);
    359	}
    360
    361	first = page_buffers(page);
    362	spin_lock_irqsave(&first->b_uptodate_lock, flags);
    363
    364	clear_buffer_async_write(bh);
    365	unlock_buffer(bh);
    366	tmp = bh->b_this_page;
    367	while (tmp != bh) {
    368		if (buffer_async_write(tmp)) {
    369			BUG_ON(!buffer_locked(tmp));
    370			goto still_busy;
    371		}
    372		tmp = tmp->b_this_page;
    373	}
    374	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
    375	end_page_writeback(page);
    376	return;
    377
    378still_busy:
    379	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
    380	return;
    381}
    382EXPORT_SYMBOL(end_buffer_async_write);
    383
    384/*
    385 * If a page's buffers are under async read-in (end_buffer_async_read
    386 * completion) then there is a possibility that another thread of
    387 * control could lock one of the buffers after it has completed
    388 * but while some of the other buffers have not completed.  This
    389 * locked buffer would confuse end_buffer_async_read() into not unlocking
    390 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
    391 * that this buffer is not under async I/O.
    392 *
    393 * The page comes unlocked when it has no locked buffer_async buffers
    394 * left.
    395 *
    396 * PageLocked prevents anyone from starting new async I/O against any of
    397 * the buffers.
    398 *
    399 * PageWriteback is used to prevent simultaneous writeout of the same
    400 * page.
    401 *
    402 * PageLocked prevents anyone from starting writeback of a page which is
    403 * under read I/O (PageWriteback is only ever set against a locked page).
    404 */
    405static void mark_buffer_async_read(struct buffer_head *bh)
    406{
    407	bh->b_end_io = end_buffer_async_read_io;
    408	set_buffer_async_read(bh);
    409}
    410
    411static void mark_buffer_async_write_endio(struct buffer_head *bh,
    412					  bh_end_io_t *handler)
    413{
    414	bh->b_end_io = handler;
    415	set_buffer_async_write(bh);
    416}
    417
    418void mark_buffer_async_write(struct buffer_head *bh)
    419{
    420	mark_buffer_async_write_endio(bh, end_buffer_async_write);
    421}
    422EXPORT_SYMBOL(mark_buffer_async_write);
    423
    424
    425/*
    426 * fs/buffer.c contains helper functions for buffer-backed address space's
    427 * fsync functions.  A common requirement for buffer-based filesystems is
    428 * that certain data from the backing blockdev needs to be written out for
    429 * a successful fsync().  For example, ext2 indirect blocks need to be
    430 * written back and waited upon before fsync() returns.
    431 *
    432 * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
    433 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
    434 * management of a list of dependent buffers at ->i_mapping->private_list.
    435 *
    436 * Locking is a little subtle: try_to_free_buffers() will remove buffers
    437 * from their controlling inode's queue when they are being freed.  But
    438 * try_to_free_buffers() will be operating against the *blockdev* mapping
    439 * at the time, not against the S_ISREG file which depends on those buffers.
    440 * So the locking for private_list is via the private_lock in the address_space
    441 * which backs the buffers.  Which is different from the address_space 
    442 * against which the buffers are listed.  So for a particular address_space,
    443 * mapping->private_lock does *not* protect mapping->private_list!  In fact,
    444 * mapping->private_list will always be protected by the backing blockdev's
    445 * ->private_lock.
    446 *
    447 * Which introduces a requirement: all buffers on an address_space's
    448 * ->private_list must be from the same address_space: the blockdev's.
    449 *
    450 * address_spaces which do not place buffers at ->private_list via these
    451 * utility functions are free to use private_lock and private_list for
    452 * whatever they want.  The only requirement is that list_empty(private_list)
    453 * be true at clear_inode() time.
    454 *
    455 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
    456 * filesystems should do that.  invalidate_inode_buffers() should just go
    457 * BUG_ON(!list_empty).
    458 *
    459 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
    460 * take an address_space, not an inode.  And it should be called
    461 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
    462 * queued up.
    463 *
    464 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
    465 * list if it is already on a list.  Because if the buffer is on a list,
    466 * it *must* already be on the right one.  If not, the filesystem is being
    467 * silly.  This will save a ton of locking.  But first we have to ensure
    468 * that buffers are taken *off* the old inode's list when they are freed
    469 * (presumably in truncate).  That requires careful auditing of all
    470 * filesystems (do it inside bforget()).  It could also be done by bringing
    471 * b_inode back.
    472 */
    473
    474/*
    475 * The buffer's backing address_space's private_lock must be held
    476 */
    477static void __remove_assoc_queue(struct buffer_head *bh)
    478{
    479	list_del_init(&bh->b_assoc_buffers);
    480	WARN_ON(!bh->b_assoc_map);
    481	bh->b_assoc_map = NULL;
    482}
    483
    484int inode_has_buffers(struct inode *inode)
    485{
    486	return !list_empty(&inode->i_data.private_list);
    487}
    488
    489/*
    490 * osync is designed to support O_SYNC io.  It waits synchronously for
    491 * all already-submitted IO to complete, but does not queue any new
    492 * writes to the disk.
    493 *
    494 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
    495 * you dirty the buffers, and then use osync_inode_buffers to wait for
    496 * completion.  Any other dirty buffers which are not yet queued for
    497 * write will not be flushed to disk by the osync.
    498 */
    499static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
    500{
    501	struct buffer_head *bh;
    502	struct list_head *p;
    503	int err = 0;
    504
    505	spin_lock(lock);
    506repeat:
    507	list_for_each_prev(p, list) {
    508		bh = BH_ENTRY(p);
    509		if (buffer_locked(bh)) {
    510			get_bh(bh);
    511			spin_unlock(lock);
    512			wait_on_buffer(bh);
    513			if (!buffer_uptodate(bh))
    514				err = -EIO;
    515			brelse(bh);
    516			spin_lock(lock);
    517			goto repeat;
    518		}
    519	}
    520	spin_unlock(lock);
    521	return err;
    522}
    523
    524void emergency_thaw_bdev(struct super_block *sb)
    525{
    526	while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
    527		printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
    528}
    529
    530/**
    531 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
    532 * @mapping: the mapping which wants those buffers written
    533 *
    534 * Starts I/O against the buffers at mapping->private_list, and waits upon
    535 * that I/O.
    536 *
    537 * Basically, this is a convenience function for fsync().
    538 * @mapping is a file or directory which needs those buffers to be written for
    539 * a successful fsync().
    540 */
    541int sync_mapping_buffers(struct address_space *mapping)
    542{
    543	struct address_space *buffer_mapping = mapping->private_data;
    544
    545	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
    546		return 0;
    547
    548	return fsync_buffers_list(&buffer_mapping->private_lock,
    549					&mapping->private_list);
    550}
    551EXPORT_SYMBOL(sync_mapping_buffers);
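
/*
 * Illustrative sketch, not part of the original file: how a filesystem ties
 * the association helpers together.  A dependent metadata buffer (for example
 * an ext2 indirect block) is queued on the data inode's ->private_list when
 * it is dirtied, and fsync() then writes out and waits on that list.  The
 * example_* name is hypothetical.
 */
static int example_fsync_dependent_metadata(struct inode *inode,
					    struct buffer_head *metadata_bh)
{
	/* At modification time: queue on inode->i_mapping->private_list. */
	mark_buffer_dirty_inode(metadata_bh, inode);

	/* At fsync() time: write out and wait upon everything queued. */
	return sync_mapping_buffers(inode->i_mapping);
}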
    552
    553/*
    554 * Called when we've recently written block `bblock', and it is known that
    555 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
    556 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
    557 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
    558 */
    559void write_boundary_block(struct block_device *bdev,
    560			sector_t bblock, unsigned blocksize)
    561{
    562	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
    563	if (bh) {
    564		if (buffer_dirty(bh))
    565			ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
    566		put_bh(bh);
    567	}
    568}
    569
    570void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
    571{
    572	struct address_space *mapping = inode->i_mapping;
    573	struct address_space *buffer_mapping = bh->b_page->mapping;
    574
    575	mark_buffer_dirty(bh);
    576	if (!mapping->private_data) {
    577		mapping->private_data = buffer_mapping;
    578	} else {
    579		BUG_ON(mapping->private_data != buffer_mapping);
    580	}
    581	if (!bh->b_assoc_map) {
    582		spin_lock(&buffer_mapping->private_lock);
    583		list_move_tail(&bh->b_assoc_buffers,
    584				&mapping->private_list);
    585		bh->b_assoc_map = mapping;
    586		spin_unlock(&buffer_mapping->private_lock);
    587	}
    588}
    589EXPORT_SYMBOL(mark_buffer_dirty_inode);
    590
    591/*
    592 * Add a page to the dirty page list.
    593 *
    594 * It is a sad fact of life that this function is called from several places
    595 * deeply under spinlocking.  It may not sleep.
    596 *
    597 * If the page has buffers, the uptodate buffers are set dirty, to preserve
    598 * dirty-state coherency between the page and the buffers.  If the page does
    599 * not have buffers then when they are later attached they will all be set
    600 * dirty.
    601 *
    602 * The buffers are dirtied before the page is dirtied.  There's a small race
    603 * window in which a writepage caller may see the page cleanness but not the
    604 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
    605 * before the buffers, a concurrent writepage caller could clear the page dirty
    606 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
    607 * page on the dirty page list.
    608 *
    609 * We use private_lock to lock against try_to_free_buffers while using the
    610 * page's buffer list.  Also use this to protect against clean buffers being
    611 * added to the page after it was set dirty.
    612 *
    613 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
    614 * address_space though.
    615 */
    616bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
    617{
    618	struct buffer_head *head;
    619	bool newly_dirty;
    620
    621	spin_lock(&mapping->private_lock);
    622	head = folio_buffers(folio);
    623	if (head) {
    624		struct buffer_head *bh = head;
    625
    626		do {
    627			set_buffer_dirty(bh);
    628			bh = bh->b_this_page;
    629		} while (bh != head);
    630	}
    631	/*
    632	 * Lock out page's memcg migration to keep PageDirty
    633	 * synchronized with per-memcg dirty page counters.
    634	 */
    635	folio_memcg_lock(folio);
    636	newly_dirty = !folio_test_set_dirty(folio);
    637	spin_unlock(&mapping->private_lock);
    638
    639	if (newly_dirty)
    640		__folio_mark_dirty(folio, mapping, 1);
    641
    642	folio_memcg_unlock(folio);
    643
    644	if (newly_dirty)
    645		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
    646
    647	return newly_dirty;
    648}
    649EXPORT_SYMBOL(block_dirty_folio);
    650
    651/*
    652 * Write out and wait upon a list of buffers.
    653 *
    654 * We have conflicting pressures: we want to make sure that all
    655 * initially dirty buffers get waited on, but that any subsequently
    656 * dirtied buffers don't.  After all, we don't want fsync to last
    657 * forever if somebody is actively writing to the file.
    658 *
    659 * Do this in two main stages: first we copy dirty buffers to a
    660 * temporary inode list, queueing the writes as we go.  Then we clean
    661 * up, waiting for those writes to complete.
    662 * 
    663 * During this second stage, any subsequent updates to the file may end
    664 * up refiling the buffer on the original inode's dirty list again, so
    665 * there is a chance we will end up with a buffer queued for write but
    666 * not yet completed on that list.  So, as a final cleanup we go through
    667 * the osync code to catch these locked, dirty buffers without requeuing
    668 * any newly dirty buffers for write.
    669 */
    670static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
    671{
    672	struct buffer_head *bh;
    673	struct list_head tmp;
    674	struct address_space *mapping;
    675	int err = 0, err2;
    676	struct blk_plug plug;
    677
    678	INIT_LIST_HEAD(&tmp);
    679	blk_start_plug(&plug);
    680
    681	spin_lock(lock);
    682	while (!list_empty(list)) {
    683		bh = BH_ENTRY(list->next);
    684		mapping = bh->b_assoc_map;
    685		__remove_assoc_queue(bh);
    686		/* Avoid race with mark_buffer_dirty_inode() which does
    687		 * a lockless check and we rely on seeing the dirty bit */
    688		smp_mb();
    689		if (buffer_dirty(bh) || buffer_locked(bh)) {
    690			list_add(&bh->b_assoc_buffers, &tmp);
    691			bh->b_assoc_map = mapping;
    692			if (buffer_dirty(bh)) {
    693				get_bh(bh);
    694				spin_unlock(lock);
    695				/*
    696				 * Ensure any pending I/O completes so that
    697				 * write_dirty_buffer() actually writes the
    698				 * current contents - it is a noop if I/O is
    699				 * still in flight on potentially older
    700				 * contents.
    701				 */
    702				write_dirty_buffer(bh, REQ_SYNC);
    703
    704				/*
    705				 * Kick off IO for the previous mapping. Note
    706				 * that we will not run the very last mapping,
    707				 * wait_on_buffer() will do that for us
    708				 * through sync_buffer().
    709				 */
    710				brelse(bh);
    711				spin_lock(lock);
    712			}
    713		}
    714	}
    715
    716	spin_unlock(lock);
    717	blk_finish_plug(&plug);
    718	spin_lock(lock);
    719
    720	while (!list_empty(&tmp)) {
    721		bh = BH_ENTRY(tmp.prev);
    722		get_bh(bh);
    723		mapping = bh->b_assoc_map;
    724		__remove_assoc_queue(bh);
    725		/* Avoid race with mark_buffer_dirty_inode() which does
    726		 * a lockless check and we rely on seeing the dirty bit */
    727		smp_mb();
    728		if (buffer_dirty(bh)) {
    729			list_add(&bh->b_assoc_buffers,
    730				 &mapping->private_list);
    731			bh->b_assoc_map = mapping;
    732		}
    733		spin_unlock(lock);
    734		wait_on_buffer(bh);
    735		if (!buffer_uptodate(bh))
    736			err = -EIO;
    737		brelse(bh);
    738		spin_lock(lock);
    739	}
    740	
    741	spin_unlock(lock);
    742	err2 = osync_buffers_list(lock, list);
    743	if (err)
    744		return err;
    745	else
    746		return err2;
    747}
    748
    749/*
    750 * Invalidate any and all dirty buffers on a given inode.  We are
    751 * probably unmounting the fs, but that doesn't mean we have already
    752 * done a sync().  Just drop the buffers from the inode list.
    753 *
    754 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
    755 * assumes that all the buffers are against the blockdev.  Not true
    756 * for reiserfs.
    757 */
    758void invalidate_inode_buffers(struct inode *inode)
    759{
    760	if (inode_has_buffers(inode)) {
    761		struct address_space *mapping = &inode->i_data;
    762		struct list_head *list = &mapping->private_list;
    763		struct address_space *buffer_mapping = mapping->private_data;
    764
    765		spin_lock(&buffer_mapping->private_lock);
    766		while (!list_empty(list))
    767			__remove_assoc_queue(BH_ENTRY(list->next));
    768		spin_unlock(&buffer_mapping->private_lock);
    769	}
    770}
    771EXPORT_SYMBOL(invalidate_inode_buffers);
    772
    773/*
    774 * Remove any clean buffers from the inode's buffer list.  This is called
    775 * when we're trying to free the inode itself.  Those buffers can pin it.
    776 *
    777 * Returns true if all buffers were removed.
    778 */
    779int remove_inode_buffers(struct inode *inode)
    780{
    781	int ret = 1;
    782
    783	if (inode_has_buffers(inode)) {
    784		struct address_space *mapping = &inode->i_data;
    785		struct list_head *list = &mapping->private_list;
    786		struct address_space *buffer_mapping = mapping->private_data;
    787
    788		spin_lock(&buffer_mapping->private_lock);
    789		while (!list_empty(list)) {
    790			struct buffer_head *bh = BH_ENTRY(list->next);
    791			if (buffer_dirty(bh)) {
    792				ret = 0;
    793				break;
    794			}
    795			__remove_assoc_queue(bh);
    796		}
    797		spin_unlock(&buffer_mapping->private_lock);
    798	}
    799	return ret;
    800}
    801
    802/*
    803 * Create the appropriate buffers when given a page for data area and
    804 * the size of each buffer.. Use the bh->b_this_page linked list to
    805 * follow the buffers created.  Return NULL if unable to create more
    806 * buffers.
    807 *
    808 * The retry flag is used to differentiate async IO (paging, swapping),
    809 * which may not fail, from ordinary buffer allocations.
    810 */
    811struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
    812		bool retry)
    813{
    814	struct buffer_head *bh, *head;
    815	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
    816	long offset;
    817	struct mem_cgroup *memcg, *old_memcg;
    818
    819	if (retry)
    820		gfp |= __GFP_NOFAIL;
    821
    822	/* The page lock pins the memcg */
    823	memcg = page_memcg(page);
    824	old_memcg = set_active_memcg(memcg);
    825
    826	head = NULL;
    827	offset = PAGE_SIZE;
    828	while ((offset -= size) >= 0) {
    829		bh = alloc_buffer_head(gfp);
    830		if (!bh)
    831			goto no_grow;
    832
    833		bh->b_this_page = head;
    834		bh->b_blocknr = -1;
    835		head = bh;
    836
    837		bh->b_size = size;
    838
    839		/* Link the buffer to its page */
    840		set_bh_page(bh, page, offset);
    841	}
    842out:
    843	set_active_memcg(old_memcg);
    844	return head;
    845/*
    846 * In case anything failed, we just free everything we got.
    847 */
    848no_grow:
    849	if (head) {
    850		do {
    851			bh = head;
    852			head = head->b_this_page;
    853			free_buffer_head(bh);
    854		} while (head);
    855	}
    856
    857	goto out;
    858}
    859EXPORT_SYMBOL_GPL(alloc_page_buffers);
    860
    861static inline void
    862link_dev_buffers(struct page *page, struct buffer_head *head)
    863{
    864	struct buffer_head *bh, *tail;
    865
    866	bh = head;
    867	do {
    868		tail = bh;
    869		bh = bh->b_this_page;
    870	} while (bh);
    871	tail->b_this_page = head;
    872	attach_page_private(page, head);
    873}
    874
    875static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
    876{
    877	sector_t retval = ~((sector_t)0);
    878	loff_t sz = bdev_nr_bytes(bdev);
    879
    880	if (sz) {
    881		unsigned int sizebits = blksize_bits(size);
    882		retval = (sz >> sizebits);
    883	}
    884	return retval;
    885}
    886
    887/*
    888 * Initialise the state of a blockdev page's buffers.
    889 */ 
    890static sector_t
    891init_page_buffers(struct page *page, struct block_device *bdev,
    892			sector_t block, int size)
    893{
    894	struct buffer_head *head = page_buffers(page);
    895	struct buffer_head *bh = head;
    896	int uptodate = PageUptodate(page);
    897	sector_t end_block = blkdev_max_block(bdev, size);
    898
    899	do {
    900		if (!buffer_mapped(bh)) {
    901			bh->b_end_io = NULL;
    902			bh->b_private = NULL;
    903			bh->b_bdev = bdev;
    904			bh->b_blocknr = block;
    905			if (uptodate)
    906				set_buffer_uptodate(bh);
    907			if (block < end_block)
    908				set_buffer_mapped(bh);
    909		}
    910		block++;
    911		bh = bh->b_this_page;
    912	} while (bh != head);
    913
    914	/*
    915	 * Caller needs to validate requested block against end of device.
    916	 */
    917	return end_block;
    918}
    919
    920/*
    921 * Create the page-cache page that contains the requested block.
    922 *
    923 * This is used purely for blockdev mappings.
    924 */
    925static int
    926grow_dev_page(struct block_device *bdev, sector_t block,
    927	      pgoff_t index, int size, int sizebits, gfp_t gfp)
    928{
    929	struct inode *inode = bdev->bd_inode;
    930	struct page *page;
    931	struct buffer_head *bh;
    932	sector_t end_block;
    933	int ret = 0;
    934	gfp_t gfp_mask;
    935
    936	gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
    937
    938	/*
    939	 * XXX: __getblk_slow() can not really deal with failure and
    940	 * will endlessly loop on improvised global reclaim.  Prefer
    941	 * looping in the allocator rather than here, at least that
    942	 * code knows what it's doing.
    943	 */
    944	gfp_mask |= __GFP_NOFAIL;
    945
    946	page = find_or_create_page(inode->i_mapping, index, gfp_mask);
    947
    948	BUG_ON(!PageLocked(page));
    949
    950	if (page_has_buffers(page)) {
    951		bh = page_buffers(page);
    952		if (bh->b_size == size) {
    953			end_block = init_page_buffers(page, bdev,
    954						(sector_t)index << sizebits,
    955						size);
    956			goto done;
    957		}
    958		if (!try_to_free_buffers(page_folio(page)))
    959			goto failed;
    960	}
    961
    962	/*
    963	 * Allocate some buffers for this page
    964	 */
    965	bh = alloc_page_buffers(page, size, true);
    966
    967	/*
    968	 * Link the page to the buffers and initialise them.  Take the
    969	 * lock to be atomic wrt __find_get_block(), which does not
    970	 * run under the page lock.
    971	 */
    972	spin_lock(&inode->i_mapping->private_lock);
    973	link_dev_buffers(page, bh);
    974	end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
    975			size);
    976	spin_unlock(&inode->i_mapping->private_lock);
    977done:
    978	ret = (block < end_block) ? 1 : -ENXIO;
    979failed:
    980	unlock_page(page);
    981	put_page(page);
    982	return ret;
    983}
    984
    985/*
    986 * Create buffers for the specified block device block's page.  If
    987 * that page was dirty, the buffers are set dirty also.
    988 */
    989static int
    990grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
    991{
    992	pgoff_t index;
    993	int sizebits;
    994
    995	sizebits = PAGE_SHIFT - __ffs(size);
    996	index = block >> sizebits;
    997
    998	/*
    999	 * Check for a block which wants to lie outside our maximum possible
   1000	 * pagecache index.  (this comparison is done using sector_t types).
   1001	 */
   1002	if (unlikely(index != block >> sizebits)) {
   1003		printk(KERN_ERR "%s: requested out-of-range block %llu for "
   1004			"device %pg\n",
   1005			__func__, (unsigned long long)block,
   1006			bdev);
   1007		return -EIO;
   1008	}
   1009
   1010	/* Create a page with the proper size buffers.. */
   1011	return grow_dev_page(bdev, block, index, size, sizebits, gfp);
   1012}
   1013
   1014static struct buffer_head *
   1015__getblk_slow(struct block_device *bdev, sector_t block,
   1016	     unsigned size, gfp_t gfp)
   1017{
   1018	/* Size must be multiple of hard sectorsize */
   1019	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
   1020			(size < 512 || size > PAGE_SIZE))) {
   1021		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
   1022					size);
   1023		printk(KERN_ERR "logical block size: %d\n",
   1024					bdev_logical_block_size(bdev));
   1025
   1026		dump_stack();
   1027		return NULL;
   1028	}
   1029
   1030	for (;;) {
   1031		struct buffer_head *bh;
   1032		int ret;
   1033
   1034		bh = __find_get_block(bdev, block, size);
   1035		if (bh)
   1036			return bh;
   1037
   1038		ret = grow_buffers(bdev, block, size, gfp);
   1039		if (ret < 0)
   1040			return NULL;
   1041	}
   1042}
   1043
   1044/*
   1045 * The relationship between dirty buffers and dirty pages:
   1046 *
   1047 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
   1048 * the page is tagged dirty in the page cache.
   1049 *
   1050 * At all times, the dirtiness of the buffers represents the dirtiness of
   1051 * subsections of the page.  If the page has buffers, the page dirty bit is
   1052 * merely a hint about the true dirty state.
   1053 *
   1054 * When a page is set dirty in its entirety, all its buffers are marked dirty
   1055 * (if the page has buffers).
   1056 *
   1057 * When a buffer is marked dirty, its page is dirtied, but the page's other
   1058 * buffers are not.
   1059 *
   1060 * Also.  When blockdev buffers are explicitly read with bread(), they
   1061 * individually become uptodate.  But their backing page remains not
   1062 * uptodate - even if all of its buffers are uptodate.  A subsequent
   1063 * block_read_full_folio() against that folio will discover all the uptodate
   1064 * buffers, will set the folio uptodate and will perform no I/O.
   1065 */
   1066
   1067/**
   1068 * mark_buffer_dirty - mark a buffer_head as needing writeout
   1069 * @bh: the buffer_head to mark dirty
   1070 *
   1071 * mark_buffer_dirty() will set the dirty bit against the buffer, then set
   1072 * its backing page dirty, then tag the page as dirty in the page cache
   1073 * and then attach the address_space's inode to its superblock's dirty
   1074 * inode list.
   1075 *
   1076 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
   1077 * i_pages lock and mapping->host->i_lock.
   1078 */
   1079void mark_buffer_dirty(struct buffer_head *bh)
   1080{
   1081	WARN_ON_ONCE(!buffer_uptodate(bh));
   1082
   1083	trace_block_dirty_buffer(bh);
   1084
   1085	/*
   1086	 * Very *carefully* optimize the it-is-already-dirty case.
   1087	 *
   1088	 * Don't let the final "is it dirty" escape to before we
   1089	 * perhaps modified the buffer.
   1090	 */
   1091	if (buffer_dirty(bh)) {
   1092		smp_mb();
   1093		if (buffer_dirty(bh))
   1094			return;
   1095	}
   1096
   1097	if (!test_set_buffer_dirty(bh)) {
   1098		struct page *page = bh->b_page;
   1099		struct address_space *mapping = NULL;
   1100
   1101		lock_page_memcg(page);
   1102		if (!TestSetPageDirty(page)) {
   1103			mapping = page_mapping(page);
   1104			if (mapping)
   1105				__set_page_dirty(page, mapping, 0);
   1106		}
   1107		unlock_page_memcg(page);
   1108		if (mapping)
   1109			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
   1110	}
   1111}
   1112EXPORT_SYMBOL(mark_buffer_dirty);
   1113
   1114void mark_buffer_write_io_error(struct buffer_head *bh)
   1115{
   1116	struct super_block *sb;
   1117
   1118	set_buffer_write_io_error(bh);
   1119	/* FIXME: do we need to set this in both places? */
   1120	if (bh->b_page && bh->b_page->mapping)
   1121		mapping_set_error(bh->b_page->mapping, -EIO);
   1122	if (bh->b_assoc_map)
   1123		mapping_set_error(bh->b_assoc_map, -EIO);
   1124	rcu_read_lock();
   1125	sb = READ_ONCE(bh->b_bdev->bd_super);
   1126	if (sb)
   1127		errseq_set(&sb->s_wb_err, -EIO);
   1128	rcu_read_unlock();
   1129}
   1130EXPORT_SYMBOL(mark_buffer_write_io_error);
   1131
   1132/*
   1133 * Decrement a buffer_head's reference count.  If all buffers against a page
   1134 * have zero reference count, are clean and unlocked, and if the page is clean
   1135 * and unlocked then try_to_free_buffers() may strip the buffers from the page
   1136 * in preparation for freeing it (sometimes, rarely, buffers are removed from
   1137 * a page but it ends up not being freed, and buffers may later be reattached).
   1138 */
   1139void __brelse(struct buffer_head * buf)
   1140{
   1141	if (atomic_read(&buf->b_count)) {
   1142		put_bh(buf);
   1143		return;
   1144	}
   1145	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
   1146}
   1147EXPORT_SYMBOL(__brelse);
   1148
   1149/*
   1150 * bforget() is like brelse(), except it discards any
   1151 * potentially dirty data.
   1152 */
   1153void __bforget(struct buffer_head *bh)
   1154{
   1155	clear_buffer_dirty(bh);
   1156	if (bh->b_assoc_map) {
   1157		struct address_space *buffer_mapping = bh->b_page->mapping;
   1158
   1159		spin_lock(&buffer_mapping->private_lock);
   1160		list_del_init(&bh->b_assoc_buffers);
   1161		bh->b_assoc_map = NULL;
   1162		spin_unlock(&buffer_mapping->private_lock);
   1163	}
   1164	__brelse(bh);
   1165}
   1166EXPORT_SYMBOL(__bforget);
   1167
   1168static struct buffer_head *__bread_slow(struct buffer_head *bh)
   1169{
   1170	lock_buffer(bh);
   1171	if (buffer_uptodate(bh)) {
   1172		unlock_buffer(bh);
   1173		return bh;
   1174	} else {
   1175		get_bh(bh);
   1176		bh->b_end_io = end_buffer_read_sync;
   1177		submit_bh(REQ_OP_READ, 0, bh);
   1178		wait_on_buffer(bh);
   1179		if (buffer_uptodate(bh))
   1180			return bh;
   1181	}
   1182	brelse(bh);
   1183	return NULL;
   1184}
   1185
   1186/*
   1187 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
   1188 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
   1189 * refcount elevated by one when they're in an LRU.  A buffer can only appear
   1190 * once in a particular CPU's LRU.  A single buffer can be present in multiple
   1191 * CPU's LRUs at the same time.
   1192 *
   1193 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
   1194 * sb_find_get_block().
   1195 *
   1196 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
   1197 * a local interrupt disable for that.
   1198 */
   1199
   1200#define BH_LRU_SIZE	16
   1201
   1202struct bh_lru {
   1203	struct buffer_head *bhs[BH_LRU_SIZE];
   1204};
   1205
   1206static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
   1207
   1208#ifdef CONFIG_SMP
   1209#define bh_lru_lock()	local_irq_disable()
   1210#define bh_lru_unlock()	local_irq_enable()
   1211#else
   1212#define bh_lru_lock()	preempt_disable()
   1213#define bh_lru_unlock()	preempt_enable()
   1214#endif
   1215
   1216static inline void check_irqs_on(void)
   1217{
   1218#ifdef irqs_disabled
   1219	BUG_ON(irqs_disabled());
   1220#endif
   1221}
   1222
   1223/*
   1224 * Install a buffer_head into this cpu's LRU.  If not already in the LRU, it is
   1225 * inserted at the front, and the buffer_head at the back if any is evicted.
   1226 * Or, if already in the LRU it is moved to the front.
   1227 */
   1228static void bh_lru_install(struct buffer_head *bh)
   1229{
   1230	struct buffer_head *evictee = bh;
   1231	struct bh_lru *b;
   1232	int i;
   1233
   1234	check_irqs_on();
   1235	bh_lru_lock();
   1236
   1237	/*
   1238	 * the refcount of a buffer_head in the bh_lru prevents dropping the
   1239	 * attached page (i.e., try_to_free_buffers() fails), which can make
   1240	 * page migration fail.
   1241	 * Skip putting the upcoming bh into the bh_lru until migration is done.
   1242	 */
   1243	if (lru_cache_disabled()) {
   1244		bh_lru_unlock();
   1245		return;
   1246	}
   1247
   1248	b = this_cpu_ptr(&bh_lrus);
   1249	for (i = 0; i < BH_LRU_SIZE; i++) {
   1250		swap(evictee, b->bhs[i]);
   1251		if (evictee == bh) {
   1252			bh_lru_unlock();
   1253			return;
   1254		}
   1255	}
   1256
   1257	get_bh(bh);
   1258	bh_lru_unlock();
   1259	brelse(evictee);
   1260}
   1261
   1262/*
   1263 * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
   1264 */
   1265static struct buffer_head *
   1266lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
   1267{
   1268	struct buffer_head *ret = NULL;
   1269	unsigned int i;
   1270
   1271	check_irqs_on();
   1272	bh_lru_lock();
   1273	for (i = 0; i < BH_LRU_SIZE; i++) {
   1274		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
   1275
   1276		if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
   1277		    bh->b_size == size) {
   1278			if (i) {
   1279				while (i) {
   1280					__this_cpu_write(bh_lrus.bhs[i],
   1281						__this_cpu_read(bh_lrus.bhs[i - 1]));
   1282					i--;
   1283				}
   1284				__this_cpu_write(bh_lrus.bhs[0], bh);
   1285			}
   1286			get_bh(bh);
   1287			ret = bh;
   1288			break;
   1289		}
   1290	}
   1291	bh_lru_unlock();
   1292	return ret;
   1293}
   1294
   1295/*
   1296 * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
   1297 * it in the LRU and mark it as accessed.  If it is not present then return
   1298 * NULL
   1299 */
   1300struct buffer_head *
   1301__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
   1302{
   1303	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
   1304
   1305	if (bh == NULL) {
   1306		/* __find_get_block_slow will mark the page accessed */
   1307		bh = __find_get_block_slow(bdev, block);
   1308		if (bh)
   1309			bh_lru_install(bh);
   1310	} else
   1311		touch_buffer(bh);
   1312
   1313	return bh;
   1314}
   1315EXPORT_SYMBOL(__find_get_block);
   1316
   1317/*
   1318 * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
   1319 * which corresponds to the passed block_device, block and size. The
   1320 * returned buffer has its reference count incremented.
   1321 *
   1322 * __getblk_gfp() will lock up the machine if grow_dev_page's
   1323 * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
   1324 */
   1325struct buffer_head *
   1326__getblk_gfp(struct block_device *bdev, sector_t block,
   1327	     unsigned size, gfp_t gfp)
   1328{
   1329	struct buffer_head *bh = __find_get_block(bdev, block, size);
   1330
   1331	might_sleep();
   1332	if (bh == NULL)
   1333		bh = __getblk_slow(bdev, block, size, gfp);
   1334	return bh;
   1335}
   1336EXPORT_SYMBOL(__getblk_gfp);
   1337
   1338/*
   1339 * Do async read-ahead on a buffer..
   1340 */
   1341void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
   1342{
   1343	struct buffer_head *bh = __getblk(bdev, block, size);
   1344	if (likely(bh)) {
   1345		ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
   1346		brelse(bh);
   1347	}
   1348}
   1349EXPORT_SYMBOL(__breadahead);
   1350
   1351void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
   1352		      gfp_t gfp)
   1353{
   1354	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
   1355	if (likely(bh)) {
   1356		ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
   1357		brelse(bh);
   1358	}
   1359}
   1360EXPORT_SYMBOL(__breadahead_gfp);
   1361
   1362/**
   1363 *  __bread_gfp() - reads a specified block and returns the bh
   1364 *  @bdev: the block_device to read from
   1365 *  @block: number of block
   1366 *  @size: size (in bytes) to read
   1367 *  @gfp: page allocation flag
   1368 *
   1369 *  Reads a specified block and returns the buffer_head that contains it.
   1370 *  If @gfp is zero, the page cache page is allocated from the non-movable
   1371 *  area so that it does not interfere with page migration.
   1372 *  Returns NULL if the block was unreadable.
   1373 */
   1374struct buffer_head *
   1375__bread_gfp(struct block_device *bdev, sector_t block,
   1376		   unsigned size, gfp_t gfp)
   1377{
   1378	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
   1379
   1380	if (likely(bh) && !buffer_uptodate(bh))
   1381		bh = __bread_slow(bh);
   1382	return bh;
   1383}
   1384EXPORT_SYMBOL(__bread_gfp);
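
/*
 * Illustrative sketch, not part of the original file: the classic
 * read-modify-mark-dirty cycle using the helpers above.  __GFP_MOVABLE
 * mirrors what the plain __bread() wrapper passes; the memset() assumes a
 * lowmem page so that bh->b_data is directly addressable.  The example_*
 * name is hypothetical.
 */
static int example_zero_block(struct block_device *bdev, sector_t block,
			      unsigned size)
{
	struct buffer_head *bh = __bread_gfp(bdev, block, size, __GFP_MOVABLE);

	if (!bh)
		return -EIO;			/* block was unreadable */
	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);	/* modify the block contents */
	mark_buffer_dirty(bh);			/* dirties buffer and its page */
	unlock_buffer(bh);
	brelse(bh);				/* drop the __bread_gfp() reference */
	return 0;
}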
   1385
   1386static void __invalidate_bh_lrus(struct bh_lru *b)
   1387{
   1388	int i;
   1389
   1390	for (i = 0; i < BH_LRU_SIZE; i++) {
   1391		brelse(b->bhs[i]);
   1392		b->bhs[i] = NULL;
   1393	}
   1394}
   1395/*
   1396 * invalidate_bh_lrus() is called rarely - but not only at unmount.
   1397 * This doesn't race because it runs in each cpu either in irq
   1398 * or with preempt disabled.
   1399 */
   1400static void invalidate_bh_lru(void *arg)
   1401{
   1402	struct bh_lru *b = &get_cpu_var(bh_lrus);
   1403
   1404	__invalidate_bh_lrus(b);
   1405	put_cpu_var(bh_lrus);
   1406}
   1407
   1408bool has_bh_in_lru(int cpu, void *dummy)
   1409{
   1410	struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
   1411	int i;
   1412	
   1413	for (i = 0; i < BH_LRU_SIZE; i++) {
   1414		if (b->bhs[i])
   1415			return true;
   1416	}
   1417
   1418	return false;
   1419}
   1420
   1421void invalidate_bh_lrus(void)
   1422{
   1423	on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
   1424}
   1425EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
   1426
   1427/*
   1428 * It's called from workqueue context so we need a bh_lru_lock to close
   1429 * the race with preemption/irq.
   1430 */
   1431void invalidate_bh_lrus_cpu(void)
   1432{
   1433	struct bh_lru *b;
   1434
   1435	bh_lru_lock();
   1436	b = this_cpu_ptr(&bh_lrus);
   1437	__invalidate_bh_lrus(b);
   1438	bh_lru_unlock();
   1439}
   1440
   1441void set_bh_page(struct buffer_head *bh,
   1442		struct page *page, unsigned long offset)
   1443{
   1444	bh->b_page = page;
   1445	BUG_ON(offset >= PAGE_SIZE);
   1446	if (PageHighMem(page))
   1447		/*
   1448		 * This catches illegal uses and preserves the offset:
   1449		 */
   1450		bh->b_data = (char *)(0 + offset);
   1451	else
   1452		bh->b_data = page_address(page) + offset;
   1453}
   1454EXPORT_SYMBOL(set_bh_page);
   1455
   1456/*
   1457 * Called when truncating a buffer on a page completely.
   1458 */
   1459
   1460/* Bits that are cleared during an invalidate */
   1461#define BUFFER_FLAGS_DISCARD \
   1462	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
   1463	 1 << BH_Delay | 1 << BH_Unwritten)
   1464
   1465static void discard_buffer(struct buffer_head * bh)
   1466{
   1467	unsigned long b_state, b_state_old;
   1468
   1469	lock_buffer(bh);
   1470	clear_buffer_dirty(bh);
   1471	bh->b_bdev = NULL;
   1472	b_state = bh->b_state;
   1473	for (;;) {
   1474		b_state_old = cmpxchg(&bh->b_state, b_state,
   1475				      (b_state & ~BUFFER_FLAGS_DISCARD));
   1476		if (b_state_old == b_state)
   1477			break;
   1478		b_state = b_state_old;
   1479	}
   1480	unlock_buffer(bh);
   1481}
   1482
   1483/**
   1484 * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
   1485 * @folio: The folio which is affected.
   1486 * @offset: start of the range to invalidate
   1487 * @length: length of the range to invalidate
   1488 *
   1489 * block_invalidate_folio() is called when all or part of the folio has been
   1490 * invalidated by a truncate operation.
   1491 *
   1492 * block_invalidate_folio() does not have to release all buffers, but it must
   1493 * ensure that no dirty buffer is left outside @offset and that no I/O
   1494 * is underway against any of the blocks which are outside the truncation
   1495 * point.  Because the caller is about to free (and possibly reuse) those
   1496 * blocks on-disk.
   1497 */
   1498void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
   1499{
   1500	struct buffer_head *head, *bh, *next;
   1501	size_t curr_off = 0;
   1502	size_t stop = length + offset;
   1503
   1504	BUG_ON(!folio_test_locked(folio));
   1505
   1506	/*
   1507	 * Check for overflow
   1508	 */
   1509	BUG_ON(stop > folio_size(folio) || stop < length);
   1510
   1511	head = folio_buffers(folio);
   1512	if (!head)
   1513		return;
   1514
   1515	bh = head;
   1516	do {
   1517		size_t next_off = curr_off + bh->b_size;
   1518		next = bh->b_this_page;
   1519
   1520		/*
   1521		 * Are we still fully in range ?
   1522		 */
   1523		if (next_off > stop)
   1524			goto out;
   1525
   1526		/*
   1527		 * is this block fully invalidated?
   1528		 */
   1529		if (offset <= curr_off)
   1530			discard_buffer(bh);
   1531		curr_off = next_off;
   1532		bh = next;
   1533	} while (bh != head);
   1534
   1535	/*
   1536	 * We release buffers only if the entire folio is being invalidated.
   1537	 * The get_block cached value has been unconditionally invalidated,
   1538	 * so real IO is not possible anymore.
   1539	 */
   1540	if (length == folio_size(folio))
   1541		filemap_release_folio(folio, 0);
   1542out:
   1543	return;
   1544}
   1545EXPORT_SYMBOL(block_invalidate_folio);
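
/*
 * Illustrative sketch, not part of the original file: buffer-backed
 * filesystems typically wire block_dirty_folio() and block_invalidate_folio()
 * straight into their address_space_operations, alongside the read and write
 * helpers further down in this file.  The example_aops name is hypothetical
 * and the table is intentionally incomplete.
 */
static const struct address_space_operations example_aops = {
	.dirty_folio		= block_dirty_folio,
	.invalidate_folio	= block_invalidate_folio,
};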
   1546
   1547
   1548/*
   1549 * We attach and possibly dirty the buffers atomically wrt
   1550 * block_dirty_folio() via private_lock.  try_to_free_buffers
   1551 * is already excluded via the page lock.
   1552 */
   1553void create_empty_buffers(struct page *page,
   1554			unsigned long blocksize, unsigned long b_state)
   1555{
   1556	struct buffer_head *bh, *head, *tail;
   1557
   1558	head = alloc_page_buffers(page, blocksize, true);
   1559	bh = head;
   1560	do {
   1561		bh->b_state |= b_state;
   1562		tail = bh;
   1563		bh = bh->b_this_page;
   1564	} while (bh);
   1565	tail->b_this_page = head;
   1566
   1567	spin_lock(&page->mapping->private_lock);
   1568	if (PageUptodate(page) || PageDirty(page)) {
   1569		bh = head;
   1570		do {
   1571			if (PageDirty(page))
   1572				set_buffer_dirty(bh);
   1573			if (PageUptodate(page))
   1574				set_buffer_uptodate(bh);
   1575			bh = bh->b_this_page;
   1576		} while (bh != head);
   1577	}
   1578	attach_page_private(page, head);
   1579	spin_unlock(&page->mapping->private_lock);
   1580}
   1581EXPORT_SYMBOL(create_empty_buffers);
   1582
   1583/**
   1584 * clean_bdev_aliases: clean a range of buffers in block device
   1585 * @bdev: Block device to clean buffers in
   1586 * @block: Start of a range of blocks to clean
   1587 * @len: Number of blocks to clean
   1588 *
   1589 * We are taking a range of blocks for data and we don't want writeback of any
   1590 * buffer-cache aliases from the moment this function returns until the
   1591 * moment when something explicitly marks the buffer dirty (hopefully that
   1592 * will not happen until we free that block ;-) We don't even need to mark
   1593 * it not-uptodate - nobody can expect anything from a newly allocated buffer
   1594 * anyway. We used to use unmap_buffer() for such invalidation, but that was
   1595 * wrong. We definitely don't want to mark the alias unmapped, for example - it
   1596 * would confuse anyone who might pick it with bread() afterwards...
   1597 *
   1598 * Also..  Note that bforget() doesn't lock the buffer.  So there can be
   1599 * writeout I/O going on against recently-freed buffers.  We don't wait on that
   1600 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
   1601 * need to.  That happens here.
   1602 */
   1603void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
   1604{
   1605	struct inode *bd_inode = bdev->bd_inode;
   1606	struct address_space *bd_mapping = bd_inode->i_mapping;
   1607	struct pagevec pvec;
   1608	pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
   1609	pgoff_t end;
   1610	int i, count;
   1611	struct buffer_head *bh;
   1612	struct buffer_head *head;
   1613
   1614	end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
   1615	pagevec_init(&pvec);
   1616	while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
   1617		count = pagevec_count(&pvec);
   1618		for (i = 0; i < count; i++) {
   1619			struct page *page = pvec.pages[i];
   1620
   1621			if (!page_has_buffers(page))
   1622				continue;
   1623			/*
   1624			 * We use page lock instead of bd_mapping->private_lock
   1625			 * to pin buffers here since we can afford to sleep and
   1626			 * it scales better than a global spinlock lock.
   1627			 */
   1628			lock_page(page);
   1629			/* Recheck when the page is locked which pins bhs */
   1630			if (!page_has_buffers(page))
   1631				goto unlock_page;
   1632			head = page_buffers(page);
   1633			bh = head;
   1634			do {
   1635				if (!buffer_mapped(bh) || (bh->b_blocknr < block))
   1636					goto next;
   1637				if (bh->b_blocknr >= block + len)
   1638					break;
   1639				clear_buffer_dirty(bh);
   1640				wait_on_buffer(bh);
   1641				clear_buffer_req(bh);
   1642next:
   1643				bh = bh->b_this_page;
   1644			} while (bh != head);
   1645unlock_page:
   1646			unlock_page(page);
   1647		}
   1648		pagevec_release(&pvec);
   1649		cond_resched();
   1650		/* End of range already reached? */
   1651		if (index > end || !index)
   1652			break;
   1653	}
   1654}
   1655EXPORT_SYMBOL(clean_bdev_aliases);
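
/*
 * Editor's note: the following is an illustrative sketch, not part of the
 * original file.  It shows roughly how a filesystem would discard stale
 * buffer-cache aliases after get_block() hands it a freshly allocated
 * block; this is essentially what the clean_bdev_bh_alias() helper used
 * elsewhere in this file does.  "example_discard_alias" is an invented name.
 */
static void __maybe_unused example_discard_alias(struct buffer_head *bh)
{
	/* bh was just mapped to a newly allocated on-disk block */
	if (buffer_new(bh))
		clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
}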
   1656
   1657/*
   1658 * Size is a power-of-two in the range 512..PAGE_SIZE,
   1659 * and the case we care about most is PAGE_SIZE.
   1660 *
   1661 * So this *could* possibly be written with those
   1662 * constraints in mind (relevant mostly if some
   1663 * architecture has a slow bit-scan instruction)
   1664 */
   1665static inline int block_size_bits(unsigned int blocksize)
   1666{
   1667	return ilog2(blocksize);
   1668}
   1669
   1670static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
   1671{
   1672	BUG_ON(!PageLocked(page));
   1673
   1674	if (!page_has_buffers(page))
   1675		create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
   1676				     b_state);
   1677	return page_buffers(page);
   1678}
   1679
   1680/*
   1681 * NOTE! All mapped/uptodate combinations are valid:
   1682 *
   1683 *	Mapped	Uptodate	Meaning
   1684 *
   1685 *	No	No		"unknown" - must do get_block()
   1686 *	No	Yes		"hole" - zero-filled
   1687 *	Yes	No		"allocated" - allocated on disk, not read in
   1688 *	Yes	Yes		"valid" - allocated and up-to-date in memory.
   1689 *
   1690 * "Dirty" is valid only with the last case (mapped+uptodate).
   1691 */
   1692
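/*
 * Editor's note: illustrative sketch, not part of the original file.  It
 * spells out the state table above in code; "example_classify_bh" is an
 * invented name.
 */
static __maybe_unused const char *example_classify_bh(struct buffer_head *bh)
{
	if (!buffer_mapped(bh))
		return buffer_uptodate(bh) ? "hole" : "unknown";
	return buffer_uptodate(bh) ? "valid" : "allocated";
}
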
   1693/*
   1694 * While block_write_full_page is writing back the dirty buffers under
   1695 * the page lock, whoever dirtied the buffers may decide to clean them
   1696 * again at any time.  We handle that by only looking at the buffer
   1697 * state inside lock_buffer().
   1698 *
   1699 * If block_write_full_page() is called for regular writeback
   1700 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
    1701 * locked buffer.  This can only happen if someone has written the buffer
   1702 * directly, with submit_bh().  At the address_space level PageWriteback
   1703 * prevents this contention from occurring.
   1704 *
   1705 * If block_write_full_page() is called with wbc->sync_mode ==
   1706 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
   1707 * causes the writes to be flagged as synchronous writes.
   1708 */
   1709int __block_write_full_page(struct inode *inode, struct page *page,
   1710			get_block_t *get_block, struct writeback_control *wbc,
   1711			bh_end_io_t *handler)
   1712{
   1713	int err;
   1714	sector_t block;
   1715	sector_t last_block;
   1716	struct buffer_head *bh, *head;
   1717	unsigned int blocksize, bbits;
   1718	int nr_underway = 0;
   1719	int write_flags = wbc_to_write_flags(wbc);
   1720
   1721	head = create_page_buffers(page, inode,
   1722					(1 << BH_Dirty)|(1 << BH_Uptodate));
   1723
   1724	/*
   1725	 * Be very careful.  We have no exclusion from block_dirty_folio
   1726	 * here, and the (potentially unmapped) buffers may become dirty at
   1727	 * any time.  If a buffer becomes dirty here after we've inspected it
   1728	 * then we just miss that fact, and the page stays dirty.
   1729	 *
   1730	 * Buffers outside i_size may be dirtied by block_dirty_folio;
   1731	 * handle that here by just cleaning them.
   1732	 */
   1733
   1734	bh = head;
   1735	blocksize = bh->b_size;
   1736	bbits = block_size_bits(blocksize);
   1737
   1738	block = (sector_t)page->index << (PAGE_SHIFT - bbits);
   1739	last_block = (i_size_read(inode) - 1) >> bbits;
   1740
   1741	/*
   1742	 * Get all the dirty buffers mapped to disk addresses and
   1743	 * handle any aliases from the underlying blockdev's mapping.
   1744	 */
   1745	do {
   1746		if (block > last_block) {
   1747			/*
   1748			 * mapped buffers outside i_size will occur, because
   1749			 * this page can be outside i_size when there is a
   1750			 * truncate in progress.
   1751			 */
   1752			/*
   1753			 * The buffer was zeroed by block_write_full_page()
   1754			 */
   1755			clear_buffer_dirty(bh);
   1756			set_buffer_uptodate(bh);
   1757		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
   1758			   buffer_dirty(bh)) {
   1759			WARN_ON(bh->b_size != blocksize);
   1760			err = get_block(inode, block, bh, 1);
   1761			if (err)
   1762				goto recover;
   1763			clear_buffer_delay(bh);
   1764			if (buffer_new(bh)) {
   1765				/* blockdev mappings never come here */
   1766				clear_buffer_new(bh);
   1767				clean_bdev_bh_alias(bh);
   1768			}
   1769		}
   1770		bh = bh->b_this_page;
   1771		block++;
   1772	} while (bh != head);
   1773
   1774	do {
   1775		if (!buffer_mapped(bh))
   1776			continue;
   1777		/*
   1778		 * If it's a fully non-blocking write attempt and we cannot
   1779		 * lock the buffer then redirty the page.  Note that this can
   1780		 * potentially cause a busy-wait loop from writeback threads
   1781		 * and kswapd activity, but those code paths have their own
   1782		 * higher-level throttling.
   1783		 */
   1784		if (wbc->sync_mode != WB_SYNC_NONE) {
   1785			lock_buffer(bh);
   1786		} else if (!trylock_buffer(bh)) {
   1787			redirty_page_for_writepage(wbc, page);
   1788			continue;
   1789		}
   1790		if (test_clear_buffer_dirty(bh)) {
   1791			mark_buffer_async_write_endio(bh, handler);
   1792		} else {
   1793			unlock_buffer(bh);
   1794		}
   1795	} while ((bh = bh->b_this_page) != head);
   1796
   1797	/*
   1798	 * The page and its buffers are protected by PageWriteback(), so we can
   1799	 * drop the bh refcounts early.
   1800	 */
   1801	BUG_ON(PageWriteback(page));
   1802	set_page_writeback(page);
   1803
   1804	do {
   1805		struct buffer_head *next = bh->b_this_page;
   1806		if (buffer_async_write(bh)) {
   1807			submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
   1808			nr_underway++;
   1809		}
   1810		bh = next;
   1811	} while (bh != head);
   1812	unlock_page(page);
   1813
   1814	err = 0;
   1815done:
   1816	if (nr_underway == 0) {
   1817		/*
   1818		 * The page was marked dirty, but the buffers were
   1819		 * clean.  Someone wrote them back by hand with
   1820		 * ll_rw_block/submit_bh.  A rare case.
   1821		 */
   1822		end_page_writeback(page);
   1823
   1824		/*
   1825		 * The page and buffer_heads can be released at any time from
   1826		 * here on.
   1827		 */
   1828	}
   1829	return err;
   1830
   1831recover:
   1832	/*
   1833	 * ENOSPC, or some other error.  We may already have added some
   1834	 * blocks to the file, so we need to write these out to avoid
   1835	 * exposing stale data.
   1836	 * The page is currently locked and not marked for writeback
   1837	 */
   1838	bh = head;
   1839	/* Recovery: lock and submit the mapped buffers */
   1840	do {
   1841		if (buffer_mapped(bh) && buffer_dirty(bh) &&
   1842		    !buffer_delay(bh)) {
   1843			lock_buffer(bh);
   1844			mark_buffer_async_write_endio(bh, handler);
   1845		} else {
   1846			/*
   1847			 * The buffer may have been set dirty during
   1848			 * attachment to a dirty page.
   1849			 */
   1850			clear_buffer_dirty(bh);
   1851		}
   1852	} while ((bh = bh->b_this_page) != head);
   1853	SetPageError(page);
   1854	BUG_ON(PageWriteback(page));
   1855	mapping_set_error(page->mapping, err);
   1856	set_page_writeback(page);
   1857	do {
   1858		struct buffer_head *next = bh->b_this_page;
   1859		if (buffer_async_write(bh)) {
   1860			clear_buffer_dirty(bh);
   1861			submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
   1862			nr_underway++;
   1863		}
   1864		bh = next;
   1865	} while (bh != head);
   1866	unlock_page(page);
   1867	goto done;
   1868}
   1869EXPORT_SYMBOL(__block_write_full_page);
   1870
   1871/*
   1872 * If a page has any new buffers, zero them out here, and mark them uptodate
   1873 * and dirty so they'll be written out (in order to prevent uninitialised
   1874 * block data from leaking). And clear the new bit.
   1875 */
   1876void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
   1877{
   1878	unsigned int block_start, block_end;
   1879	struct buffer_head *head, *bh;
   1880
   1881	BUG_ON(!PageLocked(page));
   1882	if (!page_has_buffers(page))
   1883		return;
   1884
   1885	bh = head = page_buffers(page);
   1886	block_start = 0;
   1887	do {
   1888		block_end = block_start + bh->b_size;
   1889
   1890		if (buffer_new(bh)) {
   1891			if (block_end > from && block_start < to) {
   1892				if (!PageUptodate(page)) {
   1893					unsigned start, size;
   1894
   1895					start = max(from, block_start);
   1896					size = min(to, block_end) - start;
   1897
   1898					zero_user(page, start, size);
   1899					set_buffer_uptodate(bh);
   1900				}
   1901
   1902				clear_buffer_new(bh);
   1903				mark_buffer_dirty(bh);
   1904			}
   1905		}
   1906
   1907		block_start = block_end;
   1908		bh = bh->b_this_page;
   1909	} while (bh != head);
   1910}
   1911EXPORT_SYMBOL(page_zero_new_buffers);
   1912
   1913static void
   1914iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
   1915		const struct iomap *iomap)
   1916{
   1917	loff_t offset = block << inode->i_blkbits;
   1918
   1919	bh->b_bdev = iomap->bdev;
   1920
   1921	/*
   1922	 * Block points to offset in file we need to map, iomap contains
   1923	 * the offset at which the map starts. If the map ends before the
   1924	 * current block, then do not map the buffer and let the caller
   1925	 * handle it.
   1926	 */
   1927	BUG_ON(offset >= iomap->offset + iomap->length);
   1928
   1929	switch (iomap->type) {
   1930	case IOMAP_HOLE:
   1931		/*
   1932		 * If the buffer is not up to date or beyond the current EOF,
   1933		 * we need to mark it as new to ensure sub-block zeroing is
   1934		 * executed if necessary.
   1935		 */
   1936		if (!buffer_uptodate(bh) ||
   1937		    (offset >= i_size_read(inode)))
   1938			set_buffer_new(bh);
   1939		break;
   1940	case IOMAP_DELALLOC:
   1941		if (!buffer_uptodate(bh) ||
   1942		    (offset >= i_size_read(inode)))
   1943			set_buffer_new(bh);
   1944		set_buffer_uptodate(bh);
   1945		set_buffer_mapped(bh);
   1946		set_buffer_delay(bh);
   1947		break;
   1948	case IOMAP_UNWRITTEN:
   1949		/*
   1950		 * For unwritten regions, we always need to ensure that regions
   1951		 * in the block we are not writing to are zeroed. Mark the
   1952		 * buffer as new to ensure this.
   1953		 */
   1954		set_buffer_new(bh);
   1955		set_buffer_unwritten(bh);
   1956		fallthrough;
   1957	case IOMAP_MAPPED:
   1958		if ((iomap->flags & IOMAP_F_NEW) ||
   1959		    offset >= i_size_read(inode))
   1960			set_buffer_new(bh);
   1961		bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
   1962				inode->i_blkbits;
   1963		set_buffer_mapped(bh);
   1964		break;
   1965	}
   1966}
   1967
   1968int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
   1969		get_block_t *get_block, const struct iomap *iomap)
   1970{
   1971	unsigned from = pos & (PAGE_SIZE - 1);
   1972	unsigned to = from + len;
   1973	struct inode *inode = folio->mapping->host;
   1974	unsigned block_start, block_end;
   1975	sector_t block;
   1976	int err = 0;
   1977	unsigned blocksize, bbits;
    1978	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
   1979
   1980	BUG_ON(!folio_test_locked(folio));
   1981	BUG_ON(from > PAGE_SIZE);
   1982	BUG_ON(to > PAGE_SIZE);
   1983	BUG_ON(from > to);
   1984
   1985	head = create_page_buffers(&folio->page, inode, 0);
   1986	blocksize = head->b_size;
   1987	bbits = block_size_bits(blocksize);
   1988
   1989	block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
   1990
    1991	for (bh = head, block_start = 0; bh != head || !block_start;
    1992	    block++, block_start = block_end, bh = bh->b_this_page) {
   1993		block_end = block_start + blocksize;
   1994		if (block_end <= from || block_start >= to) {
   1995			if (folio_test_uptodate(folio)) {
   1996				if (!buffer_uptodate(bh))
   1997					set_buffer_uptodate(bh);
   1998			}
   1999			continue;
   2000		}
   2001		if (buffer_new(bh))
   2002			clear_buffer_new(bh);
   2003		if (!buffer_mapped(bh)) {
   2004			WARN_ON(bh->b_size != blocksize);
   2005			if (get_block) {
   2006				err = get_block(inode, block, bh, 1);
   2007				if (err)
   2008					break;
   2009			} else {
   2010				iomap_to_bh(inode, block, bh, iomap);
   2011			}
   2012
   2013			if (buffer_new(bh)) {
   2014				clean_bdev_bh_alias(bh);
   2015				if (folio_test_uptodate(folio)) {
   2016					clear_buffer_new(bh);
   2017					set_buffer_uptodate(bh);
   2018					mark_buffer_dirty(bh);
   2019					continue;
   2020				}
   2021				if (block_end > to || block_start < from)
   2022					folio_zero_segments(folio,
   2023						to, block_end,
   2024						block_start, from);
   2025				continue;
   2026			}
   2027		}
   2028		if (folio_test_uptodate(folio)) {
   2029			if (!buffer_uptodate(bh))
   2030				set_buffer_uptodate(bh);
    2031			continue;
   2032		}
   2033		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
   2034		    !buffer_unwritten(bh) &&
   2035		     (block_start < from || block_end > to)) {
   2036			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
    2037			*wait_bh++ = bh;
   2038		}
   2039	}
   2040	/*
   2041	 * If we issued read requests - let them complete.
   2042	 */
    2043	while (wait_bh > wait) {
   2044		wait_on_buffer(*--wait_bh);
   2045		if (!buffer_uptodate(*wait_bh))
   2046			err = -EIO;
   2047	}
   2048	if (unlikely(err))
   2049		page_zero_new_buffers(&folio->page, from, to);
   2050	return err;
   2051}
   2052
   2053int __block_write_begin(struct page *page, loff_t pos, unsigned len,
   2054		get_block_t *get_block)
   2055{
   2056	return __block_write_begin_int(page_folio(page), pos, len, get_block,
   2057				       NULL);
   2058}
   2059EXPORT_SYMBOL(__block_write_begin);
   2060
   2061static int __block_commit_write(struct inode *inode, struct page *page,
   2062		unsigned from, unsigned to)
   2063{
   2064	unsigned block_start, block_end;
   2065	int partial = 0;
   2066	unsigned blocksize;
   2067	struct buffer_head *bh, *head;
   2068
   2069	bh = head = page_buffers(page);
   2070	blocksize = bh->b_size;
   2071
   2072	block_start = 0;
   2073	do {
   2074		block_end = block_start + blocksize;
   2075		if (block_end <= from || block_start >= to) {
   2076			if (!buffer_uptodate(bh))
   2077				partial = 1;
   2078		} else {
   2079			set_buffer_uptodate(bh);
   2080			mark_buffer_dirty(bh);
   2081		}
   2082		if (buffer_new(bh))
   2083			clear_buffer_new(bh);
   2084
   2085		block_start = block_end;
   2086		bh = bh->b_this_page;
   2087	} while (bh != head);
   2088
   2089	/*
   2090	 * If this is a partial write which happened to make all buffers
   2091	 * uptodate then we can optimize away a bogus read_folio() for
   2092	 * the next read(). Here we 'discover' whether the page went
   2093	 * uptodate as a result of this (potentially partial) write.
   2094	 */
   2095	if (!partial)
   2096		SetPageUptodate(page);
   2097	return 0;
   2098}
   2099
   2100/*
   2101 * block_write_begin takes care of the basic task of block allocation and
   2102 * bringing partial write blocks uptodate first.
   2103 *
   2104 * The filesystem needs to handle block truncation upon failure.
   2105 */
   2106int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
   2107		struct page **pagep, get_block_t *get_block)
   2108{
   2109	pgoff_t index = pos >> PAGE_SHIFT;
   2110	struct page *page;
   2111	int status;
   2112
   2113	page = grab_cache_page_write_begin(mapping, index);
   2114	if (!page)
   2115		return -ENOMEM;
   2116
   2117	status = __block_write_begin(page, pos, len, get_block);
   2118	if (unlikely(status)) {
   2119		unlock_page(page);
   2120		put_page(page);
   2121		page = NULL;
   2122	}
   2123
   2124	*pagep = page;
   2125	return status;
   2126}
   2127EXPORT_SYMBOL(block_write_begin);
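
/*
 * Editor's note: illustrative sketch, not part of the original file.  A
 * minimal ->write_begin implementation for a hypothetical "examplefs" that
 * simply forwards to block_write_begin() with its block-mapping callback.
 * Both function names are invented; the 1:1 block mapping is purely for
 * illustration.
 */
static int examplefs_get_block(struct inode *inode, sector_t iblock,
			       struct buffer_head *bh_result, int create)
{
	/* Map file block iblock straight onto the same device block. */
	map_bh(bh_result, inode->i_sb, iblock);
	return 0;
}

static int __maybe_unused examplefs_write_begin(struct file *file,
		struct address_space *mapping, loff_t pos, unsigned len,
		struct page **pagep, void **fsdata)
{
	return block_write_begin(mapping, pos, len, pagep,
				 examplefs_get_block);
}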
   2128
   2129int block_write_end(struct file *file, struct address_space *mapping,
   2130			loff_t pos, unsigned len, unsigned copied,
   2131			struct page *page, void *fsdata)
   2132{
   2133	struct inode *inode = mapping->host;
   2134	unsigned start;
   2135
   2136	start = pos & (PAGE_SIZE - 1);
   2137
   2138	if (unlikely(copied < len)) {
   2139		/*
   2140		 * The buffers that were written will now be uptodate, so
   2141		 * we don't have to worry about a read_folio reading them
   2142		 * and overwriting a partial write. However if we have
   2143		 * encountered a short write and only partially written
   2144		 * into a buffer, it will not be marked uptodate, so a
   2145		 * read_folio might come in and destroy our partial write.
   2146		 *
   2147		 * Do the simplest thing, and just treat any short write to a
   2148		 * non uptodate page as a zero-length write, and force the
   2149		 * caller to redo the whole thing.
   2150		 */
   2151		if (!PageUptodate(page))
   2152			copied = 0;
   2153
   2154		page_zero_new_buffers(page, start+copied, start+len);
   2155	}
   2156	flush_dcache_page(page);
   2157
   2158	/* This could be a short (even 0-length) commit */
   2159	__block_commit_write(inode, page, start, start+copied);
   2160
   2161	return copied;
   2162}
   2163EXPORT_SYMBOL(block_write_end);
   2164
   2165int generic_write_end(struct file *file, struct address_space *mapping,
   2166			loff_t pos, unsigned len, unsigned copied,
   2167			struct page *page, void *fsdata)
   2168{
   2169	struct inode *inode = mapping->host;
   2170	loff_t old_size = inode->i_size;
   2171	bool i_size_changed = false;
   2172
   2173	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
   2174
   2175	/*
   2176	 * No need to use i_size_read() here, the i_size cannot change under us
   2177	 * because we hold i_rwsem.
   2178	 *
   2179	 * But it's important to update i_size while still holding page lock:
   2180	 * page writeout could otherwise come in and zero beyond i_size.
   2181	 */
   2182	if (pos + copied > inode->i_size) {
   2183		i_size_write(inode, pos + copied);
   2184		i_size_changed = true;
   2185	}
   2186
   2187	unlock_page(page);
   2188	put_page(page);
   2189
   2190	if (old_size < pos)
   2191		pagecache_isize_extended(inode, old_size, pos);
   2192	/*
   2193	 * Don't mark the inode dirty under page lock. First, it unnecessarily
   2194	 * makes the holding time of page lock longer. Second, it forces lock
   2195	 * ordering of page lock and transaction start for journaling
   2196	 * filesystems.
   2197	 */
   2198	if (i_size_changed)
   2199		mark_inode_dirty(inode);
   2200	return copied;
   2201}
   2202EXPORT_SYMBOL(generic_write_end);
   2203
   2204/*
   2205 * block_is_partially_uptodate checks whether buffers within a folio are
   2206 * uptodate or not.
   2207 *
   2208 * Returns true if all buffers which correspond to the specified part
   2209 * of the folio are uptodate.
   2210 */
   2211bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
   2212{
   2213	unsigned block_start, block_end, blocksize;
   2214	unsigned to;
   2215	struct buffer_head *bh, *head;
   2216	bool ret = true;
   2217
   2218	head = folio_buffers(folio);
   2219	if (!head)
   2220		return false;
   2221	blocksize = head->b_size;
   2222	to = min_t(unsigned, folio_size(folio) - from, count);
   2223	to = from + to;
   2224	if (from < blocksize && to > folio_size(folio) - blocksize)
   2225		return false;
   2226
   2227	bh = head;
   2228	block_start = 0;
   2229	do {
   2230		block_end = block_start + blocksize;
   2231		if (block_end > from && block_start < to) {
   2232			if (!buffer_uptodate(bh)) {
   2233				ret = false;
   2234				break;
   2235			}
   2236			if (block_end >= to)
   2237				break;
   2238		}
   2239		block_start = block_end;
   2240		bh = bh->b_this_page;
   2241	} while (bh != head);
   2242
   2243	return ret;
   2244}
   2245EXPORT_SYMBOL(block_is_partially_uptodate);
   2246
   2247/*
   2248 * Generic "read_folio" function for block devices that have the normal
   2249 * get_block functionality. This is most of the block device filesystems.
   2250 * Reads the folio asynchronously --- the unlock_buffer() and
   2251 * set/clear_buffer_uptodate() functions propagate buffer state into the
   2252 * folio once IO has completed.
   2253 */
   2254int block_read_full_folio(struct folio *folio, get_block_t *get_block)
   2255{
   2256	struct inode *inode = folio->mapping->host;
   2257	sector_t iblock, lblock;
   2258	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
   2259	unsigned int blocksize, bbits;
   2260	int nr, i;
   2261	int fully_mapped = 1;
   2262
   2263	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
   2264
   2265	head = create_page_buffers(&folio->page, inode, 0);
   2266	blocksize = head->b_size;
   2267	bbits = block_size_bits(blocksize);
   2268
   2269	iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
   2270	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
   2271	bh = head;
   2272	nr = 0;
   2273	i = 0;
   2274
   2275	do {
   2276		if (buffer_uptodate(bh))
   2277			continue;
   2278
   2279		if (!buffer_mapped(bh)) {
   2280			int err = 0;
   2281
   2282			fully_mapped = 0;
   2283			if (iblock < lblock) {
   2284				WARN_ON(bh->b_size != blocksize);
   2285				err = get_block(inode, iblock, bh, 0);
   2286				if (err)
   2287					folio_set_error(folio);
   2288			}
   2289			if (!buffer_mapped(bh)) {
   2290				folio_zero_range(folio, i * blocksize,
   2291						blocksize);
   2292				if (!err)
   2293					set_buffer_uptodate(bh);
   2294				continue;
   2295			}
   2296			/*
   2297			 * get_block() might have updated the buffer
   2298			 * synchronously
   2299			 */
   2300			if (buffer_uptodate(bh))
   2301				continue;
   2302		}
   2303		arr[nr++] = bh;
   2304	} while (i++, iblock++, (bh = bh->b_this_page) != head);
   2305
   2306	if (fully_mapped)
   2307		folio_set_mappedtodisk(folio);
   2308
   2309	if (!nr) {
   2310		/*
   2311		 * All buffers are uptodate - we can set the folio uptodate
   2312		 * as well. But not if get_block() returned an error.
   2313		 */
   2314		if (!folio_test_error(folio))
   2315			folio_mark_uptodate(folio);
   2316		folio_unlock(folio);
   2317		return 0;
   2318	}
   2319
   2320	/* Stage two: lock the buffers */
   2321	for (i = 0; i < nr; i++) {
   2322		bh = arr[i];
   2323		lock_buffer(bh);
   2324		mark_buffer_async_read(bh);
   2325	}
   2326
   2327	/*
   2328	 * Stage 3: start the IO.  Check for uptodateness
   2329	 * inside the buffer lock in case another process reading
   2330	 * the underlying blockdev brought it uptodate (the sct fix).
   2331	 */
   2332	for (i = 0; i < nr; i++) {
   2333		bh = arr[i];
   2334		if (buffer_uptodate(bh))
   2335			end_buffer_async_read(bh, 1);
   2336		else
   2337			submit_bh(REQ_OP_READ, 0, bh);
   2338	}
   2339	return 0;
   2340}
   2341EXPORT_SYMBOL(block_read_full_folio);
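
/*
 * Editor's note: illustrative sketch, not part of the original file.  A
 * ->read_folio method for the hypothetical examplefs sketched earlier just
 * hands the folio and its block-mapping callback to block_read_full_folio().
 */
static int __maybe_unused examplefs_read_folio(struct file *file,
					       struct folio *folio)
{
	return block_read_full_folio(folio, examplefs_get_block);
}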
   2342
    2343/* Utility function for filesystems that need to do work on expanding
    2344 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
    2345 * deal with the hole.
    2346 */
   2347int generic_cont_expand_simple(struct inode *inode, loff_t size)
   2348{
   2349	struct address_space *mapping = inode->i_mapping;
   2350	const struct address_space_operations *aops = mapping->a_ops;
   2351	struct page *page;
   2352	void *fsdata;
   2353	int err;
   2354
   2355	err = inode_newsize_ok(inode, size);
   2356	if (err)
   2357		goto out;
   2358
   2359	err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
   2360	if (err)
   2361		goto out;
   2362
   2363	err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
   2364	BUG_ON(err > 0);
   2365
   2366out:
   2367	return err;
   2368}
   2369EXPORT_SYMBOL(generic_cont_expand_simple);
   2370
   2371static int cont_expand_zero(struct file *file, struct address_space *mapping,
   2372			    loff_t pos, loff_t *bytes)
   2373{
   2374	struct inode *inode = mapping->host;
   2375	const struct address_space_operations *aops = mapping->a_ops;
   2376	unsigned int blocksize = i_blocksize(inode);
   2377	struct page *page;
   2378	void *fsdata;
   2379	pgoff_t index, curidx;
   2380	loff_t curpos;
   2381	unsigned zerofrom, offset, len;
   2382	int err = 0;
   2383
   2384	index = pos >> PAGE_SHIFT;
   2385	offset = pos & ~PAGE_MASK;
   2386
   2387	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
   2388		zerofrom = curpos & ~PAGE_MASK;
   2389		if (zerofrom & (blocksize-1)) {
   2390			*bytes |= (blocksize-1);
   2391			(*bytes)++;
   2392		}
   2393		len = PAGE_SIZE - zerofrom;
   2394
   2395		err = aops->write_begin(file, mapping, curpos, len,
   2396					    &page, &fsdata);
   2397		if (err)
   2398			goto out;
   2399		zero_user(page, zerofrom, len);
   2400		err = aops->write_end(file, mapping, curpos, len, len,
   2401						page, fsdata);
   2402		if (err < 0)
   2403			goto out;
   2404		BUG_ON(err != len);
   2405		err = 0;
   2406
   2407		balance_dirty_pages_ratelimited(mapping);
   2408
   2409		if (fatal_signal_pending(current)) {
   2410			err = -EINTR;
   2411			goto out;
   2412		}
   2413	}
   2414
   2415	/* page covers the boundary, find the boundary offset */
   2416	if (index == curidx) {
   2417		zerofrom = curpos & ~PAGE_MASK;
    2418		/* if we are expanding the file, the last block will be filled */
   2419		if (offset <= zerofrom) {
   2420			goto out;
   2421		}
   2422		if (zerofrom & (blocksize-1)) {
   2423			*bytes |= (blocksize-1);
   2424			(*bytes)++;
   2425		}
   2426		len = offset - zerofrom;
   2427
   2428		err = aops->write_begin(file, mapping, curpos, len,
   2429					    &page, &fsdata);
   2430		if (err)
   2431			goto out;
   2432		zero_user(page, zerofrom, len);
   2433		err = aops->write_end(file, mapping, curpos, len, len,
   2434						page, fsdata);
   2435		if (err < 0)
   2436			goto out;
   2437		BUG_ON(err != len);
   2438		err = 0;
   2439	}
   2440out:
   2441	return err;
   2442}
   2443
   2444/*
    2445 * For moronic filesystems that do not allow holes in files.
   2446 * We may have to extend the file.
   2447 */
   2448int cont_write_begin(struct file *file, struct address_space *mapping,
   2449			loff_t pos, unsigned len,
   2450			struct page **pagep, void **fsdata,
   2451			get_block_t *get_block, loff_t *bytes)
   2452{
   2453	struct inode *inode = mapping->host;
   2454	unsigned int blocksize = i_blocksize(inode);
   2455	unsigned int zerofrom;
   2456	int err;
   2457
   2458	err = cont_expand_zero(file, mapping, pos, bytes);
   2459	if (err)
   2460		return err;
   2461
   2462	zerofrom = *bytes & ~PAGE_MASK;
   2463	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
   2464		*bytes |= (blocksize-1);
   2465		(*bytes)++;
   2466	}
   2467
   2468	return block_write_begin(mapping, pos, len, pagep, get_block);
   2469}
   2470EXPORT_SYMBOL(cont_write_begin);
   2471
   2472int block_commit_write(struct page *page, unsigned from, unsigned to)
   2473{
   2474	struct inode *inode = page->mapping->host;
    2475	__block_commit_write(inode, page, from, to);
   2476	return 0;
   2477}
   2478EXPORT_SYMBOL(block_commit_write);
   2479
   2480/*
   2481 * block_page_mkwrite() is not allowed to change the file size as it gets
   2482 * called from a page fault handler when a page is first dirtied. Hence we must
   2483 * be careful to check for EOF conditions here. We set the page up correctly
   2484 * for a written page which means we get ENOSPC checking when writing into
   2485 * holes and correct delalloc and unwritten extent mapping on filesystems that
   2486 * support these features.
   2487 *
    2488 * We are not allowed to take the i_rwsem here so we have to play games to
   2489 * protect against truncate races as the page could now be beyond EOF.  Because
   2490 * truncate writes the inode size before removing pages, once we have the
   2491 * page lock we can determine safely if the page is beyond EOF. If it is not
   2492 * beyond EOF, then the page is guaranteed safe against truncation until we
   2493 * unlock the page.
   2494 *
   2495 * Direct callers of this function should protect against filesystem freezing
   2496 * using sb_start_pagefault() - sb_end_pagefault() functions.
   2497 */
   2498int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
   2499			 get_block_t get_block)
   2500{
   2501	struct page *page = vmf->page;
   2502	struct inode *inode = file_inode(vma->vm_file);
   2503	unsigned long end;
   2504	loff_t size;
   2505	int ret;
   2506
   2507	lock_page(page);
   2508	size = i_size_read(inode);
   2509	if ((page->mapping != inode->i_mapping) ||
   2510	    (page_offset(page) > size)) {
   2511		/* We overload EFAULT to mean page got truncated */
   2512		ret = -EFAULT;
   2513		goto out_unlock;
   2514	}
   2515
   2516	/* page is wholly or partially inside EOF */
   2517	if (((page->index + 1) << PAGE_SHIFT) > size)
   2518		end = size & ~PAGE_MASK;
   2519	else
   2520		end = PAGE_SIZE;
   2521
   2522	ret = __block_write_begin(page, 0, end, get_block);
   2523	if (!ret)
   2524		ret = block_commit_write(page, 0, end);
   2525
   2526	if (unlikely(ret < 0))
   2527		goto out_unlock;
   2528	set_page_dirty(page);
   2529	wait_for_stable_page(page);
   2530	return 0;
   2531out_unlock:
   2532	unlock_page(page);
   2533	return ret;
   2534}
   2535EXPORT_SYMBOL(block_page_mkwrite);
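
/*
 * Editor's note: illustrative sketch, not part of the original file.  A
 * ->page_mkwrite handler for the hypothetical examplefs, following the
 * freeze-protection rule from the comment above block_page_mkwrite().
 */
static vm_fault_t __maybe_unused examplefs_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	int err;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);
	err = block_page_mkwrite(vmf->vma, vmf, examplefs_get_block);
	sb_end_pagefault(inode->i_sb);
	return block_page_mkwrite_return(err);
}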
   2536
   2537/*
   2538 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
   2539 * immediately, while under the page lock.  So it needs a special end_io
   2540 * handler which does not touch the bh after unlocking it.
   2541 */
   2542static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
   2543{
   2544	__end_buffer_read_notouch(bh, uptodate);
   2545}
   2546
   2547/*
   2548 * Attach the singly-linked list of buffers created by nobh_write_begin, to
    2549 * the page (converting it to a circular linked list and taking care of page
   2550 * dirty races).
   2551 */
   2552static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
   2553{
   2554	struct buffer_head *bh;
   2555
   2556	BUG_ON(!PageLocked(page));
   2557
   2558	spin_lock(&page->mapping->private_lock);
   2559	bh = head;
   2560	do {
   2561		if (PageDirty(page))
   2562			set_buffer_dirty(bh);
   2563		if (!bh->b_this_page)
   2564			bh->b_this_page = head;
   2565		bh = bh->b_this_page;
   2566	} while (bh != head);
   2567	attach_page_private(page, head);
   2568	spin_unlock(&page->mapping->private_lock);
   2569}
   2570
   2571/*
   2572 * On entry, the page is fully not uptodate.
    2573 * On exit, the page is fully uptodate in the areas outside (from,to).
   2574 * The filesystem needs to handle block truncation upon failure.
   2575 */
   2576int nobh_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
   2577			struct page **pagep, void **fsdata,
   2578			get_block_t *get_block)
   2579{
   2580	struct inode *inode = mapping->host;
   2581	const unsigned blkbits = inode->i_blkbits;
   2582	const unsigned blocksize = 1 << blkbits;
   2583	struct buffer_head *head, *bh;
   2584	struct page *page;
   2585	pgoff_t index;
   2586	unsigned from, to;
   2587	unsigned block_in_page;
   2588	unsigned block_start, block_end;
   2589	sector_t block_in_file;
   2590	int nr_reads = 0;
   2591	int ret = 0;
   2592	int is_mapped_to_disk = 1;
   2593
   2594	index = pos >> PAGE_SHIFT;
   2595	from = pos & (PAGE_SIZE - 1);
   2596	to = from + len;
   2597
   2598	page = grab_cache_page_write_begin(mapping, index);
   2599	if (!page)
   2600		return -ENOMEM;
   2601	*pagep = page;
   2602	*fsdata = NULL;
   2603
   2604	if (page_has_buffers(page)) {
   2605		ret = __block_write_begin(page, pos, len, get_block);
   2606		if (unlikely(ret))
   2607			goto out_release;
   2608		return ret;
   2609	}
   2610
   2611	if (PageMappedToDisk(page))
   2612		return 0;
   2613
   2614	/*
   2615	 * Allocate buffers so that we can keep track of state, and potentially
   2616	 * attach them to the page if an error occurs. In the common case of
   2617	 * no error, they will just be freed again without ever being attached
   2618	 * to the page (which is all OK, because we're under the page lock).
   2619	 *
   2620	 * Be careful: the buffer linked list is a NULL terminated one, rather
   2621	 * than the circular one we're used to.
   2622	 */
   2623	head = alloc_page_buffers(page, blocksize, false);
   2624	if (!head) {
   2625		ret = -ENOMEM;
   2626		goto out_release;
   2627	}
   2628
   2629	block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
   2630
   2631	/*
   2632	 * We loop across all blocks in the page, whether or not they are
   2633	 * part of the affected region.  This is so we can discover if the
   2634	 * page is fully mapped-to-disk.
   2635	 */
   2636	for (block_start = 0, block_in_page = 0, bh = head;
   2637		  block_start < PAGE_SIZE;
   2638		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
   2639		int create;
   2640
   2641		block_end = block_start + blocksize;
   2642		bh->b_state = 0;
   2643		create = 1;
   2644		if (block_start >= to)
   2645			create = 0;
   2646		ret = get_block(inode, block_in_file + block_in_page,
   2647					bh, create);
   2648		if (ret)
   2649			goto failed;
   2650		if (!buffer_mapped(bh))
   2651			is_mapped_to_disk = 0;
   2652		if (buffer_new(bh))
   2653			clean_bdev_bh_alias(bh);
   2654		if (PageUptodate(page)) {
   2655			set_buffer_uptodate(bh);
   2656			continue;
   2657		}
   2658		if (buffer_new(bh) || !buffer_mapped(bh)) {
   2659			zero_user_segments(page, block_start, from,
   2660							to, block_end);
   2661			continue;
   2662		}
   2663		if (buffer_uptodate(bh))
   2664			continue;	/* reiserfs does this */
   2665		if (block_start < from || block_end > to) {
   2666			lock_buffer(bh);
   2667			bh->b_end_io = end_buffer_read_nobh;
   2668			submit_bh(REQ_OP_READ, 0, bh);
   2669			nr_reads++;
   2670		}
   2671	}
   2672
   2673	if (nr_reads) {
   2674		/*
   2675		 * The page is locked, so these buffers are protected from
   2676		 * any VM or truncate activity.  Hence we don't need to care
   2677		 * for the buffer_head refcounts.
   2678		 */
   2679		for (bh = head; bh; bh = bh->b_this_page) {
   2680			wait_on_buffer(bh);
   2681			if (!buffer_uptodate(bh))
   2682				ret = -EIO;
   2683		}
   2684		if (ret)
   2685			goto failed;
   2686	}
   2687
   2688	if (is_mapped_to_disk)
   2689		SetPageMappedToDisk(page);
   2690
   2691	*fsdata = head; /* to be released by nobh_write_end */
   2692
   2693	return 0;
   2694
   2695failed:
   2696	BUG_ON(!ret);
   2697	/*
   2698	 * Error recovery is a bit difficult. We need to zero out blocks that
   2699	 * were newly allocated, and dirty them to ensure they get written out.
   2700	 * Buffers need to be attached to the page at this point, otherwise
   2701	 * the handling of potential IO errors during writeout would be hard
   2702	 * (could try doing synchronous writeout, but what if that fails too?)
   2703	 */
   2704	attach_nobh_buffers(page, head);
   2705	page_zero_new_buffers(page, from, to);
   2706
   2707out_release:
   2708	unlock_page(page);
   2709	put_page(page);
   2710	*pagep = NULL;
   2711
   2712	return ret;
   2713}
   2714EXPORT_SYMBOL(nobh_write_begin);
   2715
   2716int nobh_write_end(struct file *file, struct address_space *mapping,
   2717			loff_t pos, unsigned len, unsigned copied,
   2718			struct page *page, void *fsdata)
   2719{
   2720	struct inode *inode = page->mapping->host;
   2721	struct buffer_head *head = fsdata;
   2722	struct buffer_head *bh;
   2723	BUG_ON(fsdata != NULL && page_has_buffers(page));
   2724
   2725	if (unlikely(copied < len) && head)
   2726		attach_nobh_buffers(page, head);
   2727	if (page_has_buffers(page))
   2728		return generic_write_end(file, mapping, pos, len,
   2729					copied, page, fsdata);
   2730
   2731	SetPageUptodate(page);
   2732	set_page_dirty(page);
   2733	if (pos+copied > inode->i_size) {
   2734		i_size_write(inode, pos+copied);
   2735		mark_inode_dirty(inode);
   2736	}
   2737
   2738	unlock_page(page);
   2739	put_page(page);
   2740
   2741	while (head) {
   2742		bh = head;
   2743		head = head->b_this_page;
   2744		free_buffer_head(bh);
   2745	}
   2746
   2747	return copied;
   2748}
   2749EXPORT_SYMBOL(nobh_write_end);
   2750
   2751/*
    2752 * nobh_writepage() - based on block_write_full_page() except
   2753 * that it tries to operate without attaching bufferheads to
   2754 * the page.
   2755 */
   2756int nobh_writepage(struct page *page, get_block_t *get_block,
   2757			struct writeback_control *wbc)
   2758{
   2759	struct inode * const inode = page->mapping->host;
   2760	loff_t i_size = i_size_read(inode);
   2761	const pgoff_t end_index = i_size >> PAGE_SHIFT;
   2762	unsigned offset;
   2763	int ret;
   2764
   2765	/* Is the page fully inside i_size? */
   2766	if (page->index < end_index)
   2767		goto out;
   2768
   2769	/* Is the page fully outside i_size? (truncate in progress) */
   2770	offset = i_size & (PAGE_SIZE-1);
   2771	if (page->index >= end_index+1 || !offset) {
   2772		unlock_page(page);
   2773		return 0; /* don't care */
   2774	}
   2775
   2776	/*
   2777	 * The page straddles i_size.  It must be zeroed out on each and every
   2778	 * writepage invocation because it may be mmapped.  "A file is mapped
   2779	 * in multiples of the page size.  For a file that is not a multiple of
   2780	 * the  page size, the remaining memory is zeroed when mapped, and
   2781	 * writes to that region are not written out to the file."
   2782	 */
   2783	zero_user_segment(page, offset, PAGE_SIZE);
   2784out:
   2785	ret = mpage_writepage(page, get_block, wbc);
   2786	if (ret == -EAGAIN)
   2787		ret = __block_write_full_page(inode, page, get_block, wbc,
   2788					      end_buffer_async_write);
   2789	return ret;
   2790}
   2791EXPORT_SYMBOL(nobh_writepage);
   2792
   2793int nobh_truncate_page(struct address_space *mapping,
   2794			loff_t from, get_block_t *get_block)
   2795{
   2796	pgoff_t index = from >> PAGE_SHIFT;
   2797	struct inode *inode = mapping->host;
   2798	unsigned blocksize = i_blocksize(inode);
   2799	struct folio *folio;
   2800	struct buffer_head map_bh;
   2801	size_t offset;
   2802	sector_t iblock;
   2803	int err;
   2804
   2805	/* Block boundary? Nothing to do */
   2806	if (!(from & (blocksize - 1)))
   2807		return 0;
   2808
   2809	folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_CREAT,
   2810			mapping_gfp_mask(mapping));
   2811	err = -ENOMEM;
   2812	if (!folio)
   2813		goto out;
   2814
   2815	if (folio_buffers(folio))
   2816		goto has_buffers;
   2817
   2818	iblock = from >> inode->i_blkbits;
   2819	map_bh.b_size = blocksize;
   2820	map_bh.b_state = 0;
   2821	err = get_block(inode, iblock, &map_bh, 0);
   2822	if (err)
   2823		goto unlock;
   2824	/* unmapped? It's a hole - nothing to do */
   2825	if (!buffer_mapped(&map_bh))
   2826		goto unlock;
   2827
   2828	/* Ok, it's mapped. Make sure it's up-to-date */
   2829	if (!folio_test_uptodate(folio)) {
   2830		err = mapping->a_ops->read_folio(NULL, folio);
   2831		if (err) {
   2832			folio_put(folio);
   2833			goto out;
   2834		}
   2835		folio_lock(folio);
   2836		if (!folio_test_uptodate(folio)) {
   2837			err = -EIO;
   2838			goto unlock;
   2839		}
   2840		if (folio_buffers(folio))
   2841			goto has_buffers;
   2842	}
   2843	offset = offset_in_folio(folio, from);
   2844	folio_zero_segment(folio, offset, round_up(offset, blocksize));
   2845	folio_mark_dirty(folio);
   2846	err = 0;
   2847
   2848unlock:
   2849	folio_unlock(folio);
   2850	folio_put(folio);
   2851out:
   2852	return err;
   2853
   2854has_buffers:
   2855	folio_unlock(folio);
   2856	folio_put(folio);
   2857	return block_truncate_page(mapping, from, get_block);
   2858}
   2859EXPORT_SYMBOL(nobh_truncate_page);
   2860
   2861int block_truncate_page(struct address_space *mapping,
   2862			loff_t from, get_block_t *get_block)
   2863{
   2864	pgoff_t index = from >> PAGE_SHIFT;
   2865	unsigned offset = from & (PAGE_SIZE-1);
   2866	unsigned blocksize;
   2867	sector_t iblock;
   2868	unsigned length, pos;
   2869	struct inode *inode = mapping->host;
   2870	struct page *page;
   2871	struct buffer_head *bh;
   2872	int err;
   2873
   2874	blocksize = i_blocksize(inode);
   2875	length = offset & (blocksize - 1);
   2876
   2877	/* Block boundary? Nothing to do */
   2878	if (!length)
   2879		return 0;
   2880
   2881	length = blocksize - length;
   2882	iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
   2883	
   2884	page = grab_cache_page(mapping, index);
   2885	err = -ENOMEM;
   2886	if (!page)
   2887		goto out;
   2888
   2889	if (!page_has_buffers(page))
   2890		create_empty_buffers(page, blocksize, 0);
   2891
   2892	/* Find the buffer that contains "offset" */
   2893	bh = page_buffers(page);
   2894	pos = blocksize;
   2895	while (offset >= pos) {
   2896		bh = bh->b_this_page;
   2897		iblock++;
   2898		pos += blocksize;
   2899	}
   2900
   2901	err = 0;
   2902	if (!buffer_mapped(bh)) {
   2903		WARN_ON(bh->b_size != blocksize);
   2904		err = get_block(inode, iblock, bh, 0);
   2905		if (err)
   2906			goto unlock;
   2907		/* unmapped? It's a hole - nothing to do */
   2908		if (!buffer_mapped(bh))
   2909			goto unlock;
   2910	}
   2911
   2912	/* Ok, it's mapped. Make sure it's up-to-date */
   2913	if (PageUptodate(page))
   2914		set_buffer_uptodate(bh);
   2915
   2916	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
   2917		err = -EIO;
   2918		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
   2919		wait_on_buffer(bh);
   2920		/* Uhhuh. Read error. Complain and punt. */
   2921		if (!buffer_uptodate(bh))
   2922			goto unlock;
   2923	}
   2924
   2925	zero_user(page, offset, length);
   2926	mark_buffer_dirty(bh);
   2927	err = 0;
   2928
   2929unlock:
   2930	unlock_page(page);
   2931	put_page(page);
   2932out:
   2933	return err;
   2934}
   2935EXPORT_SYMBOL(block_truncate_page);
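
/*
 * Editor's note: illustrative sketch, not part of the original file.  On a
 * shrinking truncate, the hypothetical examplefs would zero the tail of the
 * partial block at the new EOF before updating i_size, using
 * block_truncate_page() with its block-mapping callback.
 */
static int __maybe_unused examplefs_zero_eof_block(struct inode *inode,
						   loff_t newsize)
{
	return block_truncate_page(inode->i_mapping, newsize,
				   examplefs_get_block);
}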
   2936
   2937/*
   2938 * The generic ->writepage function for buffer-backed address_spaces
   2939 */
   2940int block_write_full_page(struct page *page, get_block_t *get_block,
   2941			struct writeback_control *wbc)
   2942{
   2943	struct inode * const inode = page->mapping->host;
   2944	loff_t i_size = i_size_read(inode);
   2945	const pgoff_t end_index = i_size >> PAGE_SHIFT;
   2946	unsigned offset;
   2947
   2948	/* Is the page fully inside i_size? */
   2949	if (page->index < end_index)
   2950		return __block_write_full_page(inode, page, get_block, wbc,
   2951					       end_buffer_async_write);
   2952
   2953	/* Is the page fully outside i_size? (truncate in progress) */
   2954	offset = i_size & (PAGE_SIZE-1);
   2955	if (page->index >= end_index+1 || !offset) {
   2956		unlock_page(page);
   2957		return 0; /* don't care */
   2958	}
   2959
   2960	/*
   2961	 * The page straddles i_size.  It must be zeroed out on each and every
   2962	 * writepage invocation because it may be mmapped.  "A file is mapped
   2963	 * in multiples of the page size.  For a file that is not a multiple of
   2964	 * the  page size, the remaining memory is zeroed when mapped, and
   2965	 * writes to that region are not written out to the file."
   2966	 */
   2967	zero_user_segment(page, offset, PAGE_SIZE);
   2968	return __block_write_full_page(inode, page, get_block, wbc,
   2969							end_buffer_async_write);
   2970}
   2971EXPORT_SYMBOL(block_write_full_page);
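
/*
 * Editor's note: illustrative sketch, not part of the original file.  The
 * hypothetical examplefs ->writepage is just a thin wrapper around
 * block_write_full_page(), as in most buffer_head based filesystems.
 */
static int __maybe_unused examplefs_writepage(struct page *page,
					      struct writeback_control *wbc)
{
	return block_write_full_page(page, examplefs_get_block, wbc);
}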
   2972
   2973sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
   2974			    get_block_t *get_block)
   2975{
   2976	struct inode *inode = mapping->host;
   2977	struct buffer_head tmp = {
   2978		.b_size = i_blocksize(inode),
   2979	};
   2980
   2981	get_block(inode, block, &tmp, 0);
   2982	return tmp.b_blocknr;
   2983}
   2984EXPORT_SYMBOL(generic_block_bmap);
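
/*
 * Editor's note: illustrative sketch, not part of the original file.  The
 * hypothetical examplefs ->bmap simply defers to generic_block_bmap().
 */
static sector_t __maybe_unused examplefs_bmap(struct address_space *mapping,
					      sector_t block)
{
	return generic_block_bmap(mapping, block, examplefs_get_block);
}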
   2985
   2986static void end_bio_bh_io_sync(struct bio *bio)
   2987{
   2988	struct buffer_head *bh = bio->bi_private;
   2989
   2990	if (unlikely(bio_flagged(bio, BIO_QUIET)))
   2991		set_bit(BH_Quiet, &bh->b_state);
   2992
   2993	bh->b_end_io(bh, !bio->bi_status);
   2994	bio_put(bio);
   2995}
   2996
   2997static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
   2998			 struct writeback_control *wbc)
   2999{
   3000	struct bio *bio;
   3001
   3002	BUG_ON(!buffer_locked(bh));
   3003	BUG_ON(!buffer_mapped(bh));
   3004	BUG_ON(!bh->b_end_io);
   3005	BUG_ON(buffer_delay(bh));
   3006	BUG_ON(buffer_unwritten(bh));
   3007
   3008	/*
   3009	 * Only clear out a write error when rewriting
   3010	 */
   3011	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
   3012		clear_buffer_write_io_error(bh);
   3013
   3014	if (buffer_meta(bh))
   3015		op_flags |= REQ_META;
   3016	if (buffer_prio(bh))
   3017		op_flags |= REQ_PRIO;
   3018
   3019	bio = bio_alloc(bh->b_bdev, 1, op | op_flags, GFP_NOIO);
   3020
   3021	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
   3022
   3023	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
   3024
   3025	bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
   3026	BUG_ON(bio->bi_iter.bi_size != bh->b_size);
   3027
   3028	bio->bi_end_io = end_bio_bh_io_sync;
   3029	bio->bi_private = bh;
   3030
   3031	/* Take care of bh's that straddle the end of the device */
   3032	guard_bio_eod(bio);
   3033
   3034	if (wbc) {
   3035		wbc_init_bio(wbc, bio);
   3036		wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
   3037	}
   3038
   3039	submit_bio(bio);
   3040	return 0;
   3041}
   3042
   3043int submit_bh(int op, int op_flags, struct buffer_head *bh)
   3044{
   3045	return submit_bh_wbc(op, op_flags, bh, NULL);
   3046}
   3047EXPORT_SYMBOL(submit_bh);
   3048
   3049/**
   3050 * ll_rw_block: low-level access to block devices (DEPRECATED)
   3051 * @op: whether to %READ or %WRITE
   3052 * @op_flags: req_flag_bits
   3053 * @nr: number of &struct buffer_heads in the array
   3054 * @bhs: array of pointers to &struct buffer_head
   3055 *
   3056 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
   3057 * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
   3058 * @op_flags contains flags modifying the detailed I/O behavior, most notably
   3059 * %REQ_RAHEAD.
   3060 *
   3061 * This function drops any buffer that it cannot get a lock on (with the
   3062 * BH_Lock state bit), any buffer that appears to be clean when doing a write
    3063 * request, and any buffer that appears to be up-to-date when doing a read
    3064 * request.  Further it marks as clean the buffers that are processed for
   3065 * writing (the buffer cache won't assume that they are actually clean
   3066 * until the buffer gets unlocked).
   3067 *
    3068 * ll_rw_block sets b_end_io to a simple completion handler that marks
    3069 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
    3070 * any waiters.
   3071 *
   3072 * All of the buffers must be for the same device, and must also be a
   3073 * multiple of the current approved size for the device.
   3074 */
   3075void ll_rw_block(int op, int op_flags,  int nr, struct buffer_head *bhs[])
   3076{
   3077	int i;
   3078
   3079	for (i = 0; i < nr; i++) {
   3080		struct buffer_head *bh = bhs[i];
   3081
   3082		if (!trylock_buffer(bh))
   3083			continue;
   3084		if (op == WRITE) {
   3085			if (test_clear_buffer_dirty(bh)) {
   3086				bh->b_end_io = end_buffer_write_sync;
   3087				get_bh(bh);
   3088				submit_bh(op, op_flags, bh);
   3089				continue;
   3090			}
   3091		} else {
   3092			if (!buffer_uptodate(bh)) {
   3093				bh->b_end_io = end_buffer_read_sync;
   3094				get_bh(bh);
   3095				submit_bh(op, op_flags, bh);
   3096				continue;
   3097			}
   3098		}
   3099		unlock_buffer(bh);
   3100	}
   3101}
   3102EXPORT_SYMBOL(ll_rw_block);
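
/*
 * Editor's note: illustrative sketch, not part of the original file.  The
 * classic ll_rw_block() pattern - submit a read for a buffer that is not
 * up to date, then wait and re-check - mirroring what block_truncate_page()
 * above does.  "example_read_bh" is an invented name.
 */
static int __maybe_unused example_read_bh(struct buffer_head *bh)
{
	if (buffer_uptodate(bh))
		return 0;
	ll_rw_block(REQ_OP_READ, 0, 1, &bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}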
   3103
   3104void write_dirty_buffer(struct buffer_head *bh, int op_flags)
   3105{
   3106	lock_buffer(bh);
   3107	if (!test_clear_buffer_dirty(bh)) {
   3108		unlock_buffer(bh);
   3109		return;
   3110	}
   3111	bh->b_end_io = end_buffer_write_sync;
   3112	get_bh(bh);
   3113	submit_bh(REQ_OP_WRITE, op_flags, bh);
   3114}
   3115EXPORT_SYMBOL(write_dirty_buffer);
   3116
   3117/*
   3118 * For a data-integrity writeout, we need to wait upon any in-progress I/O
   3119 * and then start new I/O and then wait upon it.  The caller must have a ref on
   3120 * the buffer_head.
   3121 */
   3122int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
   3123{
   3124	int ret = 0;
   3125
   3126	WARN_ON(atomic_read(&bh->b_count) < 1);
   3127	lock_buffer(bh);
   3128	if (test_clear_buffer_dirty(bh)) {
   3129		/*
   3130		 * The bh should be mapped, but it might not be if the
   3131		 * device was hot-removed. Not much we can do but fail the I/O.
   3132		 */
   3133		if (!buffer_mapped(bh)) {
   3134			unlock_buffer(bh);
   3135			return -EIO;
   3136		}
   3137
   3138		get_bh(bh);
   3139		bh->b_end_io = end_buffer_write_sync;
   3140		ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
   3141		wait_on_buffer(bh);
   3142		if (!ret && !buffer_uptodate(bh))
   3143			ret = -EIO;
   3144	} else {
   3145		unlock_buffer(bh);
   3146	}
   3147	return ret;
   3148}
   3149EXPORT_SYMBOL(__sync_dirty_buffer);
   3150
   3151int sync_dirty_buffer(struct buffer_head *bh)
   3152{
   3153	return __sync_dirty_buffer(bh, REQ_SYNC);
   3154}
   3155EXPORT_SYMBOL(sync_dirty_buffer);
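
/*
 * Editor's note: illustrative sketch, not part of the original file.  A
 * typical data-integrity update of a metadata buffer: modify the contents,
 * mark the buffer dirty, then force it out with sync_dirty_buffer().
 * "example_commit_metadata" is an invented name.
 */
static int __maybe_unused example_commit_metadata(struct buffer_head *bh)
{
	/* ... callers would update bh->b_data here ... */
	mark_buffer_dirty(bh);
	return sync_dirty_buffer(bh);
}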
   3156
   3157/*
   3158 * try_to_free_buffers() checks if all the buffers on this particular folio
   3159 * are unused, and releases them if so.
   3160 *
   3161 * Exclusion against try_to_free_buffers may be obtained by either
   3162 * locking the folio or by holding its mapping's private_lock.
   3163 *
   3164 * If the folio is dirty but all the buffers are clean then we need to
   3165 * be sure to mark the folio clean as well.  This is because the folio
   3166 * may be against a block device, and a later reattachment of buffers
   3167 * to a dirty folio will set *all* buffers dirty.  Which would corrupt
   3168 * filesystem data on the same device.
   3169 *
   3170 * The same applies to regular filesystem folios: if all the buffers are
   3171 * clean then we set the folio clean and proceed.  To do that, we require
   3172 * total exclusion from block_dirty_folio().  That is obtained with
   3173 * private_lock.
   3174 *
   3175 * try_to_free_buffers() is non-blocking.
   3176 */
   3177static inline int buffer_busy(struct buffer_head *bh)
   3178{
   3179	return atomic_read(&bh->b_count) |
   3180		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
   3181}
   3182
   3183static bool
   3184drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
   3185{
   3186	struct buffer_head *head = folio_buffers(folio);
   3187	struct buffer_head *bh;
   3188
   3189	bh = head;
   3190	do {
   3191		if (buffer_busy(bh))
   3192			goto failed;
   3193		bh = bh->b_this_page;
   3194	} while (bh != head);
   3195
   3196	do {
   3197		struct buffer_head *next = bh->b_this_page;
   3198
   3199		if (bh->b_assoc_map)
   3200			__remove_assoc_queue(bh);
   3201		bh = next;
   3202	} while (bh != head);
   3203	*buffers_to_free = head;
   3204	folio_detach_private(folio);
   3205	return true;
   3206failed:
   3207	return false;
   3208}
   3209
   3210bool try_to_free_buffers(struct folio *folio)
   3211{
   3212	struct address_space * const mapping = folio->mapping;
   3213	struct buffer_head *buffers_to_free = NULL;
    3214	bool ret = false;
   3215
   3216	BUG_ON(!folio_test_locked(folio));
   3217	if (folio_test_writeback(folio))
   3218		return false;
   3219
   3220	if (mapping == NULL) {		/* can this still happen? */
   3221		ret = drop_buffers(folio, &buffers_to_free);
   3222		goto out;
   3223	}
   3224
   3225	spin_lock(&mapping->private_lock);
   3226	ret = drop_buffers(folio, &buffers_to_free);
   3227
   3228	/*
   3229	 * If the filesystem writes its buffers by hand (eg ext3)
   3230	 * then we can have clean buffers against a dirty folio.  We
   3231	 * clean the folio here; otherwise the VM will never notice
   3232	 * that the filesystem did any IO at all.
   3233	 *
   3234	 * Also, during truncate, discard_buffer will have marked all
   3235	 * the folio's buffers clean.  We discover that here and clean
   3236	 * the folio also.
   3237	 *
   3238	 * private_lock must be held over this entire operation in order
   3239	 * to synchronise against block_dirty_folio and prevent the
   3240	 * dirty bit from being lost.
   3241	 */
   3242	if (ret)
   3243		folio_cancel_dirty(folio);
   3244	spin_unlock(&mapping->private_lock);
   3245out:
   3246	if (buffers_to_free) {
   3247		struct buffer_head *bh = buffers_to_free;
   3248
   3249		do {
   3250			struct buffer_head *next = bh->b_this_page;
   3251			free_buffer_head(bh);
   3252			bh = next;
   3253		} while (bh != buffers_to_free);
   3254	}
   3255	return ret;
   3256}
   3257EXPORT_SYMBOL(try_to_free_buffers);
   3258
   3259/*
   3260 * Buffer-head allocation
   3261 */
   3262static struct kmem_cache *bh_cachep __read_mostly;
   3263
   3264/*
   3265 * Once the number of bh's in the machine exceeds this level, we start
   3266 * stripping them in writeback.
   3267 */
   3268static unsigned long max_buffer_heads;
   3269
   3270int buffer_heads_over_limit;
   3271
   3272struct bh_accounting {
   3273	int nr;			/* Number of live bh's */
   3274	int ratelimit;		/* Limit cacheline bouncing */
   3275};
   3276
   3277static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
   3278
   3279static void recalc_bh_state(void)
   3280{
   3281	int i;
   3282	int tot = 0;
   3283
   3284	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
   3285		return;
   3286	__this_cpu_write(bh_accounting.ratelimit, 0);
   3287	for_each_online_cpu(i)
   3288		tot += per_cpu(bh_accounting, i).nr;
   3289	buffer_heads_over_limit = (tot > max_buffer_heads);
   3290}
   3291
   3292struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
   3293{
   3294	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
   3295	if (ret) {
   3296		INIT_LIST_HEAD(&ret->b_assoc_buffers);
   3297		spin_lock_init(&ret->b_uptodate_lock);
   3298		preempt_disable();
   3299		__this_cpu_inc(bh_accounting.nr);
   3300		recalc_bh_state();
   3301		preempt_enable();
   3302	}
   3303	return ret;
   3304}
   3305EXPORT_SYMBOL(alloc_buffer_head);
   3306
   3307void free_buffer_head(struct buffer_head *bh)
   3308{
   3309	BUG_ON(!list_empty(&bh->b_assoc_buffers));
   3310	kmem_cache_free(bh_cachep, bh);
   3311	preempt_disable();
   3312	__this_cpu_dec(bh_accounting.nr);
   3313	recalc_bh_state();
   3314	preempt_enable();
   3315}
   3316EXPORT_SYMBOL(free_buffer_head);
   3317
   3318static int buffer_exit_cpu_dead(unsigned int cpu)
   3319{
   3320	int i;
   3321	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
   3322
   3323	for (i = 0; i < BH_LRU_SIZE; i++) {
   3324		brelse(b->bhs[i]);
   3325		b->bhs[i] = NULL;
   3326	}
   3327	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
   3328	per_cpu(bh_accounting, cpu).nr = 0;
   3329	return 0;
   3330}
   3331
   3332/**
   3333 * bh_uptodate_or_lock - Test whether the buffer is uptodate
   3334 * @bh: struct buffer_head
   3335 *
    3337 * Return true if the buffer is up-to-date; otherwise return false
    3338 * with the buffer locked.
   3338 */
   3339int bh_uptodate_or_lock(struct buffer_head *bh)
   3340{
   3341	if (!buffer_uptodate(bh)) {
   3342		lock_buffer(bh);
   3343		if (!buffer_uptodate(bh))
   3344			return 0;
   3345		unlock_buffer(bh);
   3346	}
   3347	return 1;
   3348}
   3349EXPORT_SYMBOL(bh_uptodate_or_lock);
   3350
   3351/**
   3352 * bh_submit_read - Submit a locked buffer for reading
   3353 * @bh: struct buffer_head
   3354 *
   3355 * Returns zero on success and -EIO on error.
   3356 */
   3357int bh_submit_read(struct buffer_head *bh)
   3358{
   3359	BUG_ON(!buffer_locked(bh));
   3360
   3361	if (buffer_uptodate(bh)) {
   3362		unlock_buffer(bh);
   3363		return 0;
   3364	}
   3365
   3366	get_bh(bh);
   3367	bh->b_end_io = end_buffer_read_sync;
   3368	submit_bh(REQ_OP_READ, 0, bh);
   3369	wait_on_buffer(bh);
   3370	if (buffer_uptodate(bh))
   3371		return 0;
   3372	return -EIO;
   3373}
   3374EXPORT_SYMBOL(bh_submit_read);
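
/*
 * Editor's note: illustrative sketch, not part of the original file.  The
 * usual way bh_uptodate_or_lock() and bh_submit_read() pair up: skip the
 * read if the buffer is already up to date, otherwise it comes back locked
 * and is read in synchronously.  "example_read_locked_bh" is an invented name.
 */
static int __maybe_unused example_read_locked_bh(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;
	/* Buffer is now locked and not up to date; bh_submit_read() unlocks it. */
	return bh_submit_read(bh);
}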
   3375
   3376void __init buffer_init(void)
   3377{
   3378	unsigned long nrpages;
   3379	int ret;
   3380
   3381	bh_cachep = kmem_cache_create("buffer_head",
   3382			sizeof(struct buffer_head), 0,
   3383				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
   3384				SLAB_MEM_SPREAD),
   3385				NULL);
   3386
   3387	/*
   3388	 * Limit the bh occupancy to 10% of ZONE_NORMAL
   3389	 */
   3390	nrpages = (nr_free_buffer_pages() * 10) / 100;
   3391	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
   3392	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
   3393					NULL, buffer_exit_cpu_dead);
   3394	WARN_ON(ret < 0);
   3395}