cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

xfs_buf.c (58948B)


      1// SPDX-License-Identifier: GPL-2.0
      2/*
      3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
      4 * All Rights Reserved.
      5 */
      6#include "xfs.h"
      7#include <linux/backing-dev.h>
      8
      9#include "xfs_shared.h"
     10#include "xfs_format.h"
     11#include "xfs_log_format.h"
     12#include "xfs_trans_resv.h"
     13#include "xfs_mount.h"
     14#include "xfs_trace.h"
     15#include "xfs_log.h"
     16#include "xfs_log_recover.h"
     17#include "xfs_log_priv.h"
     18#include "xfs_trans.h"
     19#include "xfs_buf_item.h"
     20#include "xfs_errortag.h"
     21#include "xfs_error.h"
     22#include "xfs_ag.h"
     23
     24static struct kmem_cache *xfs_buf_cache;
     25
     26/*
     27 * Locking orders
     28 *
     29 * xfs_buf_ioacct_inc:
     30 * xfs_buf_ioacct_dec:
     31 *	b_sema (caller holds)
     32 *	  b_lock
     33 *
     34 * xfs_buf_stale:
     35 *	b_sema (caller holds)
     36 *	  b_lock
     37 *	    lru_lock
     38 *
     39 * xfs_buf_rele:
     40 *	b_lock
     41 *	  pag_buf_lock
     42 *	    lru_lock
     43 *
     44 * xfs_buftarg_drain_rele
     45 *	lru_lock
     46 *	  b_lock (trylock due to inversion)
     47 *
     48 * xfs_buftarg_isolate
     49 *	lru_lock
     50 *	  b_lock (trylock due to inversion)
     51 */
     52
     53static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
     54
     55static inline int
     56xfs_buf_submit(
     57	struct xfs_buf		*bp)
     58{
     59	return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
     60}
     61
     62static inline int
     63xfs_buf_is_vmapped(
     64	struct xfs_buf	*bp)
     65{
     66	/*
     67	 * Return true if the buffer is vmapped.
     68	 *
     69	 * b_addr is null if the buffer is not mapped, but the code is clever
      70	 * enough to know it doesn't have to map a single page, so the check
      71	 * has to cover both b_addr and bp->b_page_count > 1.
     72	 */
     73	return bp->b_addr && bp->b_page_count > 1;
     74}
     75
     76static inline int
     77xfs_buf_vmap_len(
     78	struct xfs_buf	*bp)
     79{
     80	return (bp->b_page_count * PAGE_SIZE);
     81}
     82
     83/*
     84 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
     85 * this buffer. The count is incremented once per buffer (per hold cycle)
     86 * because the corresponding decrement is deferred to buffer release. Buffers
     87 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
      88 * tracking adds unnecessary overhead. This is used for synchronization purposes
     89 * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
     90 * in-flight buffers.
     91 *
     92 * Buffers that are never released (e.g., superblock, iclog buffers) must set
     93 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
     94 * never reaches zero and unmount hangs indefinitely.
     95 */
     96static inline void
     97xfs_buf_ioacct_inc(
     98	struct xfs_buf	*bp)
     99{
    100	if (bp->b_flags & XBF_NO_IOACCT)
    101		return;
    102
    103	ASSERT(bp->b_flags & XBF_ASYNC);
    104	spin_lock(&bp->b_lock);
    105	if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
    106		bp->b_state |= XFS_BSTATE_IN_FLIGHT;
    107		percpu_counter_inc(&bp->b_target->bt_io_count);
    108	}
    109	spin_unlock(&bp->b_lock);
    110}
    111
    112/*
    113 * Clear the in-flight state on a buffer about to be released to the LRU or
    114 * freed and unaccount from the buftarg.
    115 */
    116static inline void
    117__xfs_buf_ioacct_dec(
    118	struct xfs_buf	*bp)
    119{
    120	lockdep_assert_held(&bp->b_lock);
    121
    122	if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
    123		bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
    124		percpu_counter_dec(&bp->b_target->bt_io_count);
    125	}
    126}
    127
    128static inline void
    129xfs_buf_ioacct_dec(
    130	struct xfs_buf	*bp)
    131{
    132	spin_lock(&bp->b_lock);
    133	__xfs_buf_ioacct_dec(bp);
    134	spin_unlock(&bp->b_lock);
    135}
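
/*
 * Illustrative sketch, not part of the upstream file: the consumer of the
 * in-flight accounting above is the unmount/drain path, which simply polls
 * the per-cpu counter until every XBF_ASYNC buffer has been released again.
 * This mirrors xfs_buftarg_wait() further down; "btp" is just an example
 * buftarg pointer.
 */
static inline void
xfs_buf_ioacct_example_wait(
	struct xfs_buftarg	*btp)
{
	/* each XBF_ASYNC submission called xfs_buf_ioacct_inc() once ... */
	while (percpu_counter_sum(&btp->bt_io_count))
		delay(100);	/* ... and release runs xfs_buf_ioacct_dec() */
}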
    136
    137/*
    138 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
    139 * b_lru_ref count so that the buffer is freed immediately when the buffer
    140 * reference count falls to zero. If the buffer is already on the LRU, we need
    141 * to remove the reference that LRU holds on the buffer.
    142 *
    143 * This prevents build-up of stale buffers on the LRU.
    144 */
    145void
    146xfs_buf_stale(
    147	struct xfs_buf	*bp)
    148{
    149	ASSERT(xfs_buf_islocked(bp));
    150
    151	bp->b_flags |= XBF_STALE;
    152
    153	/*
    154	 * Clear the delwri status so that a delwri queue walker will not
    155	 * flush this buffer to disk now that it is stale. The delwri queue has
    156	 * a reference to the buffer, so this is safe to do.
    157	 */
    158	bp->b_flags &= ~_XBF_DELWRI_Q;
    159
    160	/*
    161	 * Once the buffer is marked stale and unlocked, a subsequent lookup
    162	 * could reset b_flags. There is no guarantee that the buffer is
    163	 * unaccounted (released to LRU) before that occurs. Drop in-flight
    164	 * status now to preserve accounting consistency.
    165	 */
    166	spin_lock(&bp->b_lock);
    167	__xfs_buf_ioacct_dec(bp);
    168
    169	atomic_set(&bp->b_lru_ref, 0);
    170	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
    171	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
    172		atomic_dec(&bp->b_hold);
    173
    174	ASSERT(atomic_read(&bp->b_hold) >= 1);
    175	spin_unlock(&bp->b_lock);
    176}
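
/*
 * Illustrative sketch, not part of the upstream file: the usual way a caller
 * that holds a locked buffer invalidates it is to mark it stale and release
 * it, so the final xfs_buf_rele() frees the buffer instead of parking it on
 * the LRU. The read error path in xfs_buf_read_map() below follows the same
 * pattern.
 */
static inline void
xfs_buf_example_invalidate(
	struct xfs_buf		*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags &= ~XBF_DONE;	/* contents are no longer valid */
	xfs_buf_stale(bp);		/* drop the LRU hold, b_lru_ref = 0 */
	xfs_buf_relse(bp);		/* unlock and drop our reference */
}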
    177
    178static int
    179xfs_buf_get_maps(
    180	struct xfs_buf		*bp,
    181	int			map_count)
    182{
    183	ASSERT(bp->b_maps == NULL);
    184	bp->b_map_count = map_count;
    185
    186	if (map_count == 1) {
    187		bp->b_maps = &bp->__b_map;
    188		return 0;
    189	}
    190
    191	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
    192				KM_NOFS);
    193	if (!bp->b_maps)
    194		return -ENOMEM;
    195	return 0;
    196}
    197
    198/*
     199 *	Frees b_maps if it was allocated.
    200 */
    201static void
    202xfs_buf_free_maps(
    203	struct xfs_buf	*bp)
    204{
    205	if (bp->b_maps != &bp->__b_map) {
    206		kmem_free(bp->b_maps);
    207		bp->b_maps = NULL;
    208	}
    209}
    210
    211static int
    212_xfs_buf_alloc(
    213	struct xfs_buftarg	*target,
    214	struct xfs_buf_map	*map,
    215	int			nmaps,
    216	xfs_buf_flags_t		flags,
    217	struct xfs_buf		**bpp)
    218{
    219	struct xfs_buf		*bp;
    220	int			error;
    221	int			i;
    222
    223	*bpp = NULL;
    224	bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
    225
    226	/*
    227	 * We don't want certain flags to appear in b_flags unless they are
    228	 * specifically set by later operations on the buffer.
    229	 */
    230	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
    231
    232	atomic_set(&bp->b_hold, 1);
    233	atomic_set(&bp->b_lru_ref, 1);
    234	init_completion(&bp->b_iowait);
    235	INIT_LIST_HEAD(&bp->b_lru);
    236	INIT_LIST_HEAD(&bp->b_list);
    237	INIT_LIST_HEAD(&bp->b_li_list);
    238	sema_init(&bp->b_sema, 0); /* held, no waiters */
    239	spin_lock_init(&bp->b_lock);
    240	bp->b_target = target;
    241	bp->b_mount = target->bt_mount;
    242	bp->b_flags = flags;
    243
    244	/*
    245	 * Set length and io_length to the same value initially.
    246	 * I/O routines should use io_length, which will be the same in
    247	 * most cases but may be reset (e.g. XFS recovery).
    248	 */
    249	error = xfs_buf_get_maps(bp, nmaps);
    250	if (error)  {
    251		kmem_cache_free(xfs_buf_cache, bp);
    252		return error;
    253	}
    254
    255	bp->b_rhash_key = map[0].bm_bn;
    256	bp->b_length = 0;
    257	for (i = 0; i < nmaps; i++) {
    258		bp->b_maps[i].bm_bn = map[i].bm_bn;
    259		bp->b_maps[i].bm_len = map[i].bm_len;
    260		bp->b_length += map[i].bm_len;
    261	}
    262
    263	atomic_set(&bp->b_pin_count, 0);
    264	init_waitqueue_head(&bp->b_waiters);
    265
    266	XFS_STATS_INC(bp->b_mount, xb_create);
    267	trace_xfs_buf_init(bp, _RET_IP_);
    268
    269	*bpp = bp;
    270	return 0;
    271}
    272
    273static void
    274xfs_buf_free_pages(
    275	struct xfs_buf	*bp)
    276{
    277	uint		i;
    278
    279	ASSERT(bp->b_flags & _XBF_PAGES);
    280
    281	if (xfs_buf_is_vmapped(bp))
    282		vm_unmap_ram(bp->b_addr, bp->b_page_count);
    283
    284	for (i = 0; i < bp->b_page_count; i++) {
    285		if (bp->b_pages[i])
    286			__free_page(bp->b_pages[i]);
    287	}
    288	if (current->reclaim_state)
    289		current->reclaim_state->reclaimed_slab += bp->b_page_count;
    290
    291	if (bp->b_pages != bp->b_page_array)
    292		kmem_free(bp->b_pages);
    293	bp->b_pages = NULL;
    294	bp->b_flags &= ~_XBF_PAGES;
    295}
    296
    297static void
    298xfs_buf_free(
    299	struct xfs_buf		*bp)
    300{
    301	trace_xfs_buf_free(bp, _RET_IP_);
    302
    303	ASSERT(list_empty(&bp->b_lru));
    304
    305	if (bp->b_flags & _XBF_PAGES)
    306		xfs_buf_free_pages(bp);
    307	else if (bp->b_flags & _XBF_KMEM)
    308		kmem_free(bp->b_addr);
    309
    310	xfs_buf_free_maps(bp);
    311	kmem_cache_free(xfs_buf_cache, bp);
    312}
    313
    314static int
    315xfs_buf_alloc_kmem(
    316	struct xfs_buf	*bp,
    317	xfs_buf_flags_t	flags)
    318{
    319	xfs_km_flags_t	kmflag_mask = KM_NOFS;
    320	size_t		size = BBTOB(bp->b_length);
    321
    322	/* Assure zeroed buffer for non-read cases. */
    323	if (!(flags & XBF_READ))
    324		kmflag_mask |= KM_ZERO;
    325
    326	bp->b_addr = kmem_alloc(size, kmflag_mask);
    327	if (!bp->b_addr)
    328		return -ENOMEM;
    329
    330	if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
    331	    ((unsigned long)bp->b_addr & PAGE_MASK)) {
    332		/* b_addr spans two pages - use alloc_page instead */
    333		kmem_free(bp->b_addr);
    334		bp->b_addr = NULL;
    335		return -ENOMEM;
    336	}
    337	bp->b_offset = offset_in_page(bp->b_addr);
    338	bp->b_pages = bp->b_page_array;
    339	bp->b_pages[0] = kmem_to_page(bp->b_addr);
    340	bp->b_page_count = 1;
    341	bp->b_flags |= _XBF_KMEM;
    342	return 0;
    343}
    344
    345static int
    346xfs_buf_alloc_pages(
    347	struct xfs_buf	*bp,
    348	xfs_buf_flags_t	flags)
    349{
    350	gfp_t		gfp_mask = __GFP_NOWARN;
    351	long		filled = 0;
    352
    353	if (flags & XBF_READ_AHEAD)
    354		gfp_mask |= __GFP_NORETRY;
    355	else
    356		gfp_mask |= GFP_NOFS;
    357
    358	/* Make sure that we have a page list */
    359	bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
    360	if (bp->b_page_count <= XB_PAGES) {
    361		bp->b_pages = bp->b_page_array;
    362	} else {
    363		bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
    364					gfp_mask);
    365		if (!bp->b_pages)
    366			return -ENOMEM;
    367	}
    368	bp->b_flags |= _XBF_PAGES;
    369
    370	/* Assure zeroed buffer for non-read cases. */
    371	if (!(flags & XBF_READ))
    372		gfp_mask |= __GFP_ZERO;
    373
    374	/*
    375	 * Bulk filling of pages can take multiple calls. Not filling the entire
    376	 * array is not an allocation failure, so don't back off if we get at
    377	 * least one extra page.
    378	 */
    379	for (;;) {
    380		long	last = filled;
    381
    382		filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
    383						bp->b_pages);
    384		if (filled == bp->b_page_count) {
    385			XFS_STATS_INC(bp->b_mount, xb_page_found);
    386			break;
    387		}
    388
    389		if (filled != last)
    390			continue;
    391
    392		if (flags & XBF_READ_AHEAD) {
    393			xfs_buf_free_pages(bp);
    394			return -ENOMEM;
    395		}
    396
    397		XFS_STATS_INC(bp->b_mount, xb_page_retries);
    398		memalloc_retry_wait(gfp_mask);
    399	}
    400	return 0;
    401}
    402
    403/*
    404 *	Map buffer into kernel address-space if necessary.
    405 */
    406STATIC int
    407_xfs_buf_map_pages(
    408	struct xfs_buf		*bp,
    409	xfs_buf_flags_t		flags)
    410{
    411	ASSERT(bp->b_flags & _XBF_PAGES);
    412	if (bp->b_page_count == 1) {
    413		/* A single page buffer is always mappable */
    414		bp->b_addr = page_address(bp->b_pages[0]);
    415	} else if (flags & XBF_UNMAPPED) {
    416		bp->b_addr = NULL;
    417	} else {
    418		int retried = 0;
    419		unsigned nofs_flag;
    420
    421		/*
    422		 * vm_map_ram() will allocate auxiliary structures (e.g.
    423		 * pagetables) with GFP_KERNEL, yet we are likely to be under
    424		 * GFP_NOFS context here. Hence we need to tell memory reclaim
    425		 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
    426		 * memory reclaim re-entering the filesystem here and
    427		 * potentially deadlocking.
    428		 */
    429		nofs_flag = memalloc_nofs_save();
    430		do {
    431			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
    432						-1);
    433			if (bp->b_addr)
    434				break;
    435			vm_unmap_aliases();
    436		} while (retried++ <= 1);
    437		memalloc_nofs_restore(nofs_flag);
    438
    439		if (!bp->b_addr)
    440			return -ENOMEM;
    441	}
    442
    443	return 0;
    444}
    445
    446/*
    447 *	Finding and Reading Buffers
    448 */
    449static int
    450_xfs_buf_obj_cmp(
    451	struct rhashtable_compare_arg	*arg,
    452	const void			*obj)
    453{
    454	const struct xfs_buf_map	*map = arg->key;
    455	const struct xfs_buf		*bp = obj;
    456
    457	/*
    458	 * The key hashing in the lookup path depends on the key being the
     459	 * first element of the compare_arg, so make sure to assert this.
    460	 */
    461	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
    462
    463	if (bp->b_rhash_key != map->bm_bn)
    464		return 1;
    465
    466	if (unlikely(bp->b_length != map->bm_len)) {
    467		/*
    468		 * found a block number match. If the range doesn't
    469		 * match, the only way this is allowed is if the buffer
    470		 * in the cache is stale and the transaction that made
    471		 * it stale has not yet committed. i.e. we are
    472		 * reallocating a busy extent. Skip this buffer and
    473		 * continue searching for an exact match.
    474		 */
    475		ASSERT(bp->b_flags & XBF_STALE);
    476		return 1;
    477	}
    478	return 0;
    479}
    480
    481static const struct rhashtable_params xfs_buf_hash_params = {
    482	.min_size		= 32,	/* empty AGs have minimal footprint */
    483	.nelem_hint		= 16,
    484	.key_len		= sizeof(xfs_daddr_t),
    485	.key_offset		= offsetof(struct xfs_buf, b_rhash_key),
    486	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
    487	.automatic_shrinking	= true,
    488	.obj_cmpfn		= _xfs_buf_obj_cmp,
    489};
    490
    491int
    492xfs_buf_hash_init(
    493	struct xfs_perag	*pag)
    494{
    495	spin_lock_init(&pag->pag_buf_lock);
    496	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
    497}
    498
    499void
    500xfs_buf_hash_destroy(
    501	struct xfs_perag	*pag)
    502{
    503	rhashtable_destroy(&pag->pag_buf_hash);
    504}
    505
    506/*
    507 * Look up a buffer in the buffer cache and return it referenced and locked
    508 * in @found_bp.
    509 *
    510 * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
    511 * cache.
    512 *
    513 * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
    514 * -EAGAIN if we fail to lock it.
    515 *
    516 * Return values are:
     517 *	-EFSCORRUPTED if we have been supplied with an invalid address
    518 *	-EAGAIN on trylock failure
    519 *	-ENOENT if we fail to find a match and @new_bp was NULL
    520 *	0, with @found_bp:
    521 *		- @new_bp if we inserted it into the cache
    522 *		- the buffer we found and locked.
    523 */
    524static int
    525xfs_buf_find(
    526	struct xfs_buftarg	*btp,
    527	struct xfs_buf_map	*map,
    528	int			nmaps,
    529	xfs_buf_flags_t		flags,
    530	struct xfs_buf		*new_bp,
    531	struct xfs_buf		**found_bp)
    532{
    533	struct xfs_perag	*pag;
    534	struct xfs_buf		*bp;
    535	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
    536	xfs_daddr_t		eofs;
    537	int			i;
    538
    539	*found_bp = NULL;
    540
    541	for (i = 0; i < nmaps; i++)
    542		cmap.bm_len += map[i].bm_len;
    543
    544	/* Check for IOs smaller than the sector size / not sector aligned */
    545	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
    546	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
    547
    548	/*
    549	 * Corrupted block numbers can get through to here, unfortunately, so we
    550	 * have to check that the buffer falls within the filesystem bounds.
    551	 */
    552	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
    553	if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
    554		xfs_alert(btp->bt_mount,
    555			  "%s: daddr 0x%llx out of range, EOFS 0x%llx",
    556			  __func__, cmap.bm_bn, eofs);
    557		WARN_ON(1);
    558		return -EFSCORRUPTED;
    559	}
    560
    561	pag = xfs_perag_get(btp->bt_mount,
    562			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
    563
    564	spin_lock(&pag->pag_buf_lock);
    565	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
    566				    xfs_buf_hash_params);
    567	if (bp) {
    568		atomic_inc(&bp->b_hold);
    569		goto found;
    570	}
    571
    572	/* No match found */
    573	if (!new_bp) {
    574		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
    575		spin_unlock(&pag->pag_buf_lock);
    576		xfs_perag_put(pag);
    577		return -ENOENT;
    578	}
    579
    580	/* the buffer keeps the perag reference until it is freed */
    581	new_bp->b_pag = pag;
    582	rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
    583			       xfs_buf_hash_params);
    584	spin_unlock(&pag->pag_buf_lock);
    585	*found_bp = new_bp;
    586	return 0;
    587
    588found:
    589	spin_unlock(&pag->pag_buf_lock);
    590	xfs_perag_put(pag);
    591
    592	if (!xfs_buf_trylock(bp)) {
    593		if (flags & XBF_TRYLOCK) {
    594			xfs_buf_rele(bp);
    595			XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
    596			return -EAGAIN;
    597		}
    598		xfs_buf_lock(bp);
    599		XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
    600	}
    601
    602	/*
    603	 * if the buffer is stale, clear all the external state associated with
    604	 * it. We need to keep flags such as how we allocated the buffer memory
    605	 * intact here.
    606	 */
    607	if (bp->b_flags & XBF_STALE) {
    608		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
    609		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
    610		bp->b_ops = NULL;
    611	}
    612
    613	trace_xfs_buf_find(bp, flags, _RET_IP_);
    614	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
    615	*found_bp = bp;
    616	return 0;
    617}
    618
    619struct xfs_buf *
    620xfs_buf_incore(
    621	struct xfs_buftarg	*target,
    622	xfs_daddr_t		blkno,
    623	size_t			numblks,
    624	xfs_buf_flags_t		flags)
    625{
    626	struct xfs_buf		*bp;
    627	int			error;
    628	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
    629
    630	error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
    631	if (error)
    632		return NULL;
    633	return bp;
    634}
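
/*
 * Illustrative sketch, not part of the upstream file: a caller that only
 * wants to peek at the cache passes XBF_TRYLOCK so the lookup never blocks
 * on a busy buffer. A NULL return means the range is not cached (or is
 * currently locked by someone else).
 */
static inline bool
xfs_buf_example_is_cached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		blkno,
	size_t			numblks)
{
	struct xfs_buf		*bp;

	bp = xfs_buf_incore(target, blkno, numblks, XBF_TRYLOCK);
	if (!bp)
		return false;

	xfs_buf_relse(bp);	/* drop the lock and hold the lookup gave us */
	return true;
}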
    635
    636/*
    637 * Assembles a buffer covering the specified range. The code is optimised for
    638 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
    639 * more hits than misses.
    640 */
    641int
    642xfs_buf_get_map(
    643	struct xfs_buftarg	*target,
    644	struct xfs_buf_map	*map,
    645	int			nmaps,
    646	xfs_buf_flags_t		flags,
    647	struct xfs_buf		**bpp)
    648{
    649	struct xfs_buf		*bp;
    650	struct xfs_buf		*new_bp;
    651	int			error;
    652
    653	*bpp = NULL;
    654	error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
    655	if (!error)
    656		goto found;
    657	if (error != -ENOENT)
    658		return error;
    659
    660	error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
    661	if (error)
    662		return error;
    663
    664	/*
    665	 * For buffers that fit entirely within a single page, first attempt to
    666	 * allocate the memory from the heap to minimise memory usage. If we
    667	 * can't get heap memory for these small buffers, we fall back to using
    668	 * the page allocator.
    669	 */
    670	if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
    671	    xfs_buf_alloc_kmem(new_bp, flags) < 0) {
    672		error = xfs_buf_alloc_pages(new_bp, flags);
    673		if (error)
    674			goto out_free_buf;
    675	}
    676
    677	error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
    678	if (error)
    679		goto out_free_buf;
    680
    681	if (bp != new_bp)
    682		xfs_buf_free(new_bp);
    683
    684found:
    685	if (!bp->b_addr) {
    686		error = _xfs_buf_map_pages(bp, flags);
    687		if (unlikely(error)) {
    688			xfs_warn_ratelimited(target->bt_mount,
    689				"%s: failed to map %u pages", __func__,
    690				bp->b_page_count);
    691			xfs_buf_relse(bp);
    692			return error;
    693		}
    694	}
    695
    696	/*
    697	 * Clear b_error if this is a lookup from a caller that doesn't expect
    698	 * valid data to be found in the buffer.
    699	 */
    700	if (!(flags & XBF_READ))
    701		xfs_buf_ioerror(bp, 0);
    702
    703	XFS_STATS_INC(target->bt_mount, xb_get);
    704	trace_xfs_buf_get(bp, flags, _RET_IP_);
    705	*bpp = bp;
    706	return 0;
    707out_free_buf:
    708	xfs_buf_free(new_bp);
    709	return error;
    710}
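
/*
 * Illustrative sketch, not part of the upstream file: discontiguous buffers
 * are described as an array of (daddr, length) extents. The block numbers
 * and lengths below are made-up example values.
 */
static inline int
xfs_buf_example_get_discontig(
	struct xfs_buftarg	*target,
	struct xfs_buf		**bpp)
{
	struct xfs_buf_map	map[2] = {
		{ .bm_bn = 64,  .bm_len = 8 },	/* 8 BBs at daddr 64 */
		{ .bm_bn = 256, .bm_len = 8 },	/* 8 BBs at daddr 256 */
	};

	return xfs_buf_get_map(target, map, 2, 0, bpp);
}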
    711
    712int
    713_xfs_buf_read(
    714	struct xfs_buf		*bp,
    715	xfs_buf_flags_t		flags)
    716{
    717	ASSERT(!(flags & XBF_WRITE));
    718	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
    719
    720	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
    721	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
    722
    723	return xfs_buf_submit(bp);
    724}
    725
    726/*
    727 * Reverify a buffer found in cache without an attached ->b_ops.
    728 *
    729 * If the caller passed an ops structure and the buffer doesn't have ops
    730 * assigned, set the ops and use it to verify the contents. If verification
    731 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
    732 * already in XBF_DONE state on entry.
    733 *
    734 * Under normal operations, every in-core buffer is verified on read I/O
    735 * completion. There are two scenarios that can lead to in-core buffers without
    736 * an assigned ->b_ops. The first is during log recovery of buffers on a V4
    737 * filesystem, though these buffers are purged at the end of recovery. The
    738 * other is online repair, which intentionally reads with a NULL buffer ops to
    739 * run several verifiers across an in-core buffer in order to establish buffer
    740 * type.  If repair can't establish that, the buffer will be left in memory
    741 * with NULL buffer ops.
    742 */
    743int
    744xfs_buf_reverify(
    745	struct xfs_buf		*bp,
    746	const struct xfs_buf_ops *ops)
    747{
    748	ASSERT(bp->b_flags & XBF_DONE);
    749	ASSERT(bp->b_error == 0);
    750
    751	if (!ops || bp->b_ops)
    752		return 0;
    753
    754	bp->b_ops = ops;
    755	bp->b_ops->verify_read(bp);
    756	if (bp->b_error)
    757		bp->b_flags &= ~XBF_DONE;
    758	return bp->b_error;
    759}
    760
    761int
    762xfs_buf_read_map(
    763	struct xfs_buftarg	*target,
    764	struct xfs_buf_map	*map,
    765	int			nmaps,
    766	xfs_buf_flags_t		flags,
    767	struct xfs_buf		**bpp,
    768	const struct xfs_buf_ops *ops,
    769	xfs_failaddr_t		fa)
    770{
    771	struct xfs_buf		*bp;
    772	int			error;
    773
    774	flags |= XBF_READ;
    775	*bpp = NULL;
    776
    777	error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
    778	if (error)
    779		return error;
    780
    781	trace_xfs_buf_read(bp, flags, _RET_IP_);
    782
    783	if (!(bp->b_flags & XBF_DONE)) {
    784		/* Initiate the buffer read and wait. */
    785		XFS_STATS_INC(target->bt_mount, xb_get_read);
    786		bp->b_ops = ops;
    787		error = _xfs_buf_read(bp, flags);
    788
    789		/* Readahead iodone already dropped the buffer, so exit. */
    790		if (flags & XBF_ASYNC)
    791			return 0;
    792	} else {
    793		/* Buffer already read; all we need to do is check it. */
    794		error = xfs_buf_reverify(bp, ops);
    795
    796		/* Readahead already finished; drop the buffer and exit. */
    797		if (flags & XBF_ASYNC) {
    798			xfs_buf_relse(bp);
    799			return 0;
    800		}
    801
    802		/* We do not want read in the flags */
    803		bp->b_flags &= ~XBF_READ;
    804		ASSERT(bp->b_ops != NULL || ops == NULL);
    805	}
    806
    807	/*
    808	 * If we've had a read error, then the contents of the buffer are
    809	 * invalid and should not be used. To ensure that a followup read tries
    810	 * to pull the buffer from disk again, we clear the XBF_DONE flag and
    811	 * mark the buffer stale. This ensures that anyone who has a current
     812	 * reference to the buffer will interpret its contents correctly and
    813	 * future cache lookups will also treat it as an empty, uninitialised
    814	 * buffer.
    815	 */
    816	if (error) {
    817		/*
    818		 * Check against log shutdown for error reporting because
    819		 * metadata writeback may require a read first and we need to
    820		 * report errors in metadata writeback until the log is shut
    821		 * down. High level transaction read functions already check
    822		 * against mount shutdown, anyway, so we only need to be
    823		 * concerned about low level IO interactions here.
    824		 */
    825		if (!xlog_is_shutdown(target->bt_mount->m_log))
    826			xfs_buf_ioerror_alert(bp, fa);
    827
    828		bp->b_flags &= ~XBF_DONE;
    829		xfs_buf_stale(bp);
    830		xfs_buf_relse(bp);
    831
    832		/* bad CRC means corrupted metadata */
    833		if (error == -EFSBADCRC)
    834			error = -EFSCORRUPTED;
    835		return error;
    836	}
    837
    838	*bpp = bp;
    839	return 0;
    840}
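
/*
 * Illustrative sketch, not part of the upstream file: a typical verified
 * metadata read of a single extent. The verifier ("ops") would be whichever
 * xfs_buf_ops the caller's block type requires; by the time this returns,
 * a bad CRC has already been folded into -EFSCORRUPTED as described above.
 */
static inline int
xfs_buf_example_read(
	struct xfs_buftarg	*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	const struct xfs_buf_ops *ops,
	struct xfs_buf		**bpp)
{
	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);

	return xfs_buf_read_map(target, &map, 1, 0, bpp, ops, __this_address);
}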
    841
    842/*
    843 *	If we are not low on memory then do the readahead in a deadlock
    844 *	safe manner.
    845 */
    846void
    847xfs_buf_readahead_map(
    848	struct xfs_buftarg	*target,
    849	struct xfs_buf_map	*map,
    850	int			nmaps,
    851	const struct xfs_buf_ops *ops)
    852{
    853	struct xfs_buf		*bp;
    854
    855	xfs_buf_read_map(target, map, nmaps,
    856		     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
    857		     __this_address);
    858}
    859
    860/*
    861 * Read an uncached buffer from disk. Allocates and returns a locked
    862 * buffer containing the disk contents or nothing. Uncached buffers always have
    863 * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
    864 * is cached or uncached during fault diagnosis.
    865 */
    866int
    867xfs_buf_read_uncached(
    868	struct xfs_buftarg	*target,
    869	xfs_daddr_t		daddr,
    870	size_t			numblks,
    871	xfs_buf_flags_t		flags,
    872	struct xfs_buf		**bpp,
    873	const struct xfs_buf_ops *ops)
    874{
    875	struct xfs_buf		*bp;
    876	int			error;
    877
    878	*bpp = NULL;
    879
    880	error = xfs_buf_get_uncached(target, numblks, flags, &bp);
    881	if (error)
    882		return error;
    883
    884	/* set up the buffer for a read IO */
    885	ASSERT(bp->b_map_count == 1);
    886	bp->b_rhash_key = XFS_BUF_DADDR_NULL;
    887	bp->b_maps[0].bm_bn = daddr;
    888	bp->b_flags |= XBF_READ;
    889	bp->b_ops = ops;
    890
    891	xfs_buf_submit(bp);
    892	if (bp->b_error) {
    893		error = bp->b_error;
    894		xfs_buf_relse(bp);
    895		return error;
    896	}
    897
    898	*bpp = bp;
    899	return 0;
    900}
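
/*
 * Illustrative sketch, not part of the upstream file: uncached reads are how
 * XFS probes raw device blocks, e.g. checking that the last addressable block
 * of a device is actually readable. No verifier is attached, so b_ops stays
 * NULL and the caller interprets the contents itself. The single-BB length
 * here is just an example value.
 */
static inline int
xfs_buf_example_probe_daddr(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr)
{
	struct xfs_buf		*bp;
	int			error;

	error = xfs_buf_read_uncached(target, daddr, 1, 0, &bp, NULL);
	if (error)
		return error;

	xfs_buf_relse(bp);	/* uncached buffers are freed on final release */
	return 0;
}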
    901
    902int
    903xfs_buf_get_uncached(
    904	struct xfs_buftarg	*target,
    905	size_t			numblks,
    906	xfs_buf_flags_t		flags,
    907	struct xfs_buf		**bpp)
    908{
    909	int			error;
    910	struct xfs_buf		*bp;
    911	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
    912
    913	*bpp = NULL;
    914
    915	/* flags might contain irrelevant bits, pass only what we care about */
    916	error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
    917	if (error)
    918		return error;
    919
    920	error = xfs_buf_alloc_pages(bp, flags);
    921	if (error)
    922		goto fail_free_buf;
    923
    924	error = _xfs_buf_map_pages(bp, 0);
    925	if (unlikely(error)) {
    926		xfs_warn(target->bt_mount,
    927			"%s: failed to map pages", __func__);
    928		goto fail_free_buf;
    929	}
    930
    931	trace_xfs_buf_get_uncached(bp, _RET_IP_);
    932	*bpp = bp;
    933	return 0;
    934
    935fail_free_buf:
    936	xfs_buf_free(bp);
    937	return error;
    938}
    939
    940/*
    941 *	Increment reference count on buffer, to hold the buffer concurrently
    942 *	with another thread which may release (free) the buffer asynchronously.
    943 *	Must hold the buffer already to call this function.
    944 */
    945void
    946xfs_buf_hold(
    947	struct xfs_buf		*bp)
    948{
    949	trace_xfs_buf_hold(bp, _RET_IP_);
    950	atomic_inc(&bp->b_hold);
    951}
    952
    953/*
    954 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
    955 * placed on LRU or freed (depending on b_lru_ref).
    956 */
    957void
    958xfs_buf_rele(
    959	struct xfs_buf		*bp)
    960{
    961	struct xfs_perag	*pag = bp->b_pag;
    962	bool			release;
    963	bool			freebuf = false;
    964
    965	trace_xfs_buf_rele(bp, _RET_IP_);
    966
    967	if (!pag) {
    968		ASSERT(list_empty(&bp->b_lru));
    969		if (atomic_dec_and_test(&bp->b_hold)) {
    970			xfs_buf_ioacct_dec(bp);
    971			xfs_buf_free(bp);
    972		}
    973		return;
    974	}
    975
    976	ASSERT(atomic_read(&bp->b_hold) > 0);
    977
    978	/*
    979	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
    980	 * calls. The pag_buf_lock being taken on the last reference only
    981	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
    982	 * to last reference we drop here is not serialised against the last
    983	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
    984	 * first, the last "release" reference can win the race to the lock and
    985	 * free the buffer before the second-to-last reference is processed,
    986	 * leading to a use-after-free scenario.
    987	 */
    988	spin_lock(&bp->b_lock);
    989	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
    990	if (!release) {
    991		/*
    992		 * Drop the in-flight state if the buffer is already on the LRU
    993		 * and it holds the only reference. This is racy because we
     994		 * haven't acquired the pag lock, but the use of XFS_BSTATE_IN_FLIGHT
    995		 * ensures the decrement occurs only once per-buf.
    996		 */
    997		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
    998			__xfs_buf_ioacct_dec(bp);
    999		goto out_unlock;
   1000	}
   1001
   1002	/* the last reference has been dropped ... */
   1003	__xfs_buf_ioacct_dec(bp);
   1004	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
   1005		/*
   1006		 * If the buffer is added to the LRU take a new reference to the
   1007		 * buffer for the LRU and clear the (now stale) dispose list
   1008		 * state flag
   1009		 */
   1010		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
   1011			bp->b_state &= ~XFS_BSTATE_DISPOSE;
   1012			atomic_inc(&bp->b_hold);
   1013		}
   1014		spin_unlock(&pag->pag_buf_lock);
   1015	} else {
   1016		/*
   1017		 * most of the time buffers will already be removed from the
   1018		 * LRU, so optimise that case by checking for the
   1019		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
   1020		 * was on was the disposal list
   1021		 */
   1022		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
   1023			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
   1024		} else {
   1025			ASSERT(list_empty(&bp->b_lru));
   1026		}
   1027
   1028		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
   1029		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
   1030				       xfs_buf_hash_params);
   1031		spin_unlock(&pag->pag_buf_lock);
   1032		xfs_perag_put(pag);
   1033		freebuf = true;
   1034	}
   1035
   1036out_unlock:
   1037	spin_unlock(&bp->b_lock);
   1038
   1039	if (freebuf)
   1040		xfs_buf_free(bp);
   1041}
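
/*
 * Illustrative sketch, not part of the upstream file: holds and releases must
 * always pair up. A caller that keeps using a buffer after handing it off to
 * an asynchronous consumer takes its own hold first so the buffer cannot be
 * freed underneath it, exactly what __xfs_buf_submit() does further down.
 */
static inline void
xfs_buf_example_hold_cycle(
	struct xfs_buf		*bp)
{
	xfs_buf_hold(bp);	/* reference for the duration of our use */
	/* ... inspect bp, queue it somewhere, etc. ... */
	xfs_buf_rele(bp);	/* the last rele parks it on the LRU or frees it */
}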
   1042
   1043
   1044/*
   1045 *	Lock a buffer object, if it is not already locked.
   1046 *
   1047 *	If we come across a stale, pinned, locked buffer, we know that we are
   1048 *	being asked to lock a buffer that has been reallocated. Because it is
   1049 *	pinned, we know that the log has not been pushed to disk and hence it
   1050 *	will still be locked.  Rather than continuing to have trylock attempts
   1051 *	fail until someone else pushes the log, push it ourselves before
   1052 *	returning.  This means that the xfsaild will not get stuck trying
   1053 *	to push on stale inode buffers.
   1054 */
   1055int
   1056xfs_buf_trylock(
   1057	struct xfs_buf		*bp)
   1058{
   1059	int			locked;
   1060
   1061	locked = down_trylock(&bp->b_sema) == 0;
   1062	if (locked)
   1063		trace_xfs_buf_trylock(bp, _RET_IP_);
   1064	else
   1065		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
   1066	return locked;
   1067}
   1068
   1069/*
   1070 *	Lock a buffer object.
   1071 *
   1072 *	If we come across a stale, pinned, locked buffer, we know that we
   1073 *	are being asked to lock a buffer that has been reallocated. Because
   1074 *	it is pinned, we know that the log has not been pushed to disk and
   1075 *	hence it will still be locked. Rather than sleeping until someone
   1076 *	else pushes the log, push it ourselves before trying to get the lock.
   1077 */
   1078void
   1079xfs_buf_lock(
   1080	struct xfs_buf		*bp)
   1081{
   1082	trace_xfs_buf_lock(bp, _RET_IP_);
   1083
   1084	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
   1085		xfs_log_force(bp->b_mount, 0);
   1086	down(&bp->b_sema);
   1087
   1088	trace_xfs_buf_lock_done(bp, _RET_IP_);
   1089}
   1090
   1091void
   1092xfs_buf_unlock(
   1093	struct xfs_buf		*bp)
   1094{
   1095	ASSERT(xfs_buf_islocked(bp));
   1096
   1097	up(&bp->b_sema);
   1098	trace_xfs_buf_unlock(bp, _RET_IP_);
   1099}
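
/*
 * Illustrative sketch, not part of the upstream file: the non-blocking
 * pattern used by scanners such as xfsaild - if the trylock fails, skip the
 * buffer and retry later instead of sleeping on b_sema.
 */
static inline bool
xfs_buf_example_try_use(
	struct xfs_buf		*bp)
{
	if (!xfs_buf_trylock(bp))
		return false;		/* busy, come back later */

	/* ... inspect or flush the buffer here ... */

	xfs_buf_unlock(bp);
	return true;
}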
   1100
   1101STATIC void
   1102xfs_buf_wait_unpin(
   1103	struct xfs_buf		*bp)
   1104{
   1105	DECLARE_WAITQUEUE	(wait, current);
   1106
   1107	if (atomic_read(&bp->b_pin_count) == 0)
   1108		return;
   1109
   1110	add_wait_queue(&bp->b_waiters, &wait);
   1111	for (;;) {
   1112		set_current_state(TASK_UNINTERRUPTIBLE);
   1113		if (atomic_read(&bp->b_pin_count) == 0)
   1114			break;
   1115		io_schedule();
   1116	}
   1117	remove_wait_queue(&bp->b_waiters, &wait);
   1118	set_current_state(TASK_RUNNING);
   1119}
   1120
   1121static void
   1122xfs_buf_ioerror_alert_ratelimited(
   1123	struct xfs_buf		*bp)
   1124{
   1125	static unsigned long	lasttime;
   1126	static struct xfs_buftarg *lasttarg;
   1127
   1128	if (bp->b_target != lasttarg ||
   1129	    time_after(jiffies, (lasttime + 5*HZ))) {
   1130		lasttime = jiffies;
   1131		xfs_buf_ioerror_alert(bp, __this_address);
   1132	}
   1133	lasttarg = bp->b_target;
   1134}
   1135
   1136/*
   1137 * Account for this latest trip around the retry handler, and decide if
   1138 * we've failed enough times to constitute a permanent failure.
   1139 */
   1140static bool
   1141xfs_buf_ioerror_permanent(
   1142	struct xfs_buf		*bp,
   1143	struct xfs_error_cfg	*cfg)
   1144{
   1145	struct xfs_mount	*mp = bp->b_mount;
   1146
   1147	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
   1148	    ++bp->b_retries > cfg->max_retries)
   1149		return true;
   1150	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
   1151	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
   1152		return true;
   1153
   1154	/* At unmount we may treat errors differently */
   1155	if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
   1156		return true;
   1157
   1158	return false;
   1159}
   1160
   1161/*
   1162 * On a sync write or shutdown we just want to stale the buffer and let the
   1163 * caller handle the error in bp->b_error appropriately.
   1164 *
   1165 * If the write was asynchronous then no one will be looking for the error.  If
   1166 * this is the first failure of this type, clear the error state and write the
   1167 * buffer out again. This means we always retry an async write failure at least
   1168 * once, but we also need to set the buffer up to behave correctly now for
   1169 * repeated failures.
   1170 *
   1171 * If we get repeated async write failures, then we take action according to the
   1172 * error configuration we have been set up to use.
   1173 *
   1174 * Returns true if this function took care of error handling and the caller must
    1175 * not touch the buffer again.  Returns false if the caller should proceed with
   1176 * normal I/O completion handling.
   1177 */
   1178static bool
   1179xfs_buf_ioend_handle_error(
   1180	struct xfs_buf		*bp)
   1181{
   1182	struct xfs_mount	*mp = bp->b_mount;
   1183	struct xfs_error_cfg	*cfg;
   1184
   1185	/*
   1186	 * If we've already shutdown the journal because of I/O errors, there's
   1187	 * no point in giving this a retry.
   1188	 */
   1189	if (xlog_is_shutdown(mp->m_log))
   1190		goto out_stale;
   1191
   1192	xfs_buf_ioerror_alert_ratelimited(bp);
   1193
   1194	/*
   1195	 * We're not going to bother about retrying this during recovery.
   1196	 * One strike!
   1197	 */
   1198	if (bp->b_flags & _XBF_LOGRECOVERY) {
   1199		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
   1200		return false;
   1201	}
   1202
   1203	/*
   1204	 * Synchronous writes will have callers process the error.
   1205	 */
   1206	if (!(bp->b_flags & XBF_ASYNC))
   1207		goto out_stale;
   1208
   1209	trace_xfs_buf_iodone_async(bp, _RET_IP_);
   1210
   1211	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
   1212	if (bp->b_last_error != bp->b_error ||
   1213	    !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
   1214		bp->b_last_error = bp->b_error;
   1215		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
   1216		    !bp->b_first_retry_time)
   1217			bp->b_first_retry_time = jiffies;
   1218		goto resubmit;
   1219	}
   1220
   1221	/*
   1222	 * Permanent error - we need to trigger a shutdown if we haven't already
   1223	 * to indicate that inconsistency will result from this action.
   1224	 */
   1225	if (xfs_buf_ioerror_permanent(bp, cfg)) {
   1226		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
   1227		goto out_stale;
   1228	}
   1229
   1230	/* Still considered a transient error. Caller will schedule retries. */
   1231	if (bp->b_flags & _XBF_INODES)
   1232		xfs_buf_inode_io_fail(bp);
   1233	else if (bp->b_flags & _XBF_DQUOTS)
   1234		xfs_buf_dquot_io_fail(bp);
   1235	else
   1236		ASSERT(list_empty(&bp->b_li_list));
   1237	xfs_buf_ioerror(bp, 0);
   1238	xfs_buf_relse(bp);
   1239	return true;
   1240
   1241resubmit:
   1242	xfs_buf_ioerror(bp, 0);
   1243	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
   1244	xfs_buf_submit(bp);
   1245	return true;
   1246out_stale:
   1247	xfs_buf_stale(bp);
   1248	bp->b_flags |= XBF_DONE;
   1249	bp->b_flags &= ~XBF_WRITE;
   1250	trace_xfs_buf_error_relse(bp, _RET_IP_);
   1251	return false;
   1252}
   1253
   1254static void
   1255xfs_buf_ioend(
   1256	struct xfs_buf	*bp)
   1257{
   1258	trace_xfs_buf_iodone(bp, _RET_IP_);
   1259
   1260	/*
   1261	 * Pull in IO completion errors now. We are guaranteed to be running
   1262	 * single threaded, so we don't need the lock to read b_io_error.
   1263	 */
   1264	if (!bp->b_error && bp->b_io_error)
   1265		xfs_buf_ioerror(bp, bp->b_io_error);
   1266
   1267	if (bp->b_flags & XBF_READ) {
   1268		if (!bp->b_error && bp->b_ops)
   1269			bp->b_ops->verify_read(bp);
   1270		if (!bp->b_error)
   1271			bp->b_flags |= XBF_DONE;
   1272	} else {
   1273		if (!bp->b_error) {
   1274			bp->b_flags &= ~XBF_WRITE_FAIL;
   1275			bp->b_flags |= XBF_DONE;
   1276		}
   1277
   1278		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
   1279			return;
   1280
   1281		/* clear the retry state */
   1282		bp->b_last_error = 0;
   1283		bp->b_retries = 0;
   1284		bp->b_first_retry_time = 0;
   1285
   1286		/*
   1287		 * Note that for things like remote attribute buffers, there may
   1288		 * not be a buffer log item here, so processing the buffer log
   1289		 * item must remain optional.
   1290		 */
   1291		if (bp->b_log_item)
   1292			xfs_buf_item_done(bp);
   1293
   1294		if (bp->b_flags & _XBF_INODES)
   1295			xfs_buf_inode_iodone(bp);
   1296		else if (bp->b_flags & _XBF_DQUOTS)
   1297			xfs_buf_dquot_iodone(bp);
   1298
   1299	}
   1300
   1301	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
   1302			 _XBF_LOGRECOVERY);
   1303
   1304	if (bp->b_flags & XBF_ASYNC)
   1305		xfs_buf_relse(bp);
   1306	else
   1307		complete(&bp->b_iowait);
   1308}
   1309
   1310static void
   1311xfs_buf_ioend_work(
   1312	struct work_struct	*work)
   1313{
   1314	struct xfs_buf		*bp =
   1315		container_of(work, struct xfs_buf, b_ioend_work);
   1316
   1317	xfs_buf_ioend(bp);
   1318}
   1319
   1320static void
   1321xfs_buf_ioend_async(
   1322	struct xfs_buf	*bp)
   1323{
   1324	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
   1325	queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
   1326}
   1327
   1328void
   1329__xfs_buf_ioerror(
   1330	struct xfs_buf		*bp,
   1331	int			error,
   1332	xfs_failaddr_t		failaddr)
   1333{
   1334	ASSERT(error <= 0 && error >= -1000);
   1335	bp->b_error = error;
   1336	trace_xfs_buf_ioerror(bp, error, failaddr);
   1337}
   1338
   1339void
   1340xfs_buf_ioerror_alert(
   1341	struct xfs_buf		*bp,
   1342	xfs_failaddr_t		func)
   1343{
   1344	xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
   1345		"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
   1346				  func, (uint64_t)xfs_buf_daddr(bp),
   1347				  bp->b_length, -bp->b_error);
   1348}
   1349
   1350/*
   1351 * To simulate an I/O failure, the buffer must be locked and held with at least
   1352 * three references. The LRU reference is dropped by the stale call. The buf
   1353 * item reference is dropped via ioend processing. The third reference is owned
   1354 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
   1355 */
   1356void
   1357xfs_buf_ioend_fail(
   1358	struct xfs_buf	*bp)
   1359{
   1360	bp->b_flags &= ~XBF_DONE;
   1361	xfs_buf_stale(bp);
   1362	xfs_buf_ioerror(bp, -EIO);
   1363	xfs_buf_ioend(bp);
   1364}
   1365
   1366int
   1367xfs_bwrite(
   1368	struct xfs_buf		*bp)
   1369{
   1370	int			error;
   1371
   1372	ASSERT(xfs_buf_islocked(bp));
   1373
   1374	bp->b_flags |= XBF_WRITE;
   1375	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
   1376			 XBF_DONE);
   1377
   1378	error = xfs_buf_submit(bp);
   1379	if (error)
   1380		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
   1381	return error;
   1382}
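
/*
 * Illustrative sketch, not part of the upstream file: a synchronous write of
 * a buffer the caller already holds locked. xfs_bwrite() clears XBF_ASYNC, so
 * it only returns once the I/O has completed; the buffer is still locked and
 * held afterwards, so the caller releases it itself.
 */
static inline int
xfs_buf_example_sync_write(
	struct xfs_buf		*bp)
{
	int			error;

	/* ... modify the buffer contents via bp->b_addr here ... */

	error = xfs_bwrite(bp);
	xfs_buf_relse(bp);
	return error;
}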
   1383
   1384static void
   1385xfs_buf_bio_end_io(
   1386	struct bio		*bio)
   1387{
   1388	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;
   1389
   1390	if (!bio->bi_status &&
   1391	    (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
   1392	    XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
   1393		bio->bi_status = BLK_STS_IOERR;
   1394
   1395	/*
   1396	 * don't overwrite existing errors - otherwise we can lose errors on
   1397	 * buffers that require multiple bios to complete.
   1398	 */
   1399	if (bio->bi_status) {
   1400		int error = blk_status_to_errno(bio->bi_status);
   1401
   1402		cmpxchg(&bp->b_io_error, 0, error);
   1403	}
   1404
   1405	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
   1406		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
   1407
   1408	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
   1409		xfs_buf_ioend_async(bp);
   1410	bio_put(bio);
   1411}
   1412
   1413static void
   1414xfs_buf_ioapply_map(
   1415	struct xfs_buf	*bp,
   1416	int		map,
   1417	int		*buf_offset,
   1418	int		*count,
   1419	int		op)
   1420{
   1421	int		page_index;
   1422	unsigned int	total_nr_pages = bp->b_page_count;
   1423	int		nr_pages;
   1424	struct bio	*bio;
   1425	sector_t	sector =  bp->b_maps[map].bm_bn;
   1426	int		size;
   1427	int		offset;
   1428
   1429	/* skip the pages in the buffer before the start offset */
   1430	page_index = 0;
   1431	offset = *buf_offset;
   1432	while (offset >= PAGE_SIZE) {
   1433		page_index++;
   1434		offset -= PAGE_SIZE;
   1435	}
   1436
   1437	/*
   1438	 * Limit the IO size to the length of the current vector, and update the
   1439	 * remaining IO count for the next time around.
   1440	 */
   1441	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
   1442	*count -= size;
   1443	*buf_offset += size;
   1444
   1445next_chunk:
   1446	atomic_inc(&bp->b_io_remaining);
   1447	nr_pages = bio_max_segs(total_nr_pages);
   1448
   1449	bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
   1450	bio->bi_iter.bi_sector = sector;
   1451	bio->bi_end_io = xfs_buf_bio_end_io;
   1452	bio->bi_private = bp;
   1453
   1454	for (; size && nr_pages; nr_pages--, page_index++) {
   1455		int	rbytes, nbytes = PAGE_SIZE - offset;
   1456
   1457		if (nbytes > size)
   1458			nbytes = size;
   1459
   1460		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
   1461				      offset);
   1462		if (rbytes < nbytes)
   1463			break;
   1464
   1465		offset = 0;
   1466		sector += BTOBB(nbytes);
   1467		size -= nbytes;
   1468		total_nr_pages--;
   1469	}
   1470
   1471	if (likely(bio->bi_iter.bi_size)) {
   1472		if (xfs_buf_is_vmapped(bp)) {
   1473			flush_kernel_vmap_range(bp->b_addr,
   1474						xfs_buf_vmap_len(bp));
   1475		}
   1476		submit_bio(bio);
   1477		if (size)
   1478			goto next_chunk;
   1479	} else {
   1480		/*
   1481		 * This is guaranteed not to be the last io reference count
   1482		 * because the caller (xfs_buf_submit) holds a count itself.
   1483		 */
   1484		atomic_dec(&bp->b_io_remaining);
   1485		xfs_buf_ioerror(bp, -EIO);
   1486		bio_put(bio);
   1487	}
   1488
   1489}
   1490
   1491STATIC void
   1492_xfs_buf_ioapply(
   1493	struct xfs_buf	*bp)
   1494{
   1495	struct blk_plug	plug;
   1496	int		op;
   1497	int		offset;
   1498	int		size;
   1499	int		i;
   1500
   1501	/*
   1502	 * Make sure we capture only current IO errors rather than stale errors
   1503	 * left over from previous use of the buffer (e.g. failed readahead).
   1504	 */
   1505	bp->b_error = 0;
   1506
   1507	if (bp->b_flags & XBF_WRITE) {
   1508		op = REQ_OP_WRITE;
   1509
   1510		/*
   1511		 * Run the write verifier callback function if it exists. If
   1512		 * this function fails it will mark the buffer with an error and
   1513		 * the IO should not be dispatched.
   1514		 */
   1515		if (bp->b_ops) {
   1516			bp->b_ops->verify_write(bp);
   1517			if (bp->b_error) {
   1518				xfs_force_shutdown(bp->b_mount,
   1519						   SHUTDOWN_CORRUPT_INCORE);
   1520				return;
   1521			}
   1522		} else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
   1523			struct xfs_mount *mp = bp->b_mount;
   1524
   1525			/*
   1526			 * non-crc filesystems don't attach verifiers during
   1527			 * log recovery, so don't warn for such filesystems.
   1528			 */
   1529			if (xfs_has_crc(mp)) {
   1530				xfs_warn(mp,
   1531					"%s: no buf ops on daddr 0x%llx len %d",
   1532					__func__, xfs_buf_daddr(bp),
   1533					bp->b_length);
   1534				xfs_hex_dump(bp->b_addr,
   1535						XFS_CORRUPTION_DUMP_LEN);
   1536				dump_stack();
   1537			}
   1538		}
   1539	} else {
   1540		op = REQ_OP_READ;
   1541		if (bp->b_flags & XBF_READ_AHEAD)
   1542			op |= REQ_RAHEAD;
   1543	}
   1544
   1545	/* we only use the buffer cache for meta-data */
   1546	op |= REQ_META;
   1547
   1548	/*
   1549	 * Walk all the vectors issuing IO on them. Set up the initial offset
   1550	 * into the buffer and the desired IO size before we start -
    1551	 * xfs_buf_ioapply_map() will modify them appropriately for each
   1552	 * subsequent call.
   1553	 */
   1554	offset = bp->b_offset;
   1555	size = BBTOB(bp->b_length);
   1556	blk_start_plug(&plug);
   1557	for (i = 0; i < bp->b_map_count; i++) {
   1558		xfs_buf_ioapply_map(bp, i, &offset, &size, op);
   1559		if (bp->b_error)
   1560			break;
   1561		if (size <= 0)
   1562			break;	/* all done */
   1563	}
   1564	blk_finish_plug(&plug);
   1565}
   1566
   1567/*
   1568 * Wait for I/O completion of a sync buffer and return the I/O error code.
   1569 */
   1570static int
   1571xfs_buf_iowait(
   1572	struct xfs_buf	*bp)
   1573{
   1574	ASSERT(!(bp->b_flags & XBF_ASYNC));
   1575
   1576	trace_xfs_buf_iowait(bp, _RET_IP_);
   1577	wait_for_completion(&bp->b_iowait);
   1578	trace_xfs_buf_iowait_done(bp, _RET_IP_);
   1579
   1580	return bp->b_error;
   1581}
   1582
   1583/*
   1584 * Buffer I/O submission path, read or write. Asynchronous submission transfers
   1585 * the buffer lock ownership and the current reference to the IO. It is not
   1586 * safe to reference the buffer after a call to this function unless the caller
   1587 * holds an additional reference itself.
   1588 */
   1589static int
   1590__xfs_buf_submit(
   1591	struct xfs_buf	*bp,
   1592	bool		wait)
   1593{
   1594	int		error = 0;
   1595
   1596	trace_xfs_buf_submit(bp, _RET_IP_);
   1597
   1598	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
   1599
   1600	/*
   1601	 * On log shutdown we stale and complete the buffer immediately. We can
   1602	 * be called to read the superblock before the log has been set up, so
   1603	 * be careful checking the log state.
   1604	 *
   1605	 * Checking the mount shutdown state here can result in the log tail
   1606	 * moving inappropriately on disk as the log may not yet be shut down.
   1607	 * i.e. failing this buffer on mount shutdown can remove it from the AIL
   1608	 * and move the tail of the log forwards without having written this
   1609	 * buffer to disk. This corrupts the log tail state in memory, and
   1610	 * because the log may not be shut down yet, it can then be propagated
   1611	 * to disk before the log is shutdown. Hence we check log shutdown
   1612	 * state here rather than mount state to avoid corrupting the log tail
   1613	 * on shutdown.
   1614	 */
   1615	if (bp->b_mount->m_log &&
   1616	    xlog_is_shutdown(bp->b_mount->m_log)) {
   1617		xfs_buf_ioend_fail(bp);
   1618		return -EIO;
   1619	}
   1620
   1621	/*
   1622	 * Grab a reference so the buffer does not go away underneath us. For
   1623	 * async buffers, I/O completion drops the callers reference, which
   1624	 * could occur before submission returns.
   1625	 */
   1626	xfs_buf_hold(bp);
   1627
   1628	if (bp->b_flags & XBF_WRITE)
   1629		xfs_buf_wait_unpin(bp);
   1630
   1631	/* clear the internal error state to avoid spurious errors */
   1632	bp->b_io_error = 0;
   1633
   1634	/*
    1635	 * Set the count to 1 initially so that an I/O completion callout
    1636	 * which happens before we have started all the I/O cannot call
    1637	 * xfs_buf_ioend too early.
   1638	 */
   1639	atomic_set(&bp->b_io_remaining, 1);
   1640	if (bp->b_flags & XBF_ASYNC)
   1641		xfs_buf_ioacct_inc(bp);
   1642	_xfs_buf_ioapply(bp);
   1643
   1644	/*
   1645	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
   1646	 * reference we took above. If we drop it to zero, run completion so
   1647	 * that we don't return to the caller with completion still pending.
   1648	 */
   1649	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
   1650		if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
   1651			xfs_buf_ioend(bp);
   1652		else
   1653			xfs_buf_ioend_async(bp);
   1654	}
   1655
   1656	if (wait)
   1657		error = xfs_buf_iowait(bp);
   1658
   1659	/*
   1660	 * Release the hold that keeps the buffer referenced for the entire
   1661	 * I/O. Note that if the buffer is async, it is not safe to reference
   1662	 * after this release.
   1663	 */
   1664	xfs_buf_rele(bp);
   1665	return error;
   1666}
   1667
   1668void *
   1669xfs_buf_offset(
   1670	struct xfs_buf		*bp,
   1671	size_t			offset)
   1672{
   1673	struct page		*page;
   1674
   1675	if (bp->b_addr)
   1676		return bp->b_addr + offset;
   1677
   1678	page = bp->b_pages[offset >> PAGE_SHIFT];
   1679	return page_address(page) + (offset & (PAGE_SIZE-1));
   1680}
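
/*
 * Illustrative sketch, not part of the upstream file: code that can be handed
 * an XBF_UNMAPPED buffer must go through xfs_buf_offset() instead of
 * dereferencing b_addr, which is NULL for unmapped multi-page buffers. For an
 * unmapped buffer the returned pointer is only good up to the end of the
 * containing page.
 */
static inline __be32
xfs_buf_example_peek_be32(
	struct xfs_buf		*bp,
	size_t			offset)
{
	__be32			*p = xfs_buf_offset(bp, offset);

	return *p;
}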
   1681
   1682void
   1683xfs_buf_zero(
   1684	struct xfs_buf		*bp,
   1685	size_t			boff,
   1686	size_t			bsize)
   1687{
   1688	size_t			bend;
   1689
   1690	bend = boff + bsize;
   1691	while (boff < bend) {
   1692		struct page	*page;
   1693		int		page_index, page_offset, csize;
   1694
   1695		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
   1696		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
   1697		page = bp->b_pages[page_index];
   1698		csize = min_t(size_t, PAGE_SIZE - page_offset,
   1699				      BBTOB(bp->b_length) - boff);
   1700
   1701		ASSERT((csize + page_offset) <= PAGE_SIZE);
   1702
   1703		memset(page_address(page) + page_offset, 0, csize);
   1704
   1705		boff += csize;
   1706	}
   1707}
   1708
   1709/*
   1710 * Log a message about and stale a buffer that a caller has decided is corrupt.
   1711 *
   1712 * This function should be called for the kinds of metadata corruption that
    1713 * cannot be detected by a verifier, such as incorrect inter-block relationship
   1714 * data.  Do /not/ call this function from a verifier function.
   1715 *
   1716 * The buffer must be XBF_DONE prior to the call.  Afterwards, the buffer will
   1717 * be marked stale, but b_error will not be set.  The caller is responsible for
   1718 * releasing the buffer or fixing it.
   1719 */
   1720void
   1721__xfs_buf_mark_corrupt(
   1722	struct xfs_buf		*bp,
   1723	xfs_failaddr_t		fa)
   1724{
   1725	ASSERT(bp->b_flags & XBF_DONE);
   1726
   1727	xfs_buf_corruption_error(bp, fa);
   1728	xfs_buf_stale(bp);
   1729}
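
/*
 * Illustrative sketch, not part of the upstream file: a caller that finds an
 * inter-block inconsistency (something no single-buffer verifier can see)
 * marks the buffer corrupt and then releases it; b_error is left at zero, so
 * the caller decides what error to propagate.
 */
static inline void
xfs_buf_example_reject(
	struct xfs_buf		*bp)
{
	__xfs_buf_mark_corrupt(bp, __this_address);	/* log + stale */
	xfs_buf_relse(bp);
}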
   1730
   1731/*
   1732 *	Handling of buffer targets (buftargs).
   1733 */
   1734
   1735/*
   1736 * Wait for any bufs with callbacks that have been submitted but have not yet
   1737 * returned. These buffers will have an elevated hold count, so wait on those
   1738 * while freeing all the buffers only held by the LRU.
   1739 */
   1740static enum lru_status
   1741xfs_buftarg_drain_rele(
   1742	struct list_head	*item,
   1743	struct list_lru_one	*lru,
   1744	spinlock_t		*lru_lock,
   1745	void			*arg)
   1746
   1747{
   1748	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
   1749	struct list_head	*dispose = arg;
   1750
   1751	if (atomic_read(&bp->b_hold) > 1) {
   1752		/* need to wait, so skip it this pass */
   1753		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
   1754		return LRU_SKIP;
   1755	}
   1756	if (!spin_trylock(&bp->b_lock))
   1757		return LRU_SKIP;
   1758
   1759	/*
   1760	 * clear the LRU reference count so the buffer doesn't get
   1761	 * ignored in xfs_buf_rele().
   1762	 */
   1763	atomic_set(&bp->b_lru_ref, 0);
   1764	bp->b_state |= XFS_BSTATE_DISPOSE;
   1765	list_lru_isolate_move(lru, item, dispose);
   1766	spin_unlock(&bp->b_lock);
   1767	return LRU_REMOVED;
   1768}
   1769
   1770/*
   1771 * Wait for outstanding I/O on the buftarg to complete.
   1772 */
   1773void
   1774xfs_buftarg_wait(
   1775	struct xfs_buftarg	*btp)
   1776{
   1777	/*
   1778	 * First wait on the buftarg I/O count for all in-flight buffers to be
   1779	 * released. This is critical as new buffers do not make the LRU until
   1780	 * they are released.
   1781	 *
   1782	 * Next, flush the buffer workqueue to ensure all completion processing
   1783	 * has finished. Just waiting on buffer locks is not sufficient for
   1784	 * async IO as the reference count held over IO is not released until
   1785	 * after the buffer lock is dropped. Hence we need to ensure here that
   1786	 * all reference counts have been dropped before we start walking the
   1787	 * LRU list.
   1788	 */
   1789	while (percpu_counter_sum(&btp->bt_io_count))
   1790		delay(100);
   1791	flush_workqueue(btp->bt_mount->m_buf_workqueue);
   1792}
   1793
   1794void
   1795xfs_buftarg_drain(
   1796	struct xfs_buftarg	*btp)
   1797{
   1798	LIST_HEAD(dispose);
   1799	int			loop = 0;
   1800	bool			write_fail = false;
   1801
   1802	xfs_buftarg_wait(btp);
   1803
   1804	/* loop until there is nothing left on the lru list. */
   1805	while (list_lru_count(&btp->bt_lru)) {
   1806		list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
   1807			      &dispose, LONG_MAX);
   1808
   1809		while (!list_empty(&dispose)) {
   1810			struct xfs_buf *bp;
   1811			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
   1812			list_del_init(&bp->b_lru);
   1813			if (bp->b_flags & XBF_WRITE_FAIL) {
   1814				write_fail = true;
   1815				xfs_buf_alert_ratelimited(bp,
   1816					"XFS: Corruption Alert",
   1817"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
   1818					(long long)xfs_buf_daddr(bp));
   1819			}
   1820			xfs_buf_rele(bp);
   1821		}
   1822		if (loop++ != 0)
   1823			delay(100);
   1824	}
   1825
   1826	/*
   1827	 * If one or more failed buffers were freed, that means dirty metadata
   1828	 * was thrown away. This should only ever happen after I/O completion
   1829	 * handling has elevated I/O error(s) to permanent failures and shut
   1830	 * down the journal.
   1831	 */
   1832	if (write_fail) {
   1833		ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
   1834		xfs_alert(btp->bt_mount,
   1835	      "Please run xfs_repair to determine the extent of the problem.");
   1836	}
   1837}
   1838
   1839static enum lru_status
   1840xfs_buftarg_isolate(
   1841	struct list_head	*item,
   1842	struct list_lru_one	*lru,
   1843	spinlock_t		*lru_lock,
   1844	void			*arg)
   1845{
   1846	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
   1847	struct list_head	*dispose = arg;
   1848
   1849	/*
   1850	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
   1851	 * If we fail to get the lock, just skip it.
   1852	 */
   1853	if (!spin_trylock(&bp->b_lock))
   1854		return LRU_SKIP;
   1855	/*
   1856	 * Decrement the b_lru_ref count unless the value is already
   1857	 * zero. If the value is already zero, we need to reclaim the
   1858	 * buffer, otherwise it gets another trip through the LRU.
   1859	 */
   1860	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
   1861		spin_unlock(&bp->b_lock);
   1862		return LRU_ROTATE;
   1863	}
   1864
   1865	bp->b_state |= XFS_BSTATE_DISPOSE;
   1866	list_lru_isolate_move(lru, item, dispose);
   1867	spin_unlock(&bp->b_lock);
   1868	return LRU_REMOVED;
   1869}
   1870
   1871static unsigned long
   1872xfs_buftarg_shrink_scan(
   1873	struct shrinker		*shrink,
   1874	struct shrink_control	*sc)
   1875{
   1876	struct xfs_buftarg	*btp = container_of(shrink,
   1877					struct xfs_buftarg, bt_shrinker);
   1878	LIST_HEAD(dispose);
   1879	unsigned long		freed;
   1880
   1881	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
   1882				     xfs_buftarg_isolate, &dispose);
   1883
   1884	while (!list_empty(&dispose)) {
   1885		struct xfs_buf *bp;
   1886		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
   1887		list_del_init(&bp->b_lru);
   1888		xfs_buf_rele(bp);
   1889	}
   1890
   1891	return freed;
   1892}
   1893
   1894static unsigned long
   1895xfs_buftarg_shrink_count(
   1896	struct shrinker		*shrink,
   1897	struct shrink_control	*sc)
   1898{
   1899	struct xfs_buftarg	*btp = container_of(shrink,
   1900					struct xfs_buftarg, bt_shrinker);
   1901	return list_lru_shrink_count(&btp->bt_lru, sc);
   1902}
   1903
   1904void
   1905xfs_free_buftarg(
   1906	struct xfs_buftarg	*btp)
   1907{
   1908	unregister_shrinker(&btp->bt_shrinker);
   1909	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
   1910	percpu_counter_destroy(&btp->bt_io_count);
   1911	list_lru_destroy(&btp->bt_lru);
   1912
   1913	blkdev_issue_flush(btp->bt_bdev);
   1914	fs_put_dax(btp->bt_daxdev);
   1915
   1916	kmem_free(btp);
   1917}
   1918
   1919int
   1920xfs_setsize_buftarg(
   1921	xfs_buftarg_t		*btp,
   1922	unsigned int		sectorsize)
   1923{
   1924	/* Set up metadata sector size info */
   1925	btp->bt_meta_sectorsize = sectorsize;
   1926	btp->bt_meta_sectormask = sectorsize - 1;
   1927
   1928	if (set_blocksize(btp->bt_bdev, sectorsize)) {
   1929		xfs_warn(btp->bt_mount,
   1930			"Cannot set_blocksize to %u on device %pg",
   1931			sectorsize, btp->bt_bdev);
   1932		return -EINVAL;
   1933	}
   1934
   1935	/* Set up device logical sector size mask */
   1936	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
   1937	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
   1938
   1939	return 0;
   1940}
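
/*
 * Editorial sketch (not part of xfs_buf.c): the sector masks set up above
 * support power-of-two alignment checks, e.g. sectorsize 512 gives
 * bt_meta_sectormask 0x1ff.  This helper is hypothetical and exists purely
 * to illustrate the arithmetic.
 */
static inline bool
example_offset_is_sector_aligned(
	struct xfs_buftarg	*btp,
	uint64_t		byte_offset)
{
	/* aligned iff the low (sectorsize - 1) bits are all clear */
	return (byte_offset & (uint64_t)btp->bt_meta_sectormask) == 0;
}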
   1941
   1942/*
   1943 * When allocating the initial buffer target we have not yet
   1944 * read in the superblock, so we don't know what sector size
   1945 * is being used at this early stage.  Play safe.
   1946 */
   1947STATIC int
   1948xfs_setsize_buftarg_early(
   1949	xfs_buftarg_t		*btp,
   1950	struct block_device	*bdev)
   1951{
   1952	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
   1953}
   1954
   1955struct xfs_buftarg *
   1956xfs_alloc_buftarg(
   1957	struct xfs_mount	*mp,
   1958	struct block_device	*bdev)
   1959{
   1960	xfs_buftarg_t		*btp;
   1961
   1962	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
   1963
   1964	btp->bt_mount = mp;
   1965	btp->bt_dev =  bdev->bd_dev;
   1966	btp->bt_bdev = bdev;
   1967	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off);
   1968
   1969	/*
   1970	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
   1971	 * per 30 seconds so as to not spam logs too much on repeated errors.
   1972	 */
   1973	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
   1974			     DEFAULT_RATELIMIT_BURST);
   1975
   1976	if (xfs_setsize_buftarg_early(btp, bdev))
   1977		goto error_free;
   1978
   1979	if (list_lru_init(&btp->bt_lru))
   1980		goto error_free;
   1981
   1982	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
   1983		goto error_lru;
   1984
   1985	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
   1986	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
   1987	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
   1988	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
   1989	if (register_shrinker(&btp->bt_shrinker))
   1990		goto error_pcpu;
   1991	return btp;
   1992
   1993error_pcpu:
   1994	percpu_counter_destroy(&btp->bt_io_count);
   1995error_lru:
   1996	list_lru_destroy(&btp->bt_lru);
   1997error_free:
   1998	kmem_free(btp);
   1999	return NULL;
   2000}
   2001
   2002/*
   2003 * Cancel a delayed write list.
   2004 *
   2005 * Remove each buffer from the list, clear the delwri queue flag and drop the
   2006 * associated buffer reference.
   2007 */
   2008void
   2009xfs_buf_delwri_cancel(
   2010	struct list_head	*list)
   2011{
   2012	struct xfs_buf		*bp;
   2013
   2014	while (!list_empty(list)) {
   2015		bp = list_first_entry(list, struct xfs_buf, b_list);
   2016
   2017		xfs_buf_lock(bp);
   2018		bp->b_flags &= ~_XBF_DELWRI_Q;
   2019		list_del_init(&bp->b_list);
   2020		xfs_buf_relse(bp);
   2021	}
   2022}
   2023
   2024/*
   2025 * Add a buffer to the delayed write list.
   2026 *
   2027 * This queues a buffer for writeout if it hasn't already been.  Note that
   2028 * neither this routine nor the buffer list submission functions perform
   2029 * any internal synchronization.  It is expected that the lists are thread-local
   2030 * to the callers.
   2031 *
   2032 * Returns true if we queued up the buffer, or false if it was already
   2033 * on the buffer list.
   2034 */
   2035bool
   2036xfs_buf_delwri_queue(
   2037	struct xfs_buf		*bp,
   2038	struct list_head	*list)
   2039{
   2040	ASSERT(xfs_buf_islocked(bp));
   2041	ASSERT(!(bp->b_flags & XBF_READ));
   2042
   2043	/*
   2044	 * If the buffer is already marked delwri, it has already been queued
   2045	 * up by someone else for immediate writeout.  Just ignore it in that
   2046	 * case.
   2047	 */
   2048	if (bp->b_flags & _XBF_DELWRI_Q) {
   2049		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
   2050		return false;
   2051	}
   2052
   2053	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
   2054
   2055	/*
   2056	 * If a buffer gets written out synchronously or marked stale while it
   2057	 * is on a delwri list we lazily remove it. To do this, the other party
   2058	 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
   2059	 * It remains referenced and on the list.  In a rare corner case it
   2060	 * might get re-added to a delwri list after the synchronous writeout, in
   2061	 * which case we just need to re-add the flag here.
   2062	 */
   2063	bp->b_flags |= _XBF_DELWRI_Q;
   2064	if (list_empty(&bp->b_list)) {
   2065		atomic_inc(&bp->b_hold);
   2066		list_add_tail(&bp->b_list, list);
   2067	}
   2068
   2069	return true;
   2070}
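
/*
 * Editorial sketch (not part of xfs_buf.c): the typical delwri pattern the
 * comments above describe - queue locked buffers onto a caller-private list,
 * then submit the whole list in one go.  example_flush_buffers() and its
 * arguments are hypothetical; only the xfs_buf_* calls come from this file.
 */
static int
example_flush_buffers(
	struct xfs_buf		**bufs,
	int			nbufs)
{
	LIST_HEAD(buffer_list);
	int			i;

	for (i = 0; i < nbufs; i++) {
		struct xfs_buf	*bp = bufs[i];

		xfs_buf_lock(bp);
		/* takes its own hold reference if the buffer was not queued */
		xfs_buf_delwri_queue(bp, &buffer_list);
		xfs_buf_unlock(bp);
	}

	/* write everything out synchronously; consumes buffer_list */
	return xfs_buf_delwri_submit(&buffer_list);
}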
   2071
   2072/*
   2073 * Compare function is more complex than it needs to be because
   2074 * the return value is only 32 bits and we are doing comparisons
   2075 * on 64 bit values
   2076 */
   2077static int
   2078xfs_buf_cmp(
   2079	void			*priv,
   2080	const struct list_head	*a,
   2081	const struct list_head	*b)
   2082{
   2083	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
   2084	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
   2085	xfs_daddr_t		diff;
   2086
   2087	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
   2088	if (diff < 0)
   2089		return -1;
   2090	if (diff > 0)
   2091		return 1;
   2092	return 0;
   2093}
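
/*
 * Editorial note (not part of xfs_buf.c): returning "diff" directly would
 * truncate the 64-bit difference to the 32-bit int return value; a difference
 * of 0x100000000, for instance, would come back as 0 and two distinct daddrs
 * would sort as equal.  Collapsing to -1/0/+1 above avoids that.
 */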
   2094
   2095/*
   2096 * Submit buffers for write. If wait_list is specified, the buffers are
   2097 * submitted using sync I/O and placed on the wait list such that the caller can
   2098 * iowait each buffer. Otherwise async I/O is used and the buffers are released
   2099 * at I/O completion time. In either case, buffers remain locked until I/O
   2100 * completes and the buffer is released from the queue.
   2101 */
   2102static int
   2103xfs_buf_delwri_submit_buffers(
   2104	struct list_head	*buffer_list,
   2105	struct list_head	*wait_list)
   2106{
   2107	struct xfs_buf		*bp, *n;
   2108	int			pinned = 0;
   2109	struct blk_plug		plug;
   2110
   2111	list_sort(NULL, buffer_list, xfs_buf_cmp);
   2112
   2113	blk_start_plug(&plug);
   2114	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
   2115		if (!wait_list) {
   2116			if (!xfs_buf_trylock(bp))
   2117				continue;
   2118			if (xfs_buf_ispinned(bp)) {
   2119				xfs_buf_unlock(bp);
   2120				pinned++;
   2121				continue;
   2122			}
   2123		} else {
   2124			xfs_buf_lock(bp);
   2125		}
   2126
   2127		/*
   2128		 * Someone else might have written the buffer synchronously or
   2129		 * marked it stale in the meantime.  In that case only the
   2130		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
   2131		 * reference and remove it from the list here.
   2132		 */
   2133		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
   2134			list_del_init(&bp->b_list);
   2135			xfs_buf_relse(bp);
   2136			continue;
   2137		}
   2138
   2139		trace_xfs_buf_delwri_split(bp, _RET_IP_);
   2140
   2141		/*
   2142		 * If we have a wait list, each buffer (and associated delwri
   2143		 * queue reference) transfers to it and is submitted
   2144		 * synchronously. Otherwise, drop the buffer from the delwri
   2145		 * queue and submit async.
   2146		 */
   2147		bp->b_flags &= ~_XBF_DELWRI_Q;
   2148		bp->b_flags |= XBF_WRITE;
   2149		if (wait_list) {
   2150			bp->b_flags &= ~XBF_ASYNC;
   2151			list_move_tail(&bp->b_list, wait_list);
   2152		} else {
   2153			bp->b_flags |= XBF_ASYNC;
   2154			list_del_init(&bp->b_list);
   2155		}
   2156		__xfs_buf_submit(bp, false);
   2157	}
   2158	blk_finish_plug(&plug);
   2159
   2160	return pinned;
   2161}
   2162
   2163/*
   2164 * Write out a buffer list asynchronously.
   2165 *
   2166 * This will take the @buffer_list, write all non-locked and non-pinned buffers
   2167 * out and not wait for I/O completion on any of the buffers.  This interface
   2168 * is only safely usable for callers that can track I/O completion by higher
   2169 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
   2170 * function.
   2171 *
   2172 * Note: this function will skip buffers it would block on, and in doing so
   2173 * leaves them on @buffer_list so they can be retried on a later pass. As such,
   2174 * it is up to the caller to ensure that the buffer list is fully submitted or
   2175 * cancelled appropriately when they are finished with the list. Failure to
   2176 * cancel or resubmit the list until it is empty will result in leaked buffers
   2177 * at unmount time.
   2178 */
   2179int
   2180xfs_buf_delwri_submit_nowait(
   2181	struct list_head	*buffer_list)
   2182{
   2183	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
   2184}
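
/*
 * Editorial sketch (not part of xfs_buf.c): the retry-until-empty usage the
 * note above requires.  Skipped (locked or pinned) buffers stay on the list,
 * so the caller keeps resubmitting and cancels whatever is left at the end.
 * example_push_until_empty() and the retry bound are hypothetical.
 */
static void
example_push_until_empty(
	struct list_head	*buffer_list)
{
	int			retries = 10;	/* arbitrary illustration bound */

	while (!list_empty(buffer_list) && retries-- > 0) {
		xfs_buf_delwri_submit_nowait(buffer_list);
		/* skipped buffers remain queued; wait and try again */
		delay(100);
	}

	/* per the note above: never leak a partially submitted list */
	xfs_buf_delwri_cancel(buffer_list);
}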
   2185
   2186/*
   2187 * Write out a buffer list synchronously.
   2188 *
   2189 * This will take the @buffer_list, write all buffers out and wait for I/O
   2190 * completion on all of the buffers. @buffer_list is consumed by the function,
   2191 * so callers must have some other way of tracking buffers if they require such
   2192 * functionality.
   2193 */
   2194int
   2195xfs_buf_delwri_submit(
   2196	struct list_head	*buffer_list)
   2197{
   2198	LIST_HEAD		(wait_list);
   2199	int			error = 0, error2;
   2200	struct xfs_buf		*bp;
   2201
   2202	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
   2203
   2204	/* Wait for IO to complete. */
   2205	while (!list_empty(&wait_list)) {
   2206		bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
   2207
   2208		list_del_init(&bp->b_list);
   2209
   2210		/*
   2211		 * Wait on the locked buffer, check for errors and unlock and
   2212		 * release the delwri queue reference.
   2213		 */
   2214		error2 = xfs_buf_iowait(bp);
   2215		xfs_buf_relse(bp);
   2216		if (!error)
   2217			error = error2;
   2218	}
   2219
   2220	return error;
   2221}
   2222
   2223/*
   2224 * Push a single buffer on a delwri queue.
   2225 *
   2226 * The purpose of this function is to submit a single buffer of a delwri queue
   2227 * and return with the buffer still on the original queue. The waiting delwri
   2228 * buffer submission infrastructure guarantees transfer of the delwri queue
   2229 * buffer reference to a temporary wait list. We reuse this infrastructure to
   2230 * transfer the buffer back to the original queue.
   2231 *
   2232 * Note the buffer transitions from the queued state, to the submitted and wait
   2233 * listed state and back to the queued state during this call. The buffer
   2234 * locking and queue management logic between _delwri_pushbuf() and
   2235 * _delwri_queue() guarantee that the buffer cannot be queued to another list
   2236 * before returning.
   2237 */
   2238int
   2239xfs_buf_delwri_pushbuf(
   2240	struct xfs_buf		*bp,
   2241	struct list_head	*buffer_list)
   2242{
   2243	LIST_HEAD		(submit_list);
   2244	int			error;
   2245
   2246	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
   2247
   2248	trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
   2249
   2250	/*
   2251	 * Isolate the buffer to a new local list so we can submit it for I/O
   2252	 * independently from the rest of the original list.
   2253	 */
   2254	xfs_buf_lock(bp);
   2255	list_move(&bp->b_list, &submit_list);
   2256	xfs_buf_unlock(bp);
   2257
   2258	/*
   2259	 * Delwri submission clears the DELWRI_Q buffer flag and returns with
   2260	 * the buffer on the wait list with the original reference. Rather than
   2261	 * bounce the buffer from a local wait list back to the original list
   2262	 * after I/O completion, reuse the original list as the wait list.
   2263	 */
   2264	xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
   2265
   2266	/*
   2267	 * The buffer is now locked, under I/O and wait listed on the original
   2268	 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
   2269	 * return with the buffer unlocked and on the original queue.
   2270	 */
   2271	error = xfs_buf_iowait(bp);
   2272	bp->b_flags |= _XBF_DELWRI_Q;
   2273	xfs_buf_unlock(bp);
   2274
   2275	return error;
   2276}
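
/*
 * Editorial sketch (not part of xfs_buf.c): the pre- and post-conditions of
 * xfs_buf_delwri_pushbuf() above, seen from a caller that wants one queued
 * buffer written immediately while keeping it on its delwri queue.
 * example_push_one() is hypothetical.
 */
static int
example_push_one(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	/* precondition: bp is on buffer_list with _XBF_DELWRI_Q set */
	ASSERT(bp->b_flags & _XBF_DELWRI_Q);

	/*
	 * Submits bp, waits for the I/O, and returns with bp unlocked and
	 * back on buffer_list, still delwri queued.
	 */
	return xfs_buf_delwri_pushbuf(bp, buffer_list);
}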
   2277
   2278int __init
   2279xfs_buf_init(void)
   2280{
   2281	xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
   2282					 SLAB_HWCACHE_ALIGN |
   2283					 SLAB_RECLAIM_ACCOUNT |
   2284					 SLAB_MEM_SPREAD,
   2285					 NULL);
   2286	if (!xfs_buf_cache)
   2287		goto out;
   2288
   2289	return 0;
   2290
   2291 out:
   2292	return -ENOMEM;
   2293}
   2294
   2295void
   2296xfs_buf_terminate(void)
   2297{
   2298	kmem_cache_destroy(xfs_buf_cache);
   2299}
   2300
   2301void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
   2302{
   2303	/*
   2304	 * Set the lru reference count to 0 based on the error injection tag.
   2305	 * This allows userspace to disrupt buffer caching for debug/testing
   2306	 * purposes.
   2307	 */
   2308	if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
   2309		lru_ref = 0;
   2310
   2311	atomic_set(&bp->b_lru_ref, lru_ref);
   2312}
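
/*
 * Editorial sketch (not part of xfs_buf.c): xfs_buf_set_ref() cooperates with
 * xfs_buftarg_isolate() above - each shrinker pass decrements b_lru_ref and a
 * buffer is only reclaimed once it reaches zero, so a larger value keeps hot
 * metadata cached across more memory-pressure passes.  The constant and
 * helper below are hypothetical; in-tree callers pass values such as the
 * XFS_*_REF constants.
 */
#define EXAMPLE_HOT_BLOCK_REF	3	/* survive roughly 3 shrinker passes */

static void
example_cache_hot_block(
	struct xfs_buf		*bp)
{
	xfs_buf_set_ref(bp, EXAMPLE_HOT_BLOCK_REF);
}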
   2313
   2314/*
   2315 * Verify an on-disk magic value against the magic value specified in the
   2316 * verifier structure. The verifier magic is in disk byte order so the caller is
   2317 * expected to pass the value directly from disk.
   2318 */
   2319bool
   2320xfs_verify_magic(
   2321	struct xfs_buf		*bp,
   2322	__be32			dmagic)
   2323{
   2324	struct xfs_mount	*mp = bp->b_mount;
   2325	int			idx;
   2326
   2327	idx = xfs_has_crc(mp);
   2328	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
   2329		return false;
   2330	return dmagic == bp->b_ops->magic[idx];
   2331}
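
/*
 * Editorial sketch (not part of xfs_buf.c): how a buffer verifier might use
 * xfs_verify_magic().  The header structure, magic numbers and ops table
 * below are hypothetical; real verifiers live in the libxfs xfs_*.c files.
 */
struct example_hdr {
	__be32			magic;
};

static void
example_write_verify(
	struct xfs_buf		*bp)
{
	struct example_hdr	*hdr = bp->b_addr;

	/* picks magic[0] or magic[1] depending on xfs_has_crc() */
	if (!xfs_verify_magic(bp, hdr->magic))
		xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
}

static const struct xfs_buf_ops example_buf_ops = {
	.name		= "example",
	.magic		= { cpu_to_be32(0x58455831),	/* !hascrc variant */
			    cpu_to_be32(0x58455832) },	/* hascrc variant */
	.verify_write	= example_write_verify,
};
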
   2332/*
   2333 * Verify an on-disk magic value against the magic value specified in the
   2334 * verifier structure. The verifier magic is in disk byte order so the caller is
   2335 * expected to pass the value directly from disk.
   2336 */
   2337bool
   2338xfs_verify_magic16(
   2339	struct xfs_buf		*bp,
   2340	__be16			dmagic)
   2341{
   2342	struct xfs_mount	*mp = bp->b_mount;
   2343	int			idx;
   2344
   2345	idx = xfs_has_crc(mp);
   2346	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
   2347		return false;
   2348	return dmagic == bp->b_ops->magic16[idx];
   2349}