cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

pnfs.c (89960B)


      1/*
      2 *  pNFS functions to call and manage layout drivers.
      3 *
      4 *  Copyright (c) 2002 [year of first publication]
      5 *  The Regents of the University of Michigan
      6 *  All Rights Reserved
      7 *
      8 *  Dean Hildebrand <dhildebz@umich.edu>
      9 *
     10 *  Permission is granted to use, copy, create derivative works, and
     11 *  redistribute this software and such derivative works for any purpose,
     12 *  so long as the name of the University of Michigan is not used in
     13 *  any advertising or publicity pertaining to the use or distribution
     14 *  of this software without specific, written prior authorization. If
     15 *  the above copyright notice or any other identification of the
     16 *  University of Michigan is included in any copy of any portion of
     17 *  this software, then the disclaimer below must also be included.
     18 *
     19 *  This software is provided as is, without representation or warranty
     20 *  of any kind either express or implied, including without limitation
     21 *  the implied warranties of merchantability, fitness for a particular
     22 *  purpose, or noninfringement.  The Regents of the University of
     23 *  Michigan shall not be liable for any damages, including special,
     24 *  indirect, incidental, or consequential damages, with respect to any
     25 *  claim arising out of or in connection with the use of the software,
     26 *  even if it has been or is hereafter advised of the possibility of
     27 *  such damages.
     28 */
     29
     30#include <linux/nfs_fs.h>
     31#include <linux/nfs_page.h>
     32#include <linux/module.h>
     33#include <linux/sort.h>
     34#include "internal.h"
     35#include "pnfs.h"
     36#include "iostat.h"
     37#include "nfs4trace.h"
     38#include "delegation.h"
     39#include "nfs42.h"
     40#include "nfs4_fs.h"
     41
     42#define NFSDBG_FACILITY		NFSDBG_PNFS
     43#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
     44
     45/* Locking:
     46 *
     47 * pnfs_spinlock:
     48 *      protects pnfs_modules_tbl.
     49 */
     50static DEFINE_SPINLOCK(pnfs_spinlock);
     51
     52/*
     53 * pnfs_modules_tbl holds all pnfs modules
     54 */
     55static LIST_HEAD(pnfs_modules_tbl);
     56
     57static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
     58static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
     59		struct list_head *free_me,
     60		const struct pnfs_layout_range *range,
     61		u32 seq);
     62static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
      63		struct list_head *tmp_list);
     64
     65/* Return the registered pnfs layout driver module matching given id */
     66static struct pnfs_layoutdriver_type *
     67find_pnfs_driver_locked(u32 id)
     68{
     69	struct pnfs_layoutdriver_type *local;
     70
     71	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
     72		if (local->id == id)
     73			goto out;
     74	local = NULL;
     75out:
     76	dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
     77	return local;
     78}
     79
     80static struct pnfs_layoutdriver_type *
     81find_pnfs_driver(u32 id)
     82{
     83	struct pnfs_layoutdriver_type *local;
     84
     85	spin_lock(&pnfs_spinlock);
     86	local = find_pnfs_driver_locked(id);
     87	if (local != NULL && !try_module_get(local->owner)) {
     88		dprintk("%s: Could not grab reference on module\n", __func__);
     89		local = NULL;
     90	}
     91	spin_unlock(&pnfs_spinlock);
     92	return local;
     93}
     94
     95const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id)
     96{
     97	return find_pnfs_driver(id);
     98}
     99
    100void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld)
    101{
    102	if (ld)
    103		module_put(ld->owner);
    104}
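
/*
 * Usage sketch (editorial note, not part of the original file):
 * pnfs_find_layoutdriver() pins the driver module via try_module_get(),
 * so each successful lookup must be paired with pnfs_put_layoutdriver():
 *
 *	const struct pnfs_layoutdriver_type *ld;
 *
 *	ld = pnfs_find_layoutdriver(LAYOUT_NFSV4_1_FILES);
 *	if (ld) {
 *		... use ld->id, ld->name, etc. ...
 *		pnfs_put_layoutdriver(ld);
 *	}
 */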
    105
    106void
    107unset_pnfs_layoutdriver(struct nfs_server *nfss)
    108{
    109	if (nfss->pnfs_curr_ld) {
    110		if (nfss->pnfs_curr_ld->clear_layoutdriver)
    111			nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
    112		/* Decrement the MDS count. Purge the deviceid cache if zero */
    113		if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
    114			nfs4_deviceid_purge_client(nfss->nfs_client);
    115		module_put(nfss->pnfs_curr_ld->owner);
    116	}
    117	nfss->pnfs_curr_ld = NULL;
    118}
    119
    120/*
    121 * When the server sends a list of layout types, we choose one in the order
    122 * given in the list below.
    123 *
    124 * FIXME: should this list be configurable in some fashion? module param?
    125 * 	  mount option? something else?
    126 */
    127static const u32 ld_prefs[] = {
    128	LAYOUT_SCSI,
    129	LAYOUT_BLOCK_VOLUME,
    130	LAYOUT_OSD2_OBJECTS,
    131	LAYOUT_FLEX_FILES,
    132	LAYOUT_NFSV4_1_FILES,
    133	0
    134};
    135
    136static int
    137ld_cmp(const void *e1, const void *e2)
    138{
    139	u32 ld1 = *((u32 *)e1);
    140	u32 ld2 = *((u32 *)e2);
    141	int i;
    142
    143	for (i = 0; ld_prefs[i] != 0; i++) {
    144		if (ld1 == ld_prefs[i])
    145			return -1;
    146
    147		if (ld2 == ld_prefs[i])
    148			return 1;
    149	}
    150	return 0;
    151}
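
/*
 * Worked example (editorial note, not part of the original file):
 * sorting the MDS-supplied array { LAYOUT_NFSV4_1_FILES, LAYOUT_FLEX_FILES,
 * LAYOUT_SCSI } with ld_cmp() yields { LAYOUT_SCSI, LAYOUT_FLEX_FILES,
 * LAYOUT_NFSV4_1_FILES }, since ld_cmp() returns -1 for whichever element
 * appears first in ld_prefs[]. Types absent from ld_prefs[] compare equal,
 * so their relative order after sort() is unspecified.
 */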
    152
    153/*
    154 * Try to set the server's pnfs module to the pnfs layout type specified by id.
    155 * Currently only one pNFS layout driver per filesystem is supported.
    156 *
    157 * @ids array of layout types supported by MDS.
    158 */
    159void
    160set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
    161		      struct nfs_fsinfo *fsinfo)
    162{
    163	struct pnfs_layoutdriver_type *ld_type = NULL;
    164	u32 id;
    165	int i;
    166
    167	if (fsinfo->nlayouttypes == 0)
    168		goto out_no_driver;
    169	if (!(server->nfs_client->cl_exchange_flags &
    170		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
    171		printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
    172			__func__, server->nfs_client->cl_exchange_flags);
    173		goto out_no_driver;
    174	}
    175
    176	sort(fsinfo->layouttype, fsinfo->nlayouttypes,
    177		sizeof(*fsinfo->layouttype), ld_cmp, NULL);
    178
    179	for (i = 0; i < fsinfo->nlayouttypes; i++) {
    180		id = fsinfo->layouttype[i];
    181		ld_type = find_pnfs_driver(id);
    182		if (!ld_type) {
    183			request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
    184					id);
    185			ld_type = find_pnfs_driver(id);
    186		}
    187		if (ld_type)
    188			break;
    189	}
    190
    191	if (!ld_type) {
    192		dprintk("%s: No pNFS module found!\n", __func__);
    193		goto out_no_driver;
    194	}
    195
    196	server->pnfs_curr_ld = ld_type;
    197	if (ld_type->set_layoutdriver
    198	    && ld_type->set_layoutdriver(server, mntfh)) {
    199		printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
    200			"driver %u.\n", __func__, id);
    201		module_put(ld_type->owner);
    202		goto out_no_driver;
    203	}
    204	/* Bump the MDS count */
    205	atomic_inc(&server->nfs_client->cl_mds_count);
    206
    207	dprintk("%s: pNFS module for %u set\n", __func__, id);
    208	return;
    209
    210out_no_driver:
    211	dprintk("%s: Using NFSv4 I/O\n", __func__);
    212	server->pnfs_curr_ld = NULL;
    213}
    214
    215int
    216pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
    217{
    218	int status = -EINVAL;
    219	struct pnfs_layoutdriver_type *tmp;
    220
    221	if (ld_type->id == 0) {
    222		printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
    223		return status;
    224	}
    225	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
    226		printk(KERN_ERR "NFS: %s Layout driver must provide "
    227		       "alloc_lseg and free_lseg.\n", __func__);
    228		return status;
    229	}
    230
    231	spin_lock(&pnfs_spinlock);
    232	tmp = find_pnfs_driver_locked(ld_type->id);
    233	if (!tmp) {
    234		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
    235		status = 0;
    236		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
    237			ld_type->name);
    238	} else {
    239		printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
    240			__func__, ld_type->id);
    241	}
    242	spin_unlock(&pnfs_spinlock);
    243
    244	return status;
    245}
    246EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
    247
    248void
    249pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
    250{
    251	dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
    252	spin_lock(&pnfs_spinlock);
    253	list_del(&ld_type->pnfs_tblid);
    254	spin_unlock(&pnfs_spinlock);
    255}
    256EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
    257
    258/*
    259 * pNFS client layout cache
    260 */
    261
    262/* Need to hold i_lock if caller does not already hold reference */
    263void
    264pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
    265{
    266	refcount_inc(&lo->plh_refcount);
    267}
    268
    269static struct pnfs_layout_hdr *
    270pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
    271{
    272	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
    273	return ld->alloc_layout_hdr(ino, gfp_flags);
    274}
    275
    276static void
    277pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
    278{
    279	struct nfs_server *server = NFS_SERVER(lo->plh_inode);
    280	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
    281
    282	if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
    283		struct nfs_client *clp = server->nfs_client;
    284
    285		spin_lock(&clp->cl_lock);
    286		list_del_rcu(&lo->plh_layouts);
    287		spin_unlock(&clp->cl_lock);
    288	}
    289	put_cred(lo->plh_lc_cred);
    290	return ld->free_layout_hdr(lo);
    291}
    292
    293static void
    294pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
    295{
    296	struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
    297	dprintk("%s: freeing layout cache %p\n", __func__, lo);
    298	nfsi->layout = NULL;
    299	/* Reset MDS Threshold I/O counters */
    300	nfsi->write_io = 0;
    301	nfsi->read_io = 0;
    302}
    303
    304void
    305pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
    306{
    307	struct inode *inode;
    308	unsigned long i_state;
    309
    310	if (!lo)
    311		return;
    312	inode = lo->plh_inode;
    313	pnfs_layoutreturn_before_put_layout_hdr(lo);
    314
    315	if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
    316		if (!list_empty(&lo->plh_segs))
    317			WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
    318		pnfs_detach_layout_hdr(lo);
    319		i_state = inode->i_state;
    320		spin_unlock(&inode->i_lock);
    321		pnfs_free_layout_hdr(lo);
    322		/* Notify pnfs_destroy_layout_final() that we're done */
    323		if (i_state & (I_FREEING | I_CLEAR))
    324			wake_up_var(lo);
    325	}
    326}
    327
    328static struct inode *
    329pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
    330{
    331	struct inode *inode = igrab(lo->plh_inode);
    332	if (inode)
    333		return inode;
    334	set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
    335	return NULL;
    336}
    337
    338/*
    339 * Compare 2 layout stateid sequence ids, to see which is newer,
    340 * taking into account wraparound issues.
    341 */
    342static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
    343{
    344	return (s32)(s1 - s2) > 0;
    345}
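
/*
 * Worked example (editorial note, not part of the original file): this is
 * serial number arithmetic in the style of RFC 1982. With s1 = 1 and
 * s2 = 0xffffffff, the u32 difference s1 - s2 is 2, and (s32)2 > 0, so a
 * seqid that has just wrapped around to 1 still compares as newer.
 * Conversely, pnfs_seqid_is_newer(5, 6) gives (s32)-1 > 0, i.e. false.
 */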
    346
    347static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq)
    348{
    349	if (pnfs_seqid_is_newer(newseq, lo->plh_barrier) || !lo->plh_barrier)
    350		lo->plh_barrier = newseq;
    351}
    352
    353static void
    354pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
    355			 u32 seq)
    356{
    357	if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
    358		iomode = IOMODE_ANY;
    359	lo->plh_return_iomode = iomode;
    360	set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
    361	/*
    362	 * We must set lo->plh_return_seq to avoid livelocks with
    363	 * pnfs_layout_need_return()
    364	 */
    365	if (seq == 0)
    366		seq = be32_to_cpu(lo->plh_stateid.seqid);
    367	if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
    368		lo->plh_return_seq = seq;
    369	pnfs_barrier_update(lo, seq);
    370}
    371
    372static void
    373pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
    374{
    375	struct pnfs_layout_segment *lseg;
    376	lo->plh_return_iomode = 0;
    377	lo->plh_return_seq = 0;
    378	clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
    379	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
    380		if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
    381			continue;
    382		pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
    383	}
    384}
    385
    386static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
    387{
    388	clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
    389	clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags);
    390	smp_mb__after_atomic();
    391	wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
    392	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
    393}
    394
    395static void
    396pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
    397		struct list_head *free_me)
    398{
    399	clear_bit(NFS_LSEG_ROC, &lseg->pls_flags);
    400	clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
    401	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags))
    402		pnfs_lseg_dec_and_remove_zero(lseg, free_me);
    403	if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
    404		pnfs_lseg_dec_and_remove_zero(lseg, free_me);
    405}
    406
    407/*
    408 * Update the seqid of a layout stateid after receiving
    409 * NFS4ERR_OLD_STATEID
    410 */
    411bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
    412		struct pnfs_layout_range *dst_range,
    413		struct inode *inode)
    414{
    415	struct pnfs_layout_hdr *lo;
    416	struct pnfs_layout_range range = {
    417		.iomode = IOMODE_ANY,
    418		.offset = 0,
    419		.length = NFS4_MAX_UINT64,
    420	};
    421	bool ret = false;
    422	LIST_HEAD(head);
    423	int err;
    424
    425	spin_lock(&inode->i_lock);
    426	lo = NFS_I(inode)->layout;
     427	if (lo && pnfs_layout_is_valid(lo) &&
    428	    nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
    429		/* Is our call using the most recent seqid? If so, bump it */
    430		if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
    431			nfs4_stateid_seqid_inc(dst);
    432			ret = true;
    433			goto out;
    434		}
    435		/* Try to update the seqid to the most recent */
    436		err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
    437		if (err != -EBUSY) {
    438			dst->seqid = lo->plh_stateid.seqid;
    439			*dst_range = range;
    440			ret = true;
    441		}
    442	}
    443out:
    444	spin_unlock(&inode->i_lock);
    445	pnfs_free_lseg_list(&head);
    446	return ret;
    447}
    448
    449/*
    450 * Mark a pnfs_layout_hdr and all associated layout segments as invalid
    451 *
    452 * In order to continue using the pnfs_layout_hdr, a full recovery
    453 * is required.
    454 * Note that caller must hold inode->i_lock.
    455 */
    456int
    457pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
    458		struct list_head *lseg_list)
    459{
    460	struct pnfs_layout_range range = {
    461		.iomode = IOMODE_ANY,
    462		.offset = 0,
    463		.length = NFS4_MAX_UINT64,
    464	};
    465	struct pnfs_layout_segment *lseg, *next;
    466
    467	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
    468	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
    469		pnfs_clear_lseg_state(lseg, lseg_list);
    470	pnfs_clear_layoutreturn_info(lo);
    471	pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
    472	set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
    473	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
    474	    !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
    475		pnfs_clear_layoutreturn_waitbit(lo);
    476	return !list_empty(&lo->plh_segs);
    477}
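
/*
 * Usage note (editorial note, not part of the original file): a nonzero
 * return means some segments are still pinned by outstanding I/O. It is
 * this value that pnfs_layout_free_bulk_destroy_list() converts into
 * -EAGAIN so that bulk-destroy callers retry.
 */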
    478
    479static int
    480pnfs_iomode_to_fail_bit(u32 iomode)
    481{
    482	return iomode == IOMODE_RW ?
    483		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
    484}
    485
    486static void
    487pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
    488{
    489	lo->plh_retry_timestamp = jiffies;
    490	if (!test_and_set_bit(fail_bit, &lo->plh_flags))
    491		refcount_inc(&lo->plh_refcount);
    492}
    493
    494static void
    495pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
    496{
    497	if (test_and_clear_bit(fail_bit, &lo->plh_flags))
    498		refcount_dec(&lo->plh_refcount);
    499}
    500
    501static void
    502pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
    503{
    504	struct inode *inode = lo->plh_inode;
    505	struct pnfs_layout_range range = {
    506		.iomode = iomode,
    507		.offset = 0,
    508		.length = NFS4_MAX_UINT64,
    509	};
    510	LIST_HEAD(head);
    511
    512	spin_lock(&inode->i_lock);
    513	pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
    514	pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
    515	spin_unlock(&inode->i_lock);
    516	pnfs_free_lseg_list(&head);
    517	dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
    518			iomode == IOMODE_RW ?  "RW" : "READ");
    519}
    520
    521static bool
    522pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
    523{
    524	unsigned long start, end;
    525	int fail_bit = pnfs_iomode_to_fail_bit(iomode);
    526
    527	if (test_bit(fail_bit, &lo->plh_flags) == 0)
    528		return false;
    529	end = jiffies;
    530	start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
    531	if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
    532		/* It is time to retry the failed layoutgets */
    533		pnfs_layout_clear_fail_bit(lo, fail_bit);
    534		return false;
    535	}
    536	return true;
    537}
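
/*
 * Worked example (editorial note, not part of the original file): if the
 * fail bit was set at jiffies == T, this returns true for roughly the next
 * PNFS_LAYOUTGET_RETRY_TIMEOUT (120 * HZ, about two minutes), suppressing
 * layoutgets for that iomode. Once plh_retry_timestamp falls out of the
 * window [jiffies - PNFS_LAYOUTGET_RETRY_TIMEOUT, jiffies], the fail bit
 * is cleared and layoutget may be retried.
 */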
    538
    539static void
    540pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
    541		const struct pnfs_layout_range *range,
    542		const nfs4_stateid *stateid)
    543{
    544	INIT_LIST_HEAD(&lseg->pls_list);
    545	INIT_LIST_HEAD(&lseg->pls_lc_list);
    546	INIT_LIST_HEAD(&lseg->pls_commits);
    547	refcount_set(&lseg->pls_refcount, 1);
    548	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
    549	lseg->pls_layout = lo;
    550	lseg->pls_range = *range;
    551	lseg->pls_seq = be32_to_cpu(stateid->seqid);
    552}
    553
    554static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
    555{
    556	if (lseg != NULL) {
    557		struct inode *inode = lseg->pls_layout->plh_inode;
    558		NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg);
    559	}
    560}
    561
    562static void
    563pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
    564		struct pnfs_layout_segment *lseg)
    565{
    566	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
    567	list_del_init(&lseg->pls_list);
    568	/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
    569	refcount_dec(&lo->plh_refcount);
    570	if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
    571		return;
    572	if (list_empty(&lo->plh_segs) &&
    573	    !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
    574	    !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
    575		if (atomic_read(&lo->plh_outstanding) == 0)
    576			set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
    577		clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
    578	}
    579}
    580
    581static bool
    582pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo,
    583		struct pnfs_layout_segment *lseg)
    584{
    585	if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
    586	    pnfs_layout_is_valid(lo)) {
    587		pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
    588		list_move_tail(&lseg->pls_list, &lo->plh_return_segs);
    589		return true;
    590	}
    591	return false;
    592}
    593
    594void
    595pnfs_put_lseg(struct pnfs_layout_segment *lseg)
    596{
    597	struct pnfs_layout_hdr *lo;
    598	struct inode *inode;
    599
    600	if (!lseg)
    601		return;
    602
    603	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
    604		refcount_read(&lseg->pls_refcount),
    605		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
    606
    607	lo = lseg->pls_layout;
    608	inode = lo->plh_inode;
    609
    610	if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
    611		pnfs_get_layout_hdr(lo);
    612		pnfs_layout_remove_lseg(lo, lseg);
    613		if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
    614			lseg = NULL;
    615		spin_unlock(&inode->i_lock);
    616		pnfs_free_lseg(lseg);
    617		pnfs_put_layout_hdr(lo);
    618	}
    619}
    620EXPORT_SYMBOL_GPL(pnfs_put_lseg);
    621
    622/*
    623 * is l2 fully contained in l1?
    624 *   start1                             end1
    625 *   [----------------------------------)
    626 *           start2           end2
    627 *           [----------------)
    628 */
    629static bool
    630pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
    631		 const struct pnfs_layout_range *l2)
    632{
    633	u64 start1 = l1->offset;
    634	u64 end1 = pnfs_end_offset(start1, l1->length);
    635	u64 start2 = l2->offset;
    636	u64 end2 = pnfs_end_offset(start2, l2->length);
    637
    638	return (start1 <= start2) && (end1 >= end2);
    639}
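
/*
 * Worked example (editorial note, not part of the original file): with
 * l1 = [0, 4096) and l2 = [1024, 2048), start1 <= start2 and end1 >= end2,
 * so l2 is contained; l2 = [2048, 8192) is not, since end2 > end1. A range
 * of {offset 0, length NFS4_MAX_UINT64} contains every other range, as
 * pnfs_end_offset() saturates at NFS4_MAX_UINT64.
 */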
    640
    641static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
    642		struct list_head *tmp_list)
    643{
    644	if (!refcount_dec_and_test(&lseg->pls_refcount))
    645		return false;
    646	pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
    647	list_add(&lseg->pls_list, tmp_list);
    648	return true;
    649}
    650
    651/* Returns 1 if lseg is removed from list, 0 otherwise */
    652static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
    653			     struct list_head *tmp_list)
    654{
    655	int rv = 0;
    656
    657	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
    658		/* Remove the reference keeping the lseg in the
    659		 * list.  It will now be removed when all
    660		 * outstanding io is finished.
    661		 */
    662		dprintk("%s: lseg %p ref %d\n", __func__, lseg,
    663			refcount_read(&lseg->pls_refcount));
    664		if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
    665			rv = 1;
    666	}
    667	return rv;
    668}
    669
    670static bool
    671pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
    672		 const struct pnfs_layout_range *recall_range)
    673{
    674	return (recall_range->iomode == IOMODE_ANY ||
    675		lseg_range->iomode == recall_range->iomode) &&
    676	       pnfs_lseg_range_intersecting(lseg_range, recall_range);
    677}
    678
    679static bool
    680pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
    681		const struct pnfs_layout_range *recall_range,
    682		u32 seq)
    683{
    684	if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
    685		return false;
    686	if (recall_range == NULL)
    687		return true;
    688	return pnfs_should_free_range(&lseg->pls_range, recall_range);
    689}
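
/*
 * Worked example (editorial note, not part of the original file): with
 * seq == 8, an lseg with pls_seq == 9 is skipped because
 * pnfs_seqid_is_newer(9, 8) is true (it was handed out after the recall),
 * while pls_seq == 7 or 8 falls through to the range check. Passing
 * seq == 0 disables the sequence filter, and recall_range == NULL matches
 * every segment.
 */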
    690
    691/**
    692 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
    693 * @lo: layout header containing the lsegs
    694 * @tmp_list: list head where doomed lsegs should go
    695 * @recall_range: optional recall range argument to match (may be NULL)
    696 * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
    697 *
    698 * Walk the list of lsegs in the layout header, and tear down any that should
    699 * be destroyed. If "recall_range" is specified then the segment must match
    700 * that range. If "seq" is non-zero, then only match segments that were handed
    701 * out at or before that sequence.
    702 *
    703 * Returns number of matching invalid lsegs remaining in list after scanning
    704 * it and purging them.
    705 */
    706int
    707pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
    708			    struct list_head *tmp_list,
    709			    const struct pnfs_layout_range *recall_range,
    710			    u32 seq)
    711{
    712	struct pnfs_layout_segment *lseg, *next;
    713	int remaining = 0;
    714
    715	dprintk("%s:Begin lo %p\n", __func__, lo);
    716
    717	if (list_empty(&lo->plh_segs))
    718		return 0;
    719	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
    720		if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
    721			dprintk("%s: freeing lseg %p iomode %d seq %u "
    722				"offset %llu length %llu\n", __func__,
    723				lseg, lseg->pls_range.iomode, lseg->pls_seq,
    724				lseg->pls_range.offset, lseg->pls_range.length);
    725			if (!mark_lseg_invalid(lseg, tmp_list))
    726				remaining++;
    727		}
    728	dprintk("%s:Return %i\n", __func__, remaining);
    729	return remaining;
    730}
    731
    732static void
    733pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
    734		struct list_head *free_me,
    735		const struct pnfs_layout_range *range,
    736		u32 seq)
    737{
    738	struct pnfs_layout_segment *lseg, *next;
    739
    740	list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) {
    741		if (pnfs_match_lseg_recall(lseg, range, seq))
    742			list_move_tail(&lseg->pls_list, free_me);
    743	}
    744}
    745
    746/* note free_me must contain lsegs from a single layout_hdr */
    747void
    748pnfs_free_lseg_list(struct list_head *free_me)
    749{
    750	struct pnfs_layout_segment *lseg, *tmp;
    751
    752	if (list_empty(free_me))
    753		return;
    754
    755	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
    756		list_del(&lseg->pls_list);
    757		pnfs_free_lseg(lseg);
    758	}
    759}
    760
    761static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi)
    762{
    763	struct pnfs_layout_hdr *lo;
    764	LIST_HEAD(tmp_list);
    765
    766	spin_lock(&nfsi->vfs_inode.i_lock);
    767	lo = nfsi->layout;
    768	if (lo) {
    769		pnfs_get_layout_hdr(lo);
    770		pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
    771		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
    772		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
    773		spin_unlock(&nfsi->vfs_inode.i_lock);
    774		pnfs_free_lseg_list(&tmp_list);
    775		nfs_commit_inode(&nfsi->vfs_inode, 0);
    776		pnfs_put_layout_hdr(lo);
    777	} else
    778		spin_unlock(&nfsi->vfs_inode.i_lock);
    779	return lo;
    780}
    781
    782void pnfs_destroy_layout(struct nfs_inode *nfsi)
    783{
    784	__pnfs_destroy_layout(nfsi);
    785}
    786EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
    787
    788static bool pnfs_layout_removed(struct nfs_inode *nfsi,
    789				struct pnfs_layout_hdr *lo)
    790{
    791	bool ret;
    792
    793	spin_lock(&nfsi->vfs_inode.i_lock);
    794	ret = nfsi->layout != lo;
    795	spin_unlock(&nfsi->vfs_inode.i_lock);
    796	return ret;
    797}
    798
    799void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
    800{
    801	struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi);
    802
    803	if (lo)
    804		wait_var_event(lo, pnfs_layout_removed(nfsi, lo));
    805}
    806
    807static bool
    808pnfs_layout_add_bulk_destroy_list(struct inode *inode,
    809		struct list_head *layout_list)
    810{
    811	struct pnfs_layout_hdr *lo;
    812	bool ret = false;
    813
    814	spin_lock(&inode->i_lock);
    815	lo = NFS_I(inode)->layout;
    816	if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
    817		pnfs_get_layout_hdr(lo);
    818		list_add(&lo->plh_bulk_destroy, layout_list);
    819		ret = true;
    820	}
    821	spin_unlock(&inode->i_lock);
    822	return ret;
    823}
    824
    825/* Caller must hold rcu_read_lock and clp->cl_lock */
    826static int
    827pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
    828		struct nfs_server *server,
    829		struct list_head *layout_list)
    830	__must_hold(&clp->cl_lock)
    831	__must_hold(RCU)
    832{
    833	struct pnfs_layout_hdr *lo, *next;
    834	struct inode *inode;
    835
    836	list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
    837		if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
    838		    test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
    839		    !list_empty(&lo->plh_bulk_destroy))
    840			continue;
    841		/* If the sb is being destroyed, just bail */
    842		if (!nfs_sb_active(server->super))
    843			break;
    844		inode = pnfs_grab_inode_layout_hdr(lo);
    845		if (inode != NULL) {
    846			if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
    847				list_del_rcu(&lo->plh_layouts);
    848			if (pnfs_layout_add_bulk_destroy_list(inode,
    849						layout_list))
    850				continue;
    851			rcu_read_unlock();
    852			spin_unlock(&clp->cl_lock);
    853			iput(inode);
    854		} else {
    855			rcu_read_unlock();
    856			spin_unlock(&clp->cl_lock);
    857		}
    858		nfs_sb_deactive(server->super);
    859		spin_lock(&clp->cl_lock);
    860		rcu_read_lock();
    861		return -EAGAIN;
    862	}
    863	return 0;
    864}
    865
    866static int
    867pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
    868		bool is_bulk_recall)
    869{
    870	struct pnfs_layout_hdr *lo;
    871	struct inode *inode;
    872	LIST_HEAD(lseg_list);
    873	int ret = 0;
    874
    875	while (!list_empty(layout_list)) {
    876		lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
    877				plh_bulk_destroy);
    878		dprintk("%s freeing layout for inode %lu\n", __func__,
    879			lo->plh_inode->i_ino);
    880		inode = lo->plh_inode;
    881
    882		pnfs_layoutcommit_inode(inode, false);
    883
    884		spin_lock(&inode->i_lock);
    885		list_del_init(&lo->plh_bulk_destroy);
    886		if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
    887			if (is_bulk_recall)
    888				set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
    889			ret = -EAGAIN;
    890		}
    891		spin_unlock(&inode->i_lock);
    892		pnfs_free_lseg_list(&lseg_list);
    893		/* Free all lsegs that are attached to commit buckets */
    894		nfs_commit_inode(inode, 0);
    895		pnfs_put_layout_hdr(lo);
    896		nfs_iput_and_deactive(inode);
    897	}
    898	return ret;
    899}
    900
    901int
    902pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
    903		struct nfs_fsid *fsid,
    904		bool is_recall)
    905{
    906	struct nfs_server *server;
    907	LIST_HEAD(layout_list);
    908
    909	spin_lock(&clp->cl_lock);
    910	rcu_read_lock();
    911restart:
    912	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
    913		if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
    914			continue;
    915		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
    916				server,
    917				&layout_list) != 0)
    918			goto restart;
    919	}
    920	rcu_read_unlock();
    921	spin_unlock(&clp->cl_lock);
    922
    923	if (list_empty(&layout_list))
    924		return 0;
    925	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
    926}
    927
    928int
    929pnfs_destroy_layouts_byclid(struct nfs_client *clp,
    930		bool is_recall)
    931{
    932	struct nfs_server *server;
    933	LIST_HEAD(layout_list);
    934
    935	spin_lock(&clp->cl_lock);
    936	rcu_read_lock();
    937restart:
    938	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
    939		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
    940					server,
    941					&layout_list) != 0)
    942			goto restart;
    943	}
    944	rcu_read_unlock();
    945	spin_unlock(&clp->cl_lock);
    946
    947	if (list_empty(&layout_list))
    948		return 0;
    949	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
    950}
    951
    952/*
    953 * Called by the state manager to remove all layouts established under an
    954 * expired lease.
    955 */
    956void
    957pnfs_destroy_all_layouts(struct nfs_client *clp)
    958{
    959	nfs4_deviceid_mark_client_invalid(clp);
    960	nfs4_deviceid_purge_client(clp);
    961
    962	pnfs_destroy_layouts_byclid(clp, false);
    963}
    964
    965static void
    966pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred)
    967{
    968	const struct cred *old;
    969
    970	if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) {
    971		old = xchg(&lo->plh_lc_cred, get_cred(cred));
    972		put_cred(old);
    973	}
    974}
    975
     976/* update lo->plh_stateid with new if it is more recent */
    977void
    978pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
    979			const struct cred *cred, bool update_barrier)
    980{
    981	u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
    982	u32 newseq = be32_to_cpu(new->seqid);
    983
    984	if (!pnfs_layout_is_valid(lo)) {
    985		pnfs_set_layout_cred(lo, cred);
    986		nfs4_stateid_copy(&lo->plh_stateid, new);
    987		lo->plh_barrier = newseq;
    988		pnfs_clear_layoutreturn_info(lo);
    989		clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
    990		return;
    991	}
    992
    993	if (pnfs_seqid_is_newer(newseq, oldseq))
    994		nfs4_stateid_copy(&lo->plh_stateid, new);
    995
    996	if (update_barrier) {
    997		pnfs_barrier_update(lo, newseq);
    998		return;
    999	}
   1000	/*
   1001	 * Because of wraparound, we want to keep the barrier
   1002	 * "close" to the current seqids. We really only want to
   1003	 * get here from a layoutget call.
   1004	 */
   1005	if (atomic_read(&lo->plh_outstanding) == 1)
   1006		 pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid));
   1007}
   1008
   1009static bool
   1010pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
   1011		const nfs4_stateid *stateid)
   1012{
   1013	u32 seqid = be32_to_cpu(stateid->seqid);
   1014
   1015	return lo->plh_barrier && pnfs_seqid_is_newer(lo->plh_barrier, seqid);
   1016}
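
/*
 * Worked example (editorial note, not part of the original file): with
 * plh_barrier == 10, a stateid whose seqid is 9 is blocked, since
 * pnfs_seqid_is_newer(10, 9) is true: that seqid predates the barrier and
 * must not be acted on. A seqid of 10 or newer passes, and a zero barrier
 * blocks nothing.
 */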
   1017
    1018/* Return true if new layoutgets are currently blocked for this layout */
   1019static bool
   1020pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
   1021{
   1022	return lo->plh_block_lgets ||
   1023		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
   1024}
   1025
   1026static struct nfs_server *
   1027pnfs_find_server(struct inode *inode, struct nfs_open_context *ctx)
   1028{
   1029	struct nfs_server *server;
   1030
   1031	if (inode) {
   1032		server = NFS_SERVER(inode);
   1033	} else {
   1034		struct dentry *parent_dir = dget_parent(ctx->dentry);
   1035		server = NFS_SERVER(parent_dir->d_inode);
   1036		dput(parent_dir);
   1037	}
   1038	return server;
   1039}
   1040
   1041static void nfs4_free_pages(struct page **pages, size_t size)
   1042{
   1043	int i;
   1044
   1045	if (!pages)
   1046		return;
   1047
   1048	for (i = 0; i < size; i++) {
   1049		if (!pages[i])
   1050			break;
   1051		__free_page(pages[i]);
   1052	}
   1053	kfree(pages);
   1054}
   1055
   1056static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
   1057{
   1058	struct page **pages;
   1059	int i;
   1060
   1061	pages = kmalloc_array(size, sizeof(struct page *), gfp_flags);
   1062	if (!pages) {
   1063		dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
   1064		return NULL;
   1065	}
   1066
   1067	for (i = 0; i < size; i++) {
   1068		pages[i] = alloc_page(gfp_flags);
   1069		if (!pages[i]) {
   1070			dprintk("%s: failed to allocate page\n", __func__);
   1071			nfs4_free_pages(pages, i);
   1072			return NULL;
   1073		}
   1074	}
   1075
   1076	return pages;
   1077}
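
/*
 * Usage note (editorial note, not part of the original file): on partial
 * failure nfs4_alloc_pages() frees the pages already allocated and returns
 * NULL, so callers only ever see a fully populated array and can tear it
 * down with the original size:
 *
 *	pages = nfs4_alloc_pages(max_pages, gfp_flags);
 *	if (!pages)
 *		return NULL;
 *	...
 *	nfs4_free_pages(pages, max_pages);
 */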
   1078
   1079static struct nfs4_layoutget *
   1080pnfs_alloc_init_layoutget_args(struct inode *ino,
   1081	   struct nfs_open_context *ctx,
   1082	   const nfs4_stateid *stateid,
   1083	   const struct pnfs_layout_range *range,
   1084	   gfp_t gfp_flags)
   1085{
   1086	struct nfs_server *server = pnfs_find_server(ino, ctx);
   1087	size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response;
   1088	size_t max_pages = max_response_pages(server);
   1089	struct nfs4_layoutget *lgp;
   1090
   1091	dprintk("--> %s\n", __func__);
   1092
   1093	lgp = kzalloc(sizeof(*lgp), gfp_flags);
   1094	if (lgp == NULL)
   1095		return NULL;
   1096
   1097	if (max_reply_sz) {
   1098		size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
   1099		if (npages < max_pages)
   1100			max_pages = npages;
   1101	}
   1102
   1103	lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
   1104	if (!lgp->args.layout.pages) {
   1105		kfree(lgp);
   1106		return NULL;
   1107	}
   1108	lgp->args.layout.pglen = max_pages * PAGE_SIZE;
   1109	lgp->res.layoutp = &lgp->args.layout;
   1110
   1111	/* Don't confuse uninitialised result and success */
   1112	lgp->res.status = -NFS4ERR_DELAY;
   1113
   1114	lgp->args.minlength = PAGE_SIZE;
   1115	if (lgp->args.minlength > range->length)
   1116		lgp->args.minlength = range->length;
   1117	if (ino) {
   1118		loff_t i_size = i_size_read(ino);
   1119
   1120		if (range->iomode == IOMODE_READ) {
   1121			if (range->offset >= i_size)
   1122				lgp->args.minlength = 0;
   1123			else if (i_size - range->offset < lgp->args.minlength)
   1124				lgp->args.minlength = i_size - range->offset;
   1125		}
   1126	}
   1127	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
   1128	pnfs_copy_range(&lgp->args.range, range);
   1129	lgp->args.type = server->pnfs_curr_ld->id;
   1130	lgp->args.inode = ino;
   1131	lgp->args.ctx = get_nfs_open_context(ctx);
   1132	nfs4_stateid_copy(&lgp->args.stateid, stateid);
   1133	lgp->gfp_flags = gfp_flags;
   1134	lgp->cred = ctx->cred;
   1135	return lgp;
   1136}
   1137
   1138void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
   1139{
   1140	size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE;
   1141
   1142	nfs4_free_pages(lgp->args.layout.pages, max_pages);
   1143	pnfs_put_layout_hdr(lgp->lo);
   1144	put_nfs_open_context(lgp->args.ctx);
   1145	kfree(lgp);
   1146}
   1147
   1148static void pnfs_clear_layoutcommit(struct inode *inode,
   1149		struct list_head *head)
   1150{
   1151	struct nfs_inode *nfsi = NFS_I(inode);
   1152	struct pnfs_layout_segment *lseg, *tmp;
   1153
   1154	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
   1155		return;
   1156	list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
   1157		if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
   1158			continue;
   1159		pnfs_lseg_dec_and_remove_zero(lseg, head);
   1160	}
   1161}
   1162
   1163void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
   1164		const nfs4_stateid *arg_stateid,
   1165		const struct pnfs_layout_range *range,
   1166		const nfs4_stateid *stateid)
   1167{
   1168	struct inode *inode = lo->plh_inode;
   1169	LIST_HEAD(freeme);
   1170
   1171	spin_lock(&inode->i_lock);
   1172	if (!pnfs_layout_is_valid(lo) ||
   1173	    !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
   1174		goto out_unlock;
   1175	if (stateid) {
   1176		u32 seq = be32_to_cpu(arg_stateid->seqid);
   1177
   1178		pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
   1179		pnfs_free_returned_lsegs(lo, &freeme, range, seq);
   1180		pnfs_set_layout_stateid(lo, stateid, NULL, true);
   1181	} else
   1182		pnfs_mark_layout_stateid_invalid(lo, &freeme);
   1183out_unlock:
   1184	pnfs_clear_layoutreturn_waitbit(lo);
   1185	spin_unlock(&inode->i_lock);
   1186	pnfs_free_lseg_list(&freeme);
   1187
   1188}
   1189
   1190static bool
   1191pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
   1192		nfs4_stateid *stateid,
   1193		const struct cred **cred,
   1194		enum pnfs_iomode *iomode)
   1195{
   1196	/* Serialise LAYOUTGET/LAYOUTRETURN */
   1197	if (atomic_read(&lo->plh_outstanding) != 0)
   1198		return false;
   1199	if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
   1200		return false;
   1201	set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
   1202	pnfs_get_layout_hdr(lo);
   1203	nfs4_stateid_copy(stateid, &lo->plh_stateid);
   1204	*cred = get_cred(lo->plh_lc_cred);
   1205	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
   1206		if (lo->plh_return_seq != 0)
   1207			stateid->seqid = cpu_to_be32(lo->plh_return_seq);
   1208		if (iomode != NULL)
   1209			*iomode = lo->plh_return_iomode;
   1210		pnfs_clear_layoutreturn_info(lo);
   1211	} else if (iomode != NULL)
   1212		*iomode = IOMODE_ANY;
   1213	pnfs_barrier_update(lo, be32_to_cpu(stateid->seqid));
   1214	return true;
   1215}
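
/*
 * Locking sketch (editorial note, not part of the original file): callers
 * invoke pnfs_prepare_layoutreturn() under inode->i_lock, then drop the
 * lock before issuing the RPC:
 *
 *	spin_lock(&inode->i_lock);
 *	send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
 *	spin_unlock(&inode->i_lock);
 *	if (send)
 *		pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
 *
 * A true return means the NFS_LAYOUT_RETURN/RETURN_LOCK bits are held and
 * a layout header reference was taken; the layoutreturn completion path
 * releases both.
 */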
   1216
   1217static void
   1218pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
   1219		struct pnfs_layout_hdr *lo,
   1220		const nfs4_stateid *stateid,
   1221		enum pnfs_iomode iomode)
   1222{
   1223	struct inode *inode = lo->plh_inode;
   1224
   1225	args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id;
   1226	args->inode = inode;
   1227	args->range.iomode = iomode;
   1228	args->range.offset = 0;
   1229	args->range.length = NFS4_MAX_UINT64;
   1230	args->layout = lo;
   1231	nfs4_stateid_copy(&args->stateid, stateid);
   1232}
   1233
   1234static int
   1235pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
   1236		       const nfs4_stateid *stateid,
   1237		       const struct cred **pcred,
   1238		       enum pnfs_iomode iomode,
   1239		       bool sync)
   1240{
   1241	struct inode *ino = lo->plh_inode;
   1242	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
   1243	struct nfs4_layoutreturn *lrp;
   1244	const struct cred *cred = *pcred;
   1245	int status = 0;
   1246
   1247	*pcred = NULL;
   1248	lrp = kzalloc(sizeof(*lrp), nfs_io_gfp_mask());
   1249	if (unlikely(lrp == NULL)) {
   1250		status = -ENOMEM;
   1251		spin_lock(&ino->i_lock);
   1252		pnfs_clear_layoutreturn_waitbit(lo);
   1253		spin_unlock(&ino->i_lock);
   1254		put_cred(cred);
   1255		pnfs_put_layout_hdr(lo);
   1256		goto out;
   1257	}
   1258
   1259	pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
   1260	lrp->args.ld_private = &lrp->ld_private;
   1261	lrp->clp = NFS_SERVER(ino)->nfs_client;
   1262	lrp->cred = cred;
   1263	if (ld->prepare_layoutreturn)
   1264		ld->prepare_layoutreturn(&lrp->args);
   1265
   1266	status = nfs4_proc_layoutreturn(lrp, sync);
   1267out:
   1268	dprintk("<-- %s status: %d\n", __func__, status);
   1269	return status;
   1270}
   1271
   1272static bool
   1273pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo,
   1274				enum pnfs_iomode iomode,
   1275				u32 seq)
   1276{
   1277	struct pnfs_layout_range recall_range = {
   1278		.length = NFS4_MAX_UINT64,
   1279		.iomode = iomode,
   1280	};
   1281	return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
   1282					       &recall_range, seq) != -EBUSY;
   1283}
   1284
   1285/* Return true if layoutreturn is needed */
   1286static bool
   1287pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
   1288{
   1289	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
   1290		return false;
   1291	return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode,
   1292					       lo->plh_return_seq);
   1293}
   1294
   1295static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
   1296{
    1297	struct inode *inode = lo->plh_inode;
   1298
   1299	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
   1300		return;
   1301	spin_lock(&inode->i_lock);
   1302	if (pnfs_layout_need_return(lo)) {
   1303		const struct cred *cred;
   1304		nfs4_stateid stateid;
   1305		enum pnfs_iomode iomode;
   1306		bool send;
   1307
   1308		send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
   1309		spin_unlock(&inode->i_lock);
   1310		if (send) {
    1311			/* Send an async layoutreturn so we don't deadlock */
   1312			pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
   1313		}
   1314	} else
   1315		spin_unlock(&inode->i_lock);
   1316}
   1317
   1318/*
   1319 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
   1320 * when the layout segment list is empty.
   1321 *
   1322 * Note that a pnfs_layout_hdr can exist with an empty layout segment
   1323 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
   1324 * deviceid is marked invalid.
   1325 */
   1326int
   1327_pnfs_return_layout(struct inode *ino)
   1328{
   1329	struct pnfs_layout_hdr *lo = NULL;
   1330	struct nfs_inode *nfsi = NFS_I(ino);
   1331	struct pnfs_layout_range range = {
   1332		.iomode		= IOMODE_ANY,
   1333		.offset		= 0,
   1334		.length		= NFS4_MAX_UINT64,
   1335	};
   1336	LIST_HEAD(tmp_list);
   1337	const struct cred *cred;
   1338	nfs4_stateid stateid;
   1339	int status = 0;
   1340	bool send, valid_layout;
   1341
   1342	dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);
   1343
   1344	spin_lock(&ino->i_lock);
   1345	lo = nfsi->layout;
   1346	if (!lo) {
   1347		spin_unlock(&ino->i_lock);
   1348		dprintk("NFS: %s no layout to return\n", __func__);
   1349		goto out;
   1350	}
   1351	/* Reference matched in nfs4_layoutreturn_release */
   1352	pnfs_get_layout_hdr(lo);
    1353	/* Is there an outstanding layoutreturn? */
   1354	if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
   1355		spin_unlock(&ino->i_lock);
   1356		if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
   1357					TASK_UNINTERRUPTIBLE))
   1358			goto out_put_layout_hdr;
   1359		spin_lock(&ino->i_lock);
   1360	}
   1361	valid_layout = pnfs_layout_is_valid(lo);
   1362	pnfs_clear_layoutcommit(ino, &tmp_list);
   1363	pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0);
   1364
   1365	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range)
   1366		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
   1367
   1368	/* Don't send a LAYOUTRETURN if list was initially empty */
   1369	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) ||
   1370			!valid_layout) {
   1371		spin_unlock(&ino->i_lock);
   1372		dprintk("NFS: %s no layout segments to return\n", __func__);
   1373		goto out_wait_layoutreturn;
   1374	}
   1375
   1376	send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
   1377	spin_unlock(&ino->i_lock);
   1378	if (send)
   1379		status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
   1380out_wait_layoutreturn:
   1381	wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE);
   1382out_put_layout_hdr:
   1383	pnfs_free_lseg_list(&tmp_list);
   1384	pnfs_put_layout_hdr(lo);
   1385out:
   1386	dprintk("<-- %s status: %d\n", __func__, status);
   1387	return status;
   1388}
   1389
   1390int
   1391pnfs_commit_and_return_layout(struct inode *inode)
   1392{
   1393	struct pnfs_layout_hdr *lo;
   1394	int ret;
   1395
   1396	spin_lock(&inode->i_lock);
   1397	lo = NFS_I(inode)->layout;
   1398	if (lo == NULL) {
   1399		spin_unlock(&inode->i_lock);
   1400		return 0;
   1401	}
   1402	pnfs_get_layout_hdr(lo);
   1403	/* Block new layoutgets and read/write to ds */
   1404	lo->plh_block_lgets++;
   1405	spin_unlock(&inode->i_lock);
   1406	filemap_fdatawait(inode->i_mapping);
   1407	ret = pnfs_layoutcommit_inode(inode, true);
   1408	if (ret == 0)
   1409		ret = _pnfs_return_layout(inode);
   1410	spin_lock(&inode->i_lock);
   1411	lo->plh_block_lgets--;
   1412	spin_unlock(&inode->i_lock);
   1413	pnfs_put_layout_hdr(lo);
   1414	return ret;
   1415}
   1416
   1417bool pnfs_roc(struct inode *ino,
   1418		struct nfs4_layoutreturn_args *args,
   1419		struct nfs4_layoutreturn_res *res,
   1420		const struct cred *cred)
   1421{
   1422	struct nfs_inode *nfsi = NFS_I(ino);
   1423	struct nfs_open_context *ctx;
   1424	struct nfs4_state *state;
   1425	struct pnfs_layout_hdr *lo;
   1426	struct pnfs_layout_segment *lseg, *next;
   1427	const struct cred *lc_cred;
   1428	nfs4_stateid stateid;
   1429	enum pnfs_iomode iomode = 0;
   1430	bool layoutreturn = false, roc = false;
   1431	bool skip_read = false;
   1432
   1433	if (!nfs_have_layout(ino))
   1434		return false;
   1435retry:
   1436	rcu_read_lock();
   1437	spin_lock(&ino->i_lock);
   1438	lo = nfsi->layout;
   1439	if (!lo || !pnfs_layout_is_valid(lo) ||
   1440	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
   1441		lo = NULL;
   1442		goto out_noroc;
   1443	}
   1444	pnfs_get_layout_hdr(lo);
   1445	if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
   1446		spin_unlock(&ino->i_lock);
   1447		rcu_read_unlock();
   1448		wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
   1449				TASK_UNINTERRUPTIBLE);
   1450		pnfs_put_layout_hdr(lo);
   1451		goto retry;
   1452	}
   1453
   1454	/* no roc if we hold a delegation */
   1455	if (nfs4_check_delegation(ino, FMODE_READ)) {
   1456		if (nfs4_check_delegation(ino, FMODE_WRITE))
   1457			goto out_noroc;
   1458		skip_read = true;
   1459	}
   1460
   1461	list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
   1462		state = ctx->state;
   1463		if (state == NULL)
   1464			continue;
   1465		/* Don't return layout if there is open file state */
   1466		if (state->state & FMODE_WRITE)
   1467			goto out_noroc;
   1468		if (state->state & FMODE_READ)
   1469			skip_read = true;
   1470	}
   1471
   1472
   1473	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
   1474		if (skip_read && lseg->pls_range.iomode == IOMODE_READ)
   1475			continue;
   1476		/* If we are sending layoutreturn, invalidate all valid lsegs */
   1477		if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags))
   1478			continue;
   1479		/*
   1480		 * Note: mark lseg for return so pnfs_layout_remove_lseg
   1481		 * doesn't invalidate the layout for us.
   1482		 */
   1483		set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
   1484		if (!mark_lseg_invalid(lseg, &lo->plh_return_segs))
   1485			continue;
   1486		pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
   1487	}
   1488
   1489	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
   1490		goto out_noroc;
   1491
   1492	/* ROC in two conditions:
   1493	 * 1. there are ROC lsegs
   1494	 * 2. we don't send layoutreturn
   1495	 */
   1496	/* lo ref dropped in pnfs_roc_release() */
   1497	layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
   1498	/* If the creds don't match, we can't compound the layoutreturn */
   1499	if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0)
   1500		goto out_noroc;
   1501
   1502	roc = layoutreturn;
   1503	pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
   1504	res->lrs_present = 0;
   1505	layoutreturn = false;
   1506	put_cred(lc_cred);
   1507
   1508out_noroc:
   1509	spin_unlock(&ino->i_lock);
   1510	rcu_read_unlock();
   1511	pnfs_layoutcommit_inode(ino, true);
   1512	if (roc) {
   1513		struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
   1514		if (ld->prepare_layoutreturn)
   1515			ld->prepare_layoutreturn(args);
   1516		pnfs_put_layout_hdr(lo);
   1517		return true;
   1518	}
   1519	if (layoutreturn)
   1520		pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
   1521	pnfs_put_layout_hdr(lo);
   1522	return false;
   1523}
   1524
   1525int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp,
   1526		  struct nfs4_layoutreturn_res **respp, int *ret)
   1527{
   1528	struct nfs4_layoutreturn_args *arg = *argpp;
   1529	int retval = -EAGAIN;
   1530
   1531	if (!arg)
   1532		return 0;
   1533	/* Handle Layoutreturn errors */
   1534	switch (*ret) {
   1535	case 0:
   1536		retval = 0;
   1537		break;
   1538	case -NFS4ERR_NOMATCHING_LAYOUT:
   1539		/* Was there an RPC level error? If not, retry */
   1540		if (task->tk_rpc_status == 0)
   1541			break;
   1542		/* If the call was not sent, let caller handle it */
   1543		if (!RPC_WAS_SENT(task))
   1544			return 0;
   1545		/*
   1546		 * Otherwise, assume the call succeeded and
   1547		 * that we need to release the layout
   1548		 */
   1549		*ret = 0;
   1550		(*respp)->lrs_present = 0;
   1551		retval = 0;
   1552		break;
   1553	case -NFS4ERR_DELAY:
   1554		/* Let the caller handle the retry */
   1555		*ret = -NFS4ERR_NOMATCHING_LAYOUT;
   1556		return 0;
   1557	case -NFS4ERR_OLD_STATEID:
   1558		if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
   1559						     &arg->range, arg->inode))
   1560			break;
   1561		*ret = -NFS4ERR_NOMATCHING_LAYOUT;
   1562		return -EAGAIN;
   1563	}
   1564	*argpp = NULL;
   1565	*respp = NULL;
   1566	return retval;
   1567}
   1568
   1569void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
   1570		struct nfs4_layoutreturn_res *res,
   1571		int ret)
   1572{
   1573	struct pnfs_layout_hdr *lo = args->layout;
   1574	struct inode *inode = args->inode;
   1575	const nfs4_stateid *res_stateid = NULL;
   1576	struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
   1577
   1578	switch (ret) {
   1579	case -NFS4ERR_NOMATCHING_LAYOUT:
   1580		spin_lock(&inode->i_lock);
   1581		if (pnfs_layout_is_valid(lo) &&
   1582		    nfs4_stateid_match_other(&args->stateid, &lo->plh_stateid))
   1583			pnfs_set_plh_return_info(lo, args->range.iomode, 0);
   1584		pnfs_clear_layoutreturn_waitbit(lo);
   1585		spin_unlock(&inode->i_lock);
   1586		break;
   1587	case 0:
   1588		if (res->lrs_present)
   1589			res_stateid = &res->stateid;
   1590		fallthrough;
   1591	default:
   1592		pnfs_layoutreturn_free_lsegs(lo, &args->stateid, &args->range,
   1593					     res_stateid);
   1594	}
   1595	trace_nfs4_layoutreturn_on_close(args->inode, &args->stateid, ret);
   1596	if (ld_private && ld_private->ops && ld_private->ops->free)
   1597		ld_private->ops->free(ld_private);
   1598	pnfs_put_layout_hdr(lo);
   1599}
   1600
   1601bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
   1602{
   1603	struct nfs_inode *nfsi = NFS_I(ino);
    1604	struct pnfs_layout_hdr *lo;
    1605	bool sleep = false;
    1606
    1607	/* We might not have taken a reference on lo, so we need to check
    1608	 * under the i_lock. */
    1609	spin_lock(&ino->i_lock);
    1610	lo = nfsi->layout;
    1611	if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
    1612		rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
    1613		sleep = true;
    1614	}
    1615	spin_unlock(&ino->i_lock);
    1616	return sleep;
   1617}
   1618
   1619/*
   1620 * Compare two layout segments for sorting into layout cache.
   1621 * We want to preferentially return RW over RO layouts, so ensure those
   1622 * are seen first.
   1623 */
   1624static s64
   1625pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
   1626	   const struct pnfs_layout_range *l2)
   1627{
   1628	s64 d;
   1629
   1630	/* high offset > low offset */
   1631	d = l1->offset - l2->offset;
   1632	if (d)
   1633		return d;
   1634
   1635	/* short length > long length */
   1636	d = l2->length - l1->length;
   1637	if (d)
   1638		return d;
   1639
   1640	/* read > read/write */
   1641	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
   1642}
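
/*
 * Worked example (editorial note, not part of the original file): for two
 * ranges with equal offset and length, IOMODE_RW sorts before IOMODE_READ
 * (the last term evaluates to 0 - 1 == -1), so RW segments are seen first
 * when walking plh_segs. Lower offsets sort first, and at equal offsets
 * the longer range sorts first.
 */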
   1643
   1644static bool
   1645pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
   1646		const struct pnfs_layout_range *l2)
   1647{
   1648	return pnfs_lseg_range_cmp(l1, l2) > 0;
   1649}
   1650
   1651static bool
   1652pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
   1653		struct pnfs_layout_segment *old)
   1654{
   1655	return false;
   1656}
   1657
   1658void
   1659pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
   1660		   struct pnfs_layout_segment *lseg,
   1661		   bool (*is_after)(const struct pnfs_layout_range *,
   1662			   const struct pnfs_layout_range *),
   1663		   bool (*do_merge)(struct pnfs_layout_segment *,
   1664			   struct pnfs_layout_segment *),
   1665		   struct list_head *free_me)
   1666{
   1667	struct pnfs_layout_segment *lp, *tmp;
   1668
   1669	dprintk("%s:Begin\n", __func__);
   1670
   1671	list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
   1672		if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
   1673			continue;
   1674		if (do_merge(lseg, lp)) {
   1675			mark_lseg_invalid(lp, free_me);
   1676			continue;
   1677		}
   1678		if (is_after(&lseg->pls_range, &lp->pls_range))
   1679			continue;
   1680		list_add_tail(&lseg->pls_list, &lp->pls_list);
   1681		dprintk("%s: inserted lseg %p "
   1682			"iomode %d offset %llu length %llu before "
   1683			"lp %p iomode %d offset %llu length %llu\n",
   1684			__func__, lseg, lseg->pls_range.iomode,
   1685			lseg->pls_range.offset, lseg->pls_range.length,
   1686			lp, lp->pls_range.iomode, lp->pls_range.offset,
   1687			lp->pls_range.length);
   1688		goto out;
   1689	}
   1690	list_add_tail(&lseg->pls_list, &lo->plh_segs);
   1691	dprintk("%s: inserted lseg %p "
   1692		"iomode %d offset %llu length %llu at tail\n",
   1693		__func__, lseg, lseg->pls_range.iomode,
   1694		lseg->pls_range.offset, lseg->pls_range.length);
   1695out:
   1696	pnfs_get_layout_hdr(lo);
   1697
   1698	dprintk("%s:Return\n", __func__);
   1699}
   1700EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
   1701
   1702static void
   1703pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
   1704		   struct pnfs_layout_segment *lseg,
   1705		   struct list_head *free_me)
   1706{
   1707	struct inode *inode = lo->plh_inode;
   1708	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
   1709
   1710	if (ld->add_lseg != NULL)
   1711		ld->add_lseg(lo, lseg, free_me);
   1712	else
   1713		pnfs_generic_layout_insert_lseg(lo, lseg,
   1714				pnfs_lseg_range_is_after,
   1715				pnfs_lseg_no_merge,
   1716				free_me);
   1717}
   1718
   1719static struct pnfs_layout_hdr *
   1720alloc_init_layout_hdr(struct inode *ino,
   1721		      struct nfs_open_context *ctx,
   1722		      gfp_t gfp_flags)
   1723{
   1724	struct pnfs_layout_hdr *lo;
   1725
   1726	lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
   1727	if (!lo)
   1728		return NULL;
   1729	refcount_set(&lo->plh_refcount, 1);
   1730	INIT_LIST_HEAD(&lo->plh_layouts);
   1731	INIT_LIST_HEAD(&lo->plh_segs);
   1732	INIT_LIST_HEAD(&lo->plh_return_segs);
   1733	INIT_LIST_HEAD(&lo->plh_bulk_destroy);
   1734	lo->plh_inode = ino;
   1735	lo->plh_lc_cred = get_cred(ctx->cred);
   1736	lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
   1737	return lo;
   1738}
   1739
   1740static struct pnfs_layout_hdr *
   1741pnfs_find_alloc_layout(struct inode *ino,
   1742		       struct nfs_open_context *ctx,
   1743		       gfp_t gfp_flags)
   1744	__releases(&ino->i_lock)
   1745	__acquires(&ino->i_lock)
   1746{
   1747	struct nfs_inode *nfsi = NFS_I(ino);
   1748	struct pnfs_layout_hdr *new = NULL;
   1749
   1750	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);
   1751
   1752	if (nfsi->layout != NULL)
   1753		goto out_existing;
   1754	spin_unlock(&ino->i_lock);
   1755	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
   1756	spin_lock(&ino->i_lock);
   1757
   1758	if (likely(nfsi->layout == NULL)) {	/* Won the race? */
   1759		nfsi->layout = new;
   1760		return new;
   1761	} else if (new != NULL)
   1762		pnfs_free_layout_hdr(new);
   1763out_existing:
   1764	pnfs_get_layout_hdr(nfsi->layout);
   1765	return nfsi->layout;
   1766}
   1767
   1768/*
   1769 * iomode matching rules:
    1770 * iomode	lseg	strict	match
    1771 *			iomode
    1772 * ------	------	------	-----
   1773 * ANY		READ	N/A    true
   1774 * ANY		RW	N/A    true
   1775 * RW		READ	N/A    false
   1776 * RW		RW	N/A    true
   1777 * READ		READ	N/A    true
   1778 * READ		RW	true   false
   1779 * READ		RW	false  true
   1780 */
   1781static bool
   1782pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
   1783		 const struct pnfs_layout_range *range,
   1784		 bool strict_iomode)
   1785{
   1786	struct pnfs_layout_range range1;
   1787
   1788	if ((range->iomode == IOMODE_RW &&
   1789	     ls_range->iomode != IOMODE_RW) ||
   1790	    (range->iomode != ls_range->iomode &&
   1791	     strict_iomode) ||
   1792	    !pnfs_lseg_range_intersecting(ls_range, range))
   1793		return false;
   1794
   1795	/* range1 covers only the first byte in the range */
   1796	range1 = *range;
   1797	range1.length = 1;
   1798	return pnfs_lseg_range_contained(ls_range, &range1);
   1799}
   1800
   1801/*
   1802 * lookup range in layout
   1803 */
   1804static struct pnfs_layout_segment *
   1805pnfs_find_lseg(struct pnfs_layout_hdr *lo,
   1806		struct pnfs_layout_range *range,
   1807		bool strict_iomode)
   1808{
   1809	struct pnfs_layout_segment *lseg, *ret = NULL;
   1810
   1811	dprintk("%s:Begin\n", __func__);
   1812
   1813	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
   1814		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
   1815		    pnfs_lseg_range_match(&lseg->pls_range, range,
   1816					  strict_iomode)) {
   1817			ret = pnfs_get_lseg(lseg);
   1818			break;
   1819		}
   1820	}
   1821
   1822	dprintk("%s:Return lseg %p ref %d\n",
   1823		__func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0);
   1824	return ret;
   1825}
   1826
   1827/*
   1828 * Use mdsthreshold hints set at each OPEN to determine if I/O should go
   1829 * to the MDS or over pNFS
   1830 *
   1831 * The nfs_inode read_io and write_io fields are cumulative counters reset
   1832 * when there are no layout segments. Note that in pnfs_update_layout iomode
   1833 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
   1834 * WRITE request.
   1835 *
   1836 * A return of true means use MDS I/O.
   1837 *
    1838 * From RFC 5661:
   1839 * If a file's size is smaller than the file size threshold, data accesses
   1840 * SHOULD be sent to the metadata server.  If an I/O request has a length that
   1841 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
   1842 * server.  If both file size and I/O size are provided, the client SHOULD
    1843 * reach or exceed both thresholds before sending its read or write
   1844 * requests to the data server.
   1845 */
   1846static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
   1847				     struct inode *ino, int iomode)
   1848{
   1849	struct nfs4_threshold *t = ctx->mdsthreshold;
   1850	struct nfs_inode *nfsi = NFS_I(ino);
   1851	loff_t fsize = i_size_read(ino);
   1852	bool size = false, size_set = false, io = false, io_set = false, ret = false;
   1853
   1854	if (t == NULL)
   1855		return ret;
   1856
   1857	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
   1858		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);
   1859
   1860	switch (iomode) {
   1861	case IOMODE_READ:
   1862		if (t->bm & THRESHOLD_RD) {
   1863			dprintk("%s fsize %llu\n", __func__, fsize);
   1864			size_set = true;
   1865			if (fsize < t->rd_sz)
   1866				size = true;
   1867		}
   1868		if (t->bm & THRESHOLD_RD_IO) {
   1869			dprintk("%s nfsi->read_io %llu\n", __func__,
   1870				nfsi->read_io);
   1871			io_set = true;
   1872			if (nfsi->read_io < t->rd_io_sz)
   1873				io = true;
   1874		}
   1875		break;
   1876	case IOMODE_RW:
   1877		if (t->bm & THRESHOLD_WR) {
   1878			dprintk("%s fsize %llu\n", __func__, fsize);
   1879			size_set = true;
   1880			if (fsize < t->wr_sz)
   1881				size = true;
   1882		}
   1883		if (t->bm & THRESHOLD_WR_IO) {
   1884			dprintk("%s nfsi->write_io %llu\n", __func__,
   1885				nfsi->write_io);
   1886			io_set = true;
   1887			if (nfsi->write_io < t->wr_io_sz)
   1888				io = true;
   1889		}
   1890		break;
   1891	}
   1892	if (size_set && io_set) {
   1893		if (size && io)
   1894			ret = true;
   1895	} else if (size || io)
   1896		ret = true;
   1897
   1898	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
   1899	return ret;
   1900}
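        /*
         * Worked example (hypothetical threshold values): if the server sets
         * THRESHOLD_RD with rd_sz = 65536 and leaves THRESHOLD_RD_IO unset, a
         * READ of a 4096-byte file yields size_set && size, so the function
         * returns true and the I/O goes to the MDS.  When both THRESHOLD_RD
         * and THRESHOLD_RD_IO are set, both values must be below their
         * thresholds before MDS I/O is chosen, per the "reach or exceed both"
         * rule quoted above.
         */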
   1901
   1902static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
   1903{
   1904	/*
    1905	 * Send layoutcommit, since a pending layoutcommit holds an lseg
    1906	 * reference that can hold up layoutreturn
   1907	 */
   1908	pnfs_layoutcommit_inode(lo->plh_inode, false);
   1909	return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
   1910				   nfs_wait_bit_killable,
   1911				   TASK_KILLABLE);
   1912}
   1913
   1914static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
   1915{
   1916	atomic_inc(&lo->plh_outstanding);
   1917}
   1918
   1919static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
   1920{
   1921	if (atomic_dec_and_test(&lo->plh_outstanding) &&
   1922	    test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
   1923		wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
   1924}
   1925
   1926static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
   1927{
   1928	return test_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags);
   1929}
   1930
   1931static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
   1932{
   1933	unsigned long *bitlock = &lo->plh_flags;
   1934
   1935	clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
   1936	smp_mb__after_atomic();
   1937	wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
   1938}
   1939
   1940static void _add_to_server_list(struct pnfs_layout_hdr *lo,
   1941				struct nfs_server *server)
   1942{
   1943	if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
   1944		struct nfs_client *clp = server->nfs_client;
   1945
   1946		/* The lo must be on the clp list if there is any
   1947		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
   1948		 */
   1949		spin_lock(&clp->cl_lock);
   1950		list_add_tail_rcu(&lo->plh_layouts, &server->layouts);
   1951		spin_unlock(&clp->cl_lock);
   1952	}
   1953}
   1954
   1955/*
    1956 * The layout segment is retrieved from the server if not cached.
   1957 * The appropriate layout segment is referenced and returned to the caller.
   1958 */
   1959struct pnfs_layout_segment *
   1960pnfs_update_layout(struct inode *ino,
   1961		   struct nfs_open_context *ctx,
   1962		   loff_t pos,
   1963		   u64 count,
   1964		   enum pnfs_iomode iomode,
   1965		   bool strict_iomode,
   1966		   gfp_t gfp_flags)
   1967{
   1968	struct pnfs_layout_range arg = {
   1969		.iomode = iomode,
   1970		.offset = pos,
   1971		.length = count,
   1972	};
   1973	unsigned pg_offset;
   1974	struct nfs_server *server = NFS_SERVER(ino);
   1975	struct nfs_client *clp = server->nfs_client;
   1976	struct pnfs_layout_hdr *lo = NULL;
   1977	struct pnfs_layout_segment *lseg = NULL;
   1978	struct nfs4_layoutget *lgp;
   1979	nfs4_stateid stateid;
   1980	long timeout = 0;
   1981	unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
   1982	bool first;
   1983
   1984	if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
   1985		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
   1986				 PNFS_UPDATE_LAYOUT_NO_PNFS);
   1987		goto out;
   1988	}
   1989
   1990	if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
   1991		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
   1992				 PNFS_UPDATE_LAYOUT_MDSTHRESH);
   1993		goto out;
   1994	}
   1995
   1996lookup_again:
   1997	lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
   1998	if (IS_ERR(lseg))
   1999		goto out;
   2000	first = false;
   2001	spin_lock(&ino->i_lock);
   2002	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
   2003	if (lo == NULL) {
   2004		spin_unlock(&ino->i_lock);
   2005		lseg = ERR_PTR(-ENOMEM);
   2006		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
   2007				 PNFS_UPDATE_LAYOUT_NOMEM);
   2008		goto out;
   2009	}
   2010
   2011	/* Do we even need to bother with this? */
   2012	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
   2013		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
   2014				 PNFS_UPDATE_LAYOUT_BULK_RECALL);
   2015		dprintk("%s matches recall, use MDS\n", __func__);
   2016		goto out_unlock;
   2017	}
   2018
   2019	/* if LAYOUTGET already failed once we don't try again */
   2020	if (pnfs_layout_io_test_failed(lo, iomode)) {
   2021		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
   2022				 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
   2023		goto out_unlock;
   2024	}
   2025
   2026	/*
   2027	 * If the layout segment list is empty, but there are outstanding
   2028	 * layoutget calls, then they might be subject to a layoutrecall.
   2029	 */
   2030	if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
   2031	    atomic_read(&lo->plh_outstanding) != 0) {
   2032		spin_unlock(&ino->i_lock);
   2033		lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN,
   2034					   TASK_KILLABLE));
   2035		if (IS_ERR(lseg))
   2036			goto out_put_layout_hdr;
   2037		pnfs_put_layout_hdr(lo);
   2038		goto lookup_again;
   2039	}
   2040
   2041	/*
   2042	 * Because we free lsegs when sending LAYOUTRETURN, we need to wait
   2043	 * for LAYOUTRETURN.
   2044	 */
   2045	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
   2046		spin_unlock(&ino->i_lock);
   2047		dprintk("%s wait for layoutreturn\n", __func__);
   2048		lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo));
   2049		if (!IS_ERR(lseg)) {
   2050			pnfs_put_layout_hdr(lo);
   2051			dprintk("%s retrying\n", __func__);
   2052			trace_pnfs_update_layout(ino, pos, count, iomode, lo,
   2053						 lseg,
   2054						 PNFS_UPDATE_LAYOUT_RETRY);
   2055			goto lookup_again;
   2056		}
   2057		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
   2058					 PNFS_UPDATE_LAYOUT_RETURN);
   2059		goto out_put_layout_hdr;
   2060	}
   2061
   2062	lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
   2063	if (lseg) {
   2064		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
   2065				PNFS_UPDATE_LAYOUT_FOUND_CACHED);
   2066		goto out_unlock;
   2067	}
   2068
   2069	/*
   2070	 * Choose a stateid for the LAYOUTGET. If we don't have a layout
   2071	 * stateid, or it has been invalidated, then we must use the open
   2072	 * stateid.
   2073	 */
   2074	if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
   2075		int status;
   2076
   2077		/*
   2078		 * The first layoutget for the file. Need to serialize per
   2079		 * RFC 5661 Errata 3208.
   2080		 */
   2081		if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
   2082				     &lo->plh_flags)) {
   2083			spin_unlock(&ino->i_lock);
   2084			lseg = ERR_PTR(wait_on_bit(&lo->plh_flags,
   2085						NFS_LAYOUT_FIRST_LAYOUTGET,
   2086						TASK_KILLABLE));
   2087			if (IS_ERR(lseg))
   2088				goto out_put_layout_hdr;
   2089			pnfs_put_layout_hdr(lo);
   2090			dprintk("%s retrying\n", __func__);
   2091			goto lookup_again;
   2092		}
   2093
   2094		spin_unlock(&ino->i_lock);
   2095		first = true;
   2096		status = nfs4_select_rw_stateid(ctx->state,
   2097					iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
   2098					NULL, &stateid, NULL);
   2099		if (status != 0) {
   2100			lseg = ERR_PTR(status);
   2101			trace_pnfs_update_layout(ino, pos, count,
   2102					iomode, lo, lseg,
   2103					PNFS_UPDATE_LAYOUT_INVALID_OPEN);
   2104			nfs4_schedule_stateid_recovery(server, ctx->state);
   2105			pnfs_clear_first_layoutget(lo);
   2106			pnfs_put_layout_hdr(lo);
   2107			goto lookup_again;
   2108		}
   2109		spin_lock(&ino->i_lock);
   2110	} else {
   2111		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
   2112	}
   2113
   2114	if (pnfs_layoutgets_blocked(lo)) {
   2115		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
   2116				PNFS_UPDATE_LAYOUT_BLOCKED);
   2117		goto out_unlock;
   2118	}
   2119	nfs_layoutget_begin(lo);
   2120	spin_unlock(&ino->i_lock);
   2121
   2122	_add_to_server_list(lo, server);
   2123
   2124	pg_offset = arg.offset & ~PAGE_MASK;
   2125	if (pg_offset) {
   2126		arg.offset -= pg_offset;
   2127		arg.length += pg_offset;
   2128	}
   2129	if (arg.length != NFS4_MAX_UINT64)
   2130		arg.length = PAGE_ALIGN(arg.length);
   2131
   2132	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
   2133	if (!lgp) {
   2134		lseg = ERR_PTR(-ENOMEM);
   2135		trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
   2136					 PNFS_UPDATE_LAYOUT_NOMEM);
   2137		nfs_layoutget_end(lo);
   2138		goto out_put_layout_hdr;
   2139	}
   2140
   2141	lgp->lo = lo;
   2142	pnfs_get_layout_hdr(lo);
   2143
   2144	lseg = nfs4_proc_layoutget(lgp, &timeout);
   2145	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
   2146				 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
   2147	nfs_layoutget_end(lo);
   2148	if (IS_ERR(lseg)) {
    2149		switch (PTR_ERR(lseg)) {
   2150		case -EBUSY:
   2151			if (time_after(jiffies, giveup))
   2152				lseg = NULL;
   2153			break;
   2154		case -ERECALLCONFLICT:
   2155		case -EAGAIN:
   2156			break;
   2157		case -ENODATA:
   2158			/* The server returned NFS4ERR_LAYOUTUNAVAILABLE */
   2159			pnfs_layout_set_fail_bit(
   2160				lo, pnfs_iomode_to_fail_bit(iomode));
   2161			lseg = NULL;
   2162			goto out_put_layout_hdr;
   2163		default:
   2164			if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
   2165				pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
   2166				lseg = NULL;
   2167			}
   2168			goto out_put_layout_hdr;
   2169		}
   2170		if (lseg) {
   2171			if (first)
   2172				pnfs_clear_first_layoutget(lo);
   2173			trace_pnfs_update_layout(ino, pos, count,
   2174				iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
   2175			pnfs_put_layout_hdr(lo);
   2176			goto lookup_again;
   2177		}
   2178	} else {
   2179		pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
   2180	}
   2181
   2182out_put_layout_hdr:
   2183	if (first)
   2184		pnfs_clear_first_layoutget(lo);
   2185	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
   2186				 PNFS_UPDATE_LAYOUT_EXIT);
   2187	pnfs_put_layout_hdr(lo);
   2188out:
   2189	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
   2190			"(%s, offset: %llu, length: %llu)\n",
   2191			__func__, ino->i_sb->s_id,
   2192			(unsigned long long)NFS_FILEID(ino),
   2193			IS_ERR_OR_NULL(lseg) ? "not found" : "found",
    2194			iomode == IOMODE_RW ? "read/write" : "read-only",
   2195			(unsigned long long)pos,
   2196			(unsigned long long)count);
   2197	return lseg;
   2198out_unlock:
   2199	spin_unlock(&ino->i_lock);
   2200	goto out_put_layout_hdr;
   2201}
   2202EXPORT_SYMBOL_GPL(pnfs_update_layout);
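        /*
         * Minimal usage sketch (illustrative only, error handling elided): a
         * caller wanting a read layout covering the first 4096 bytes of a
         * file might do
         *
         *	lseg = pnfs_update_layout(inode, ctx, 0, 4096,
         *				  IOMODE_READ, false, GFP_KERNEL);
         *	if (IS_ERR(lseg))
         *		return PTR_ERR(lseg);
         *	if (!lseg)
         *		... fall back to I/O through the MDS ...
         *
         * See pnfs_generic_pg_init_read() below for an in-tree caller.
         */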
   2203
   2204static bool
   2205pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
   2206{
   2207	switch (range->iomode) {
   2208	case IOMODE_READ:
   2209	case IOMODE_RW:
   2210		break;
   2211	default:
   2212		return false;
   2213	}
   2214	if (range->offset == NFS4_MAX_UINT64)
   2215		return false;
   2216	if (range->length == 0)
   2217		return false;
   2218	if (range->length != NFS4_MAX_UINT64 &&
   2219	    range->length > NFS4_MAX_UINT64 - range->offset)
   2220		return false;
   2221	return true;
   2222}
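        /*
         * Illustrative rejects (hypothetical values): a range whose offset is
         * NFS4_MAX_UINT64, a zero-length range, and a range such as
         * { .offset = NFS4_MAX_UINT64 - 10, .length = 20 }, whose end would
         * wrap past NFS4_MAX_UINT64, all fail the sanity check above.
         */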
   2223
   2224static struct pnfs_layout_hdr *
   2225_pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx)
   2226{
   2227	struct pnfs_layout_hdr *lo;
   2228
   2229	spin_lock(&ino->i_lock);
   2230	lo = pnfs_find_alloc_layout(ino, ctx, nfs_io_gfp_mask());
   2231	if (!lo)
   2232		goto out_unlock;
   2233	if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
   2234		goto out_unlock;
   2235	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
   2236		goto out_unlock;
   2237	if (pnfs_layoutgets_blocked(lo))
   2238		goto out_unlock;
   2239	if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags))
   2240		goto out_unlock;
   2241	nfs_layoutget_begin(lo);
   2242	spin_unlock(&ino->i_lock);
   2243	_add_to_server_list(lo, NFS_SERVER(ino));
   2244	return lo;
   2245
   2246out_unlock:
   2247	spin_unlock(&ino->i_lock);
   2248	pnfs_put_layout_hdr(lo);
   2249	return NULL;
   2250}
   2251
   2252static void _lgopen_prepare_attached(struct nfs4_opendata *data,
   2253				     struct nfs_open_context *ctx)
   2254{
   2255	struct inode *ino = data->dentry->d_inode;
   2256	struct pnfs_layout_range rng = {
   2257		.iomode = (data->o_arg.fmode & FMODE_WRITE) ?
   2258			  IOMODE_RW: IOMODE_READ,
   2259		.offset = 0,
   2260		.length = NFS4_MAX_UINT64,
   2261	};
   2262	struct nfs4_layoutget *lgp;
   2263	struct pnfs_layout_hdr *lo;
   2264
   2265	/* Heuristic: don't send layoutget if we have cached data */
   2266	if (rng.iomode == IOMODE_READ &&
   2267	   (i_size_read(ino) == 0 || ino->i_mapping->nrpages != 0))
   2268		return;
   2269
   2270	lo = _pnfs_grab_empty_layout(ino, ctx);
   2271	if (!lo)
   2272		return;
   2273	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng,
   2274					     nfs_io_gfp_mask());
   2275	if (!lgp) {
   2276		pnfs_clear_first_layoutget(lo);
   2277		nfs_layoutget_end(lo);
   2278		pnfs_put_layout_hdr(lo);
   2279		return;
   2280	}
   2281	lgp->lo = lo;
   2282	data->lgp = lgp;
   2283	data->o_arg.lg_args = &lgp->args;
   2284	data->o_res.lg_res = &lgp->res;
   2285}
   2286
   2287static void _lgopen_prepare_floating(struct nfs4_opendata *data,
   2288				     struct nfs_open_context *ctx)
   2289{
   2290	struct inode *ino = data->dentry->d_inode;
   2291	struct pnfs_layout_range rng = {
   2292		.iomode = (data->o_arg.fmode & FMODE_WRITE) ?
   2293			  IOMODE_RW: IOMODE_READ,
   2294		.offset = 0,
   2295		.length = NFS4_MAX_UINT64,
   2296	};
   2297	struct nfs4_layoutget *lgp;
   2298
   2299	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, &rng,
   2300					     nfs_io_gfp_mask());
   2301	if (!lgp)
   2302		return;
   2303	data->lgp = lgp;
   2304	data->o_arg.lg_args = &lgp->args;
   2305	data->o_res.lg_res = &lgp->res;
   2306}
   2307
   2308void pnfs_lgopen_prepare(struct nfs4_opendata *data,
   2309			 struct nfs_open_context *ctx)
   2310{
   2311	struct nfs_server *server = NFS_SERVER(data->dir->d_inode);
   2312
   2313	if (!(pnfs_enabled_sb(server) &&
   2314	      server->pnfs_curr_ld->flags & PNFS_LAYOUTGET_ON_OPEN))
   2315		return;
   2316	/* Could check on max_ops, but currently hardcoded high enough */
   2317	if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN))
   2318		return;
   2319	if (data->lgp)
   2320		return;
   2321	if (data->state)
   2322		_lgopen_prepare_attached(data, ctx);
   2323	else
   2324		_lgopen_prepare_floating(data, ctx);
   2325}
   2326
   2327void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
   2328		       struct nfs_open_context *ctx)
   2329{
   2330	struct pnfs_layout_hdr *lo;
   2331	struct pnfs_layout_segment *lseg;
   2332	struct nfs_server *srv = NFS_SERVER(ino);
   2333	u32 iomode;
   2334
   2335	if (!lgp)
   2336		return;
   2337	dprintk("%s: entered with status %i\n", __func__, lgp->res.status);
   2338	if (lgp->res.status) {
   2339		switch (lgp->res.status) {
   2340		default:
   2341			break;
   2342		/*
   2343		 * Halt lgopen attempts if the server doesn't recognise
   2344		 * the "current stateid" value, the layout type, or the
   2345		 * layoutget operation as being valid.
   2346		 * Also if it complains about too many ops in the compound
    2347		 * or about the request/reply being too big.
   2348		 */
   2349		case -NFS4ERR_BAD_STATEID:
   2350		case -NFS4ERR_NOTSUPP:
   2351		case -NFS4ERR_REP_TOO_BIG:
   2352		case -NFS4ERR_REP_TOO_BIG_TO_CACHE:
   2353		case -NFS4ERR_REQ_TOO_BIG:
   2354		case -NFS4ERR_TOO_MANY_OPS:
   2355		case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
   2356			srv->caps &= ~NFS_CAP_LGOPEN;
   2357		}
   2358		return;
   2359	}
   2360	if (!lgp->lo) {
   2361		lo = _pnfs_grab_empty_layout(ino, ctx);
   2362		if (!lo)
   2363			return;
   2364		lgp->lo = lo;
   2365	} else
   2366		lo = lgp->lo;
   2367
   2368	lseg = pnfs_layout_process(lgp);
   2369	if (!IS_ERR(lseg)) {
   2370		iomode = lgp->args.range.iomode;
   2371		pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
   2372		pnfs_put_lseg(lseg);
   2373	}
   2374}
   2375
   2376void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
   2377{
   2378	if (lgp != NULL) {
   2379		if (lgp->lo) {
   2380			pnfs_clear_first_layoutget(lgp->lo);
   2381			nfs_layoutget_end(lgp->lo);
   2382		}
   2383		pnfs_layoutget_free(lgp);
   2384	}
   2385}
   2386
   2387struct pnfs_layout_segment *
   2388pnfs_layout_process(struct nfs4_layoutget *lgp)
   2389{
   2390	struct pnfs_layout_hdr *lo = lgp->lo;
   2391	struct nfs4_layoutget_res *res = &lgp->res;
   2392	struct pnfs_layout_segment *lseg;
   2393	struct inode *ino = lo->plh_inode;
   2394	LIST_HEAD(free_me);
   2395
   2396	if (!pnfs_sanity_check_layout_range(&res->range))
   2397		return ERR_PTR(-EINVAL);
   2398
   2399	/* Inject layout blob into I/O device driver */
   2400	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
   2401	if (IS_ERR_OR_NULL(lseg)) {
   2402		if (!lseg)
   2403			lseg = ERR_PTR(-ENOMEM);
   2404
   2405		dprintk("%s: Could not allocate layout: error %ld\n",
   2406		       __func__, PTR_ERR(lseg));
   2407		return lseg;
   2408	}
   2409
   2410	pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
   2411
   2412	spin_lock(&ino->i_lock);
   2413	if (pnfs_layoutgets_blocked(lo)) {
   2414		dprintk("%s forget reply due to state\n", __func__);
   2415		goto out_forget;
   2416	}
   2417
   2418	if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
   2419	    !pnfs_is_first_layoutget(lo))
   2420		goto out_forget;
   2421
   2422	if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
   2423		/* existing state ID, make sure the sequence number matches. */
   2424		if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
   2425			if (!pnfs_layout_is_valid(lo))
   2426				lo->plh_barrier = 0;
   2427			dprintk("%s forget reply due to sequence\n", __func__);
   2428			goto out_forget;
   2429		}
   2430		pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
   2431	} else if (pnfs_layout_is_valid(lo)) {
   2432		/*
   2433		 * We got an entirely new state ID.  Mark all segments for the
   2434		 * inode invalid, and retry the layoutget
   2435		 */
   2436		struct pnfs_layout_range range = {
   2437			.iomode = IOMODE_ANY,
   2438			.length = NFS4_MAX_UINT64,
   2439		};
   2440		pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0);
   2441		goto out_forget;
   2442	} else {
   2443		/* We have a completely new layout */
   2444		pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
   2445	}
   2446
   2447	pnfs_get_lseg(lseg);
   2448	pnfs_layout_insert_lseg(lo, lseg, &free_me);
   2449
   2450
   2451	if (res->return_on_close)
   2452		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
   2453
   2454	spin_unlock(&ino->i_lock);
   2455	pnfs_free_lseg_list(&free_me);
   2456	return lseg;
   2457
   2458out_forget:
   2459	spin_unlock(&ino->i_lock);
   2460	lseg->pls_layout = lo;
   2461	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
   2462	return ERR_PTR(-EAGAIN);
   2463}
   2464
   2465/**
   2466 * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
   2467 * @lo: pointer to layout header
   2468 * @tmp_list: list header to be used with pnfs_free_lseg_list()
   2469 * @return_range: describe layout segment ranges to be returned
   2470 * @seq: stateid seqid to match
   2471 *
    2472 * This function is mainly intended for use by the layoutrecall path.
    2473 * It attempts to free matching layout segments immediately, or else
    2474 * marks them for return as soon as their reference counts drop to zero.
    2475 *
    2476 * Returns
    2477 * - 0: a layoutreturn needs to be scheduled.
    2478 * - EBUSY: there are layout segments still in use.
    2479 * - ENOENT: there are no layout segments that need to be returned.
   2480 */
   2481int
   2482pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
   2483				struct list_head *tmp_list,
   2484				const struct pnfs_layout_range *return_range,
   2485				u32 seq)
   2486{
   2487	struct pnfs_layout_segment *lseg, *next;
   2488	int remaining = 0;
   2489
   2490	dprintk("%s:Begin lo %p\n", __func__, lo);
   2491
   2492	assert_spin_locked(&lo->plh_inode->i_lock);
   2493
   2494	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
   2495		tmp_list = &lo->plh_return_segs;
   2496
   2497	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
   2498		if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
   2499			dprintk("%s: marking lseg %p iomode %d "
   2500				"offset %llu length %llu\n", __func__,
   2501				lseg, lseg->pls_range.iomode,
   2502				lseg->pls_range.offset,
   2503				lseg->pls_range.length);
   2504			if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
   2505				tmp_list = &lo->plh_return_segs;
   2506			if (mark_lseg_invalid(lseg, tmp_list))
   2507				continue;
   2508			remaining++;
   2509			set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
   2510		}
   2511
   2512	if (remaining) {
   2513		pnfs_set_plh_return_info(lo, return_range->iomode, seq);
   2514		return -EBUSY;
   2515	}
   2516
   2517	if (!list_empty(&lo->plh_return_segs)) {
   2518		pnfs_set_plh_return_info(lo, return_range->iomode, seq);
   2519		return 0;
   2520	}
   2521
   2522	return -ENOENT;
   2523}
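        /*
         * Illustrative interpretation of the return codes above: the caller
         * pnfs_mark_layout_for_return() below attempts an immediate
         * LAYOUTRETURN on 0 or -ENOENT, while on -EBUSY it merely flushes
         * outstanding commits and lets the final pnfs_put_lseg() trigger the
         * layoutreturn.
         */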
   2524
   2525static void
   2526pnfs_mark_layout_for_return(struct inode *inode,
   2527			    const struct pnfs_layout_range *range)
   2528{
   2529	struct pnfs_layout_hdr *lo;
   2530	bool return_now = false;
   2531
   2532	spin_lock(&inode->i_lock);
   2533	lo = NFS_I(inode)->layout;
   2534	if (!pnfs_layout_is_valid(lo)) {
   2535		spin_unlock(&inode->i_lock);
   2536		return;
   2537	}
   2538	pnfs_set_plh_return_info(lo, range->iomode, 0);
   2539	/*
   2540	 * mark all matching lsegs so that we are sure to have no live
   2541	 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
   2542	 * for how it works.
   2543	 */
   2544	if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
   2545		const struct cred *cred;
   2546		nfs4_stateid stateid;
   2547		enum pnfs_iomode iomode;
   2548
   2549		return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
   2550		spin_unlock(&inode->i_lock);
   2551		if (return_now)
   2552			pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
   2553	} else {
   2554		spin_unlock(&inode->i_lock);
   2555		nfs_commit_inode(inode, 0);
   2556	}
   2557}
   2558
   2559void pnfs_error_mark_layout_for_return(struct inode *inode,
   2560				       struct pnfs_layout_segment *lseg)
   2561{
   2562	struct pnfs_layout_range range = {
   2563		.iomode = lseg->pls_range.iomode,
   2564		.offset = 0,
   2565		.length = NFS4_MAX_UINT64,
   2566	};
   2567
   2568	pnfs_mark_layout_for_return(inode, &range);
   2569}
   2570EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
   2571
   2572static bool
   2573pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo)
   2574{
   2575	return pnfs_layout_is_valid(lo) &&
   2576		!test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) &&
   2577		!test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
   2578}
   2579
   2580static struct pnfs_layout_segment *
   2581pnfs_find_first_lseg(struct pnfs_layout_hdr *lo,
   2582		     const struct pnfs_layout_range *range,
   2583		     enum pnfs_iomode iomode)
   2584{
   2585	struct pnfs_layout_segment *lseg;
   2586
   2587	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
   2588		if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
   2589			continue;
   2590		if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
   2591			continue;
   2592		if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY)
   2593			continue;
   2594		if (pnfs_lseg_range_intersecting(&lseg->pls_range, range))
   2595			return lseg;
   2596	}
   2597	return NULL;
   2598}
   2599
   2600/* Find open file states whose mode matches that of the range */
   2601static bool
   2602pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
   2603				 const struct pnfs_layout_range *range)
   2604{
   2605	struct list_head *head;
   2606	struct nfs_open_context *ctx;
   2607	fmode_t mode = 0;
   2608
   2609	if (!pnfs_layout_can_be_returned(lo) ||
   2610	    !pnfs_find_first_lseg(lo, range, range->iomode))
   2611		return false;
   2612
   2613	head = &NFS_I(lo->plh_inode)->open_files;
   2614	list_for_each_entry_rcu(ctx, head, list) {
   2615		if (ctx->state)
   2616			mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE);
   2617	}
   2618
   2619	switch (range->iomode) {
   2620	default:
   2621		break;
   2622	case IOMODE_READ:
   2623		mode &= ~FMODE_WRITE;
   2624		break;
   2625	case IOMODE_RW:
   2626		if (pnfs_find_first_lseg(lo, range, IOMODE_READ))
   2627			mode &= ~FMODE_READ;
   2628	}
   2629	return mode == 0;
   2630}
   2631
   2632static int
   2633pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
   2634{
   2635	const struct pnfs_layout_range *range = data;
   2636	struct pnfs_layout_hdr *lo;
   2637	struct inode *inode;
   2638restart:
   2639	rcu_read_lock();
   2640	list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
   2641		if (!pnfs_layout_can_be_returned(lo) ||
   2642		    test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
   2643			continue;
   2644		inode = lo->plh_inode;
   2645		spin_lock(&inode->i_lock);
   2646		if (!pnfs_should_return_unused_layout(lo, range)) {
   2647			spin_unlock(&inode->i_lock);
   2648			continue;
   2649		}
   2650		spin_unlock(&inode->i_lock);
   2651		inode = pnfs_grab_inode_layout_hdr(lo);
   2652		if (!inode)
   2653			continue;
   2654		rcu_read_unlock();
   2655		pnfs_mark_layout_for_return(inode, range);
   2656		iput(inode);
   2657		cond_resched();
   2658		goto restart;
   2659	}
   2660	rcu_read_unlock();
   2661	return 0;
   2662}
   2663
   2664void
   2665pnfs_layout_return_unused_byclid(struct nfs_client *clp,
   2666				 enum pnfs_iomode iomode)
   2667{
   2668	struct pnfs_layout_range range = {
   2669		.iomode = iomode,
   2670		.offset = 0,
   2671		.length = NFS4_MAX_UINT64,
   2672	};
   2673
   2674	nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver,
   2675			&range);
   2676}
   2677
   2678void
   2679pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
   2680{
   2681	if (pgio->pg_lseg == NULL ||
   2682	    test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
   2683		return;
   2684	pnfs_put_lseg(pgio->pg_lseg);
   2685	pgio->pg_lseg = NULL;
   2686}
   2687EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
   2688
   2689/*
   2690 * Check for any intersection between the request and the pgio->pg_lseg,
   2691 * and if none, put this pgio->pg_lseg away.
   2692 */
   2693void
   2694pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
   2695{
   2696	if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
   2697		pnfs_put_lseg(pgio->pg_lseg);
   2698		pgio->pg_lseg = NULL;
   2699	}
   2700}
   2701EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
   2702
   2703void
   2704pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
   2705{
   2706	u64 rd_size;
   2707
   2708	pnfs_generic_pg_check_layout(pgio);
   2709	pnfs_generic_pg_check_range(pgio, req);
   2710	if (pgio->pg_lseg == NULL) {
   2711		if (pgio->pg_dreq == NULL)
   2712			rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
   2713		else
   2714			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
   2715
   2716		pgio->pg_lseg =
   2717			pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
   2718					   req_offset(req), rd_size,
   2719					   IOMODE_READ, false,
   2720					   nfs_io_gfp_mask());
   2721		if (IS_ERR(pgio->pg_lseg)) {
   2722			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
   2723			pgio->pg_lseg = NULL;
   2724			return;
   2725		}
   2726	}
   2727	/* If no lseg, fall back to read through mds */
   2728	if (pgio->pg_lseg == NULL)
   2729		nfs_pageio_reset_read_mds(pgio);
   2730
   2731}
   2732EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
   2733
   2734void
   2735pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
   2736			   struct nfs_page *req, u64 wb_size)
   2737{
   2738	pnfs_generic_pg_check_layout(pgio);
   2739	pnfs_generic_pg_check_range(pgio, req);
   2740	if (pgio->pg_lseg == NULL) {
   2741		pgio->pg_lseg =
   2742			pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
   2743					   req_offset(req), wb_size, IOMODE_RW,
   2744					   false, nfs_io_gfp_mask());
   2745		if (IS_ERR(pgio->pg_lseg)) {
   2746			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
   2747			pgio->pg_lseg = NULL;
   2748			return;
   2749		}
   2750	}
   2751	/* If no lseg, fall back to write through mds */
   2752	if (pgio->pg_lseg == NULL)
   2753		nfs_pageio_reset_write_mds(pgio);
   2754}
   2755EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
   2756
   2757void
   2758pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
   2759{
   2760	if (desc->pg_lseg) {
   2761		pnfs_put_lseg(desc->pg_lseg);
   2762		desc->pg_lseg = NULL;
   2763	}
   2764}
   2765EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
   2766
   2767/*
   2768 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
   2769 * of bytes (maximum @req->wb_bytes) that can be coalesced.
   2770 */
   2771size_t
   2772pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
   2773		     struct nfs_page *prev, struct nfs_page *req)
   2774{
   2775	unsigned int size;
   2776	u64 seg_end, req_start, seg_left;
   2777
   2778	size = nfs_generic_pg_test(pgio, prev, req);
   2779	if (!size)
   2780		return 0;
   2781
   2782	/*
   2783	 * 'size' contains the number of bytes left in the current page (up
   2784	 * to the original size asked for in @req->wb_bytes).
   2785	 *
   2786	 * Calculate how many bytes are left in the layout segment
    2787	 * and if there are fewer bytes than 'size', return that instead.
    2788	 *
    2789	 * Note also that 'seg_end' below is the offset of the first byte
    2790	 * that lies outside the pnfs_layout_range.
   2791	 *
   2792	 */
   2793	if (pgio->pg_lseg) {
   2794		seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset,
   2795				     pgio->pg_lseg->pls_range.length);
   2796		req_start = req_offset(req);
   2797
   2798		/* start of request is past the last byte of this segment */
   2799		if (req_start >= seg_end)
   2800			return 0;
   2801
   2802		/* adjust 'size' iff there are fewer bytes left in the
   2803		 * segment than what nfs_generic_pg_test returned */
   2804		seg_left = seg_end - req_start;
   2805		if (seg_left < size)
   2806			size = (unsigned int)seg_left;
   2807	}
   2808
   2809	return size;
   2810}
   2811EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
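        /*
         * Worked example (hypothetical numbers): with a layout segment
         * covering offset 0 for 8192 bytes, a request starting at offset 4096
         * gives seg_end = 8192 and seg_left = 4096, so a 'size' of 8192 from
         * nfs_generic_pg_test() is clamped down to 4096.  A request starting
         * at or beyond offset 8192 lies outside the segment and returns 0.
         */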
   2812
   2813int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
   2814{
   2815	struct nfs_pageio_descriptor pgio;
   2816
   2817	/* Resend all requests through the MDS */
   2818	nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
   2819			      hdr->completion_ops);
   2820	set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
   2821	return nfs_pageio_resend(&pgio, hdr);
   2822}
   2823EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
   2824
   2825static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
   2826{
   2827
   2828	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
   2829	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
   2830	    PNFS_LAYOUTRET_ON_ERROR) {
   2831		pnfs_return_layout(hdr->inode);
   2832	}
   2833	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
   2834		hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
   2835}
   2836
   2837/*
   2838 * Called by non rpc-based layout drivers
   2839 */
   2840void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
   2841{
   2842	if (likely(!hdr->pnfs_error)) {
   2843		pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
   2844				hdr->mds_offset + hdr->res.count);
   2845		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
   2846	}
   2847	trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
   2848	if (unlikely(hdr->pnfs_error))
   2849		pnfs_ld_handle_write_error(hdr);
   2850	hdr->mds_ops->rpc_release(hdr);
   2851}
   2852EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
   2853
   2854static void
   2855pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
   2856		struct nfs_pgio_header *hdr)
   2857{
   2858	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
   2859
   2860	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
   2861		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
   2862		nfs_pageio_reset_write_mds(desc);
   2863		mirror->pg_recoalesce = 1;
   2864	}
   2865	hdr->completion_ops->completion(hdr);
   2866}
   2867
   2868static enum pnfs_try_status
   2869pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
   2870			const struct rpc_call_ops *call_ops,
   2871			struct pnfs_layout_segment *lseg,
   2872			int how)
   2873{
   2874	struct inode *inode = hdr->inode;
   2875	enum pnfs_try_status trypnfs;
   2876	struct nfs_server *nfss = NFS_SERVER(inode);
   2877
   2878	hdr->mds_ops = call_ops;
   2879
   2880	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
   2881		inode->i_ino, hdr->args.count, hdr->args.offset, how);
   2882	trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
   2883	if (trypnfs != PNFS_NOT_ATTEMPTED)
   2884		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
   2885	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
   2886	return trypnfs;
   2887}
   2888
   2889static void
   2890pnfs_do_write(struct nfs_pageio_descriptor *desc,
   2891	      struct nfs_pgio_header *hdr, int how)
   2892{
   2893	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
   2894	struct pnfs_layout_segment *lseg = desc->pg_lseg;
   2895	enum pnfs_try_status trypnfs;
   2896
   2897	trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
   2898	switch (trypnfs) {
   2899	case PNFS_NOT_ATTEMPTED:
   2900		pnfs_write_through_mds(desc, hdr);
   2901		break;
   2902	case PNFS_ATTEMPTED:
   2903		break;
   2904	case PNFS_TRY_AGAIN:
   2905		/* cleanup hdr and prepare to redo pnfs */
   2906		if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
   2907			struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
   2908			list_splice_init(&hdr->pages, &mirror->pg_list);
   2909			mirror->pg_recoalesce = 1;
   2910		}
   2911		hdr->mds_ops->rpc_release(hdr);
   2912	}
   2913}
   2914
   2915static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
   2916{
   2917	pnfs_put_lseg(hdr->lseg);
   2918	nfs_pgio_header_free(hdr);
   2919}
   2920
   2921int
   2922pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
   2923{
   2924	struct nfs_pgio_header *hdr;
   2925	int ret;
   2926
   2927	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
   2928	if (!hdr) {
   2929		desc->pg_error = -ENOMEM;
   2930		return desc->pg_error;
   2931	}
   2932	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
   2933
   2934	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
   2935	ret = nfs_generic_pgio(desc, hdr);
   2936	if (!ret)
   2937		pnfs_do_write(desc, hdr, desc->pg_ioflags);
   2938
   2939	return ret;
   2940}
   2941EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
   2942
   2943int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
   2944{
   2945	struct nfs_pageio_descriptor pgio;
   2946
   2947	/* Resend all requests through the MDS */
   2948	nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
   2949	return nfs_pageio_resend(&pgio, hdr);
   2950}
   2951EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
   2952
   2953static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
   2954{
   2955	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
   2956	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
   2957	    PNFS_LAYOUTRET_ON_ERROR) {
   2958		pnfs_return_layout(hdr->inode);
   2959	}
   2960	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
   2961		hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
   2962}
   2963
   2964/*
   2965 * Called by non rpc-based layout drivers
   2966 */
   2967void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
   2968{
   2969	if (likely(!hdr->pnfs_error))
   2970		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
   2971	trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
   2972	if (unlikely(hdr->pnfs_error))
   2973		pnfs_ld_handle_read_error(hdr);
   2974	hdr->mds_ops->rpc_release(hdr);
   2975}
   2976EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
   2977
   2978static void
   2979pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
   2980		struct nfs_pgio_header *hdr)
   2981{
   2982	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
   2983
   2984	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
   2985		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
   2986		nfs_pageio_reset_read_mds(desc);
   2987		mirror->pg_recoalesce = 1;
   2988	}
   2989	hdr->completion_ops->completion(hdr);
   2990}
   2991
   2992/*
   2993 * Call the appropriate parallel I/O subsystem read function.
   2994 */
   2995static enum pnfs_try_status
   2996pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
   2997		       const struct rpc_call_ops *call_ops,
   2998		       struct pnfs_layout_segment *lseg)
   2999{
   3000	struct inode *inode = hdr->inode;
   3001	struct nfs_server *nfss = NFS_SERVER(inode);
   3002	enum pnfs_try_status trypnfs;
   3003
   3004	hdr->mds_ops = call_ops;
   3005
   3006	dprintk("%s: Reading ino:%lu %u@%llu\n",
   3007		__func__, inode->i_ino, hdr->args.count, hdr->args.offset);
   3008
   3009	trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
   3010	if (trypnfs != PNFS_NOT_ATTEMPTED)
   3011		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
   3012	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
   3013	return trypnfs;
   3014}
   3015
   3016/* Resend all requests through pnfs. */
   3017void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr,
   3018			   unsigned int mirror_idx)
   3019{
   3020	struct nfs_pageio_descriptor pgio;
   3021
   3022	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
   3023		/* Prevent deadlocks with layoutreturn! */
   3024		pnfs_put_lseg(hdr->lseg);
   3025		hdr->lseg = NULL;
   3026
   3027		nfs_pageio_init_read(&pgio, hdr->inode, false,
   3028					hdr->completion_ops);
   3029		pgio.pg_mirror_idx = mirror_idx;
   3030		hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
   3031	}
   3032}
   3033EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
   3034
   3035static void
   3036pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
   3037{
   3038	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
   3039	struct pnfs_layout_segment *lseg = desc->pg_lseg;
   3040	enum pnfs_try_status trypnfs;
   3041
   3042	trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
   3043	switch (trypnfs) {
   3044	case PNFS_NOT_ATTEMPTED:
   3045		pnfs_read_through_mds(desc, hdr);
   3046		break;
   3047	case PNFS_ATTEMPTED:
   3048		break;
   3049	case PNFS_TRY_AGAIN:
   3050		/* cleanup hdr and prepare to redo pnfs */
   3051		if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
   3052			struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
   3053			list_splice_init(&hdr->pages, &mirror->pg_list);
   3054			mirror->pg_recoalesce = 1;
   3055		}
   3056		hdr->mds_ops->rpc_release(hdr);
   3057	}
   3058}
   3059
   3060static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
   3061{
   3062	pnfs_put_lseg(hdr->lseg);
   3063	nfs_pgio_header_free(hdr);
   3064}
   3065
   3066int
   3067pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
   3068{
   3069	struct nfs_pgio_header *hdr;
   3070	int ret;
   3071
   3072	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
   3073	if (!hdr) {
   3074		desc->pg_error = -ENOMEM;
   3075		return desc->pg_error;
   3076	}
   3077	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
   3078	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
   3079	ret = nfs_generic_pgio(desc, hdr);
   3080	if (!ret)
   3081		pnfs_do_read(desc, hdr);
   3082	return ret;
   3083}
   3084EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
   3085
   3086static void pnfs_clear_layoutcommitting(struct inode *inode)
   3087{
   3088	unsigned long *bitlock = &NFS_I(inode)->flags;
   3089
   3090	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
   3091	smp_mb__after_atomic();
   3092	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
   3093}
   3094
   3095/*
   3096 * There can be multiple RW segments.
   3097 */
   3098static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
   3099{
   3100	struct pnfs_layout_segment *lseg;
   3101
   3102	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
   3103		if (lseg->pls_range.iomode == IOMODE_RW &&
   3104		    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
   3105			list_add(&lseg->pls_lc_list, listp);
   3106	}
   3107}
   3108
   3109static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
   3110{
   3111	struct pnfs_layout_segment *lseg, *tmp;
   3112
   3113	/* Matched by references in pnfs_set_layoutcommit */
   3114	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
   3115		list_del_init(&lseg->pls_lc_list);
   3116		pnfs_put_lseg(lseg);
   3117	}
   3118
   3119	pnfs_clear_layoutcommitting(inode);
   3120}
   3121
   3122void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
   3123{
   3124	pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
   3125}
   3126EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
   3127
   3128void
   3129pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
   3130		loff_t end_pos)
   3131{
   3132	struct nfs_inode *nfsi = NFS_I(inode);
   3133	bool mark_as_dirty = false;
   3134
   3135	spin_lock(&inode->i_lock);
   3136	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
   3137		nfsi->layout->plh_lwb = end_pos;
   3138		mark_as_dirty = true;
    3139		dprintk("%s: Set layoutcommit for inode %lu\n",
   3140			__func__, inode->i_ino);
   3141	} else if (end_pos > nfsi->layout->plh_lwb)
   3142		nfsi->layout->plh_lwb = end_pos;
   3143	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
   3144		/* references matched in nfs4_layoutcommit_release */
   3145		pnfs_get_lseg(lseg);
   3146	}
   3147	spin_unlock(&inode->i_lock);
   3148	dprintk("%s: lseg %p end_pos %llu\n",
   3149		__func__, lseg, nfsi->layout->plh_lwb);
   3150
   3151	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
   3152	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
   3153	if (mark_as_dirty)
   3154		mark_inode_dirty_sync(inode);
   3155}
   3156EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
   3157
   3158void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
   3159{
   3160	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
   3161
   3162	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
   3163		nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
   3164	pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
   3165}
   3166
   3167/*
   3168 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
   3169 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
   3170 * data to disk to allow the server to recover the data if it crashes.
   3171 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
   3172 * is off, and a COMMIT is sent to a data server, or
   3173 * if WRITEs to a data server return NFS_DATA_SYNC.
   3174 */
   3175int
   3176pnfs_layoutcommit_inode(struct inode *inode, bool sync)
   3177{
   3178	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
   3179	struct nfs4_layoutcommit_data *data;
   3180	struct nfs_inode *nfsi = NFS_I(inode);
   3181	loff_t end_pos;
   3182	int status;
   3183
   3184	if (!pnfs_layoutcommit_outstanding(inode))
   3185		return 0;
   3186
   3187	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
   3188
   3189	status = -EAGAIN;
   3190	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
   3191		if (!sync)
   3192			goto out;
   3193		status = wait_on_bit_lock_action(&nfsi->flags,
   3194				NFS_INO_LAYOUTCOMMITTING,
   3195				nfs_wait_bit_killable,
   3196				TASK_KILLABLE);
   3197		if (status)
   3198			goto out;
   3199	}
   3200
   3201	status = -ENOMEM;
   3202	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
   3203	data = kzalloc(sizeof(*data), nfs_io_gfp_mask());
   3204	if (!data)
   3205		goto clear_layoutcommitting;
   3206
   3207	status = 0;
   3208	spin_lock(&inode->i_lock);
   3209	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
   3210		goto out_unlock;
   3211
   3212	INIT_LIST_HEAD(&data->lseg_list);
   3213	pnfs_list_write_lseg(inode, &data->lseg_list);
   3214
   3215	end_pos = nfsi->layout->plh_lwb;
   3216
   3217	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
   3218	data->cred = get_cred(nfsi->layout->plh_lc_cred);
   3219	spin_unlock(&inode->i_lock);
   3220
   3221	data->args.inode = inode;
   3222	nfs_fattr_init(&data->fattr);
   3223	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
   3224	data->res.fattr = &data->fattr;
   3225	if (end_pos != 0)
   3226		data->args.lastbytewritten = end_pos - 1;
   3227	else
   3228		data->args.lastbytewritten = U64_MAX;
   3229	data->res.server = NFS_SERVER(inode);
   3230
   3231	if (ld->prepare_layoutcommit) {
   3232		status = ld->prepare_layoutcommit(&data->args);
   3233		if (status) {
   3234			put_cred(data->cred);
   3235			spin_lock(&inode->i_lock);
   3236			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
   3237			if (end_pos > nfsi->layout->plh_lwb)
   3238				nfsi->layout->plh_lwb = end_pos;
   3239			goto out_unlock;
   3240		}
   3241	}
   3242
   3243
   3244	status = nfs4_proc_layoutcommit(data, sync);
   3245out:
   3246	if (status)
   3247		mark_inode_dirty_sync(inode);
   3248	dprintk("<-- %s status %d\n", __func__, status);
   3249	return status;
   3250out_unlock:
   3251	spin_unlock(&inode->i_lock);
   3252	kfree(data);
   3253clear_layoutcommitting:
   3254	pnfs_clear_layoutcommitting(inode);
   3255	goto out;
   3256}
   3257EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
   3258
   3259int
   3260pnfs_generic_sync(struct inode *inode, bool datasync)
   3261{
   3262	return pnfs_layoutcommit_inode(inode, true);
   3263}
   3264EXPORT_SYMBOL_GPL(pnfs_generic_sync);
   3265
   3266struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
   3267{
   3268	struct nfs4_threshold *thp;
   3269
   3270	thp = kzalloc(sizeof(*thp), nfs_io_gfp_mask());
   3271	if (!thp) {
   3272		dprintk("%s mdsthreshold allocation failed\n", __func__);
   3273		return NULL;
   3274	}
   3275	return thp;
   3276}
   3277
   3278#if IS_ENABLED(CONFIG_NFS_V4_2)
   3279int
   3280pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
   3281{
   3282	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
   3283	struct nfs_server *server = NFS_SERVER(inode);
   3284	struct nfs_inode *nfsi = NFS_I(inode);
   3285	struct nfs42_layoutstat_data *data;
   3286	struct pnfs_layout_hdr *hdr;
   3287	int status = 0;
   3288
   3289	if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
   3290		goto out;
   3291
   3292	if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
   3293		goto out;
   3294
   3295	if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
   3296		goto out;
   3297
   3298	spin_lock(&inode->i_lock);
   3299	if (!NFS_I(inode)->layout) {
   3300		spin_unlock(&inode->i_lock);
   3301		goto out_clear_layoutstats;
   3302	}
   3303	hdr = NFS_I(inode)->layout;
   3304	pnfs_get_layout_hdr(hdr);
   3305	spin_unlock(&inode->i_lock);
   3306
   3307	data = kzalloc(sizeof(*data), gfp_flags);
   3308	if (!data) {
   3309		status = -ENOMEM;
   3310		goto out_put;
   3311	}
   3312
   3313	data->args.fh = NFS_FH(inode);
   3314	data->args.inode = inode;
   3315	status = ld->prepare_layoutstats(&data->args);
   3316	if (status)
   3317		goto out_free;
   3318
   3319	status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);
   3320
   3321out:
   3322	dprintk("%s returns %d\n", __func__, status);
   3323	return status;
   3324
   3325out_free:
   3326	kfree(data);
   3327out_put:
   3328	pnfs_put_layout_hdr(hdr);
   3329out_clear_layoutstats:
   3330	smp_mb__before_atomic();
   3331	clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
   3332	smp_mb__after_atomic();
   3333	goto out;
   3334}
   3335EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
   3336#endif
   3337
   3338unsigned int layoutstats_timer;
   3339module_param(layoutstats_timer, uint, 0644);
   3340EXPORT_SYMBOL_GPL(layoutstats_timer);