cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

flexfilelayout.c (67742B)


      1// SPDX-License-Identifier: GPL-2.0-only
      2/*
      3 * Module for pnfs flexfile layout driver.
      4 *
      5 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
      6 *
      7 * Tao Peng <bergwolf@primarydata.com>
      8 */
      9
     10#include <linux/nfs_fs.h>
     11#include <linux/nfs_mount.h>
     12#include <linux/nfs_page.h>
     13#include <linux/module.h>
     14#include <linux/sched/mm.h>
     15
     16#include <linux/sunrpc/metrics.h>
     17
     18#include "flexfilelayout.h"
     19#include "../nfs4session.h"
     20#include "../nfs4idmap.h"
     21#include "../internal.h"
     22#include "../delegation.h"
     23#include "../nfs4trace.h"
     24#include "../iostat.h"
     25#include "../nfs.h"
     26#include "../nfs42.h"
     27
     28#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
     29
     30#define FF_LAYOUT_POLL_RETRY_MAX     (15*HZ)
     31#define FF_LAYOUTRETURN_MAXERR 20
     32
     33static unsigned short io_maxretrans;
     34
     35static const struct pnfs_commit_ops ff_layout_commit_ops;
     36static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
     37		struct nfs_pgio_header *hdr);
     38static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
     39			       struct nfs42_layoutstat_devinfo *devinfo,
     40			       int dev_limit);
     41static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
     42			      const struct nfs42_layoutstat_devinfo *devinfo,
     43			      struct nfs4_ff_layout_mirror *mirror);
     44
     45static struct pnfs_layout_hdr *
     46ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
     47{
     48	struct nfs4_flexfile_layout *ffl;
     49
     50	ffl = kzalloc(sizeof(*ffl), gfp_flags);
     51	if (ffl) {
     52		pnfs_init_ds_commit_info(&ffl->commit_info);
     53		INIT_LIST_HEAD(&ffl->error_list);
     54		INIT_LIST_HEAD(&ffl->mirrors);
     55		ffl->last_report_time = ktime_get();
     56		ffl->commit_info.ops = &ff_layout_commit_ops;
     57		return &ffl->generic_hdr;
      58	}
      59	return NULL;
     60}
     61
     62static void
     63ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
     64{
     65	struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo);
     66	struct nfs4_ff_layout_ds_err *err, *n;
     67
     68	list_for_each_entry_safe(err, n, &ffl->error_list, list) {
     69		list_del(&err->list);
     70		kfree(err);
     71	}
     72	kfree_rcu(ffl, generic_hdr.plh_rcu);
     73}
     74
     75static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
     76{
     77	__be32 *p;
     78
     79	p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
     80	if (unlikely(p == NULL))
     81		return -ENOBUFS;
     82	stateid->type = NFS4_PNFS_DS_STATEID_TYPE;
     83	memcpy(stateid->data, p, NFS4_STATEID_SIZE);
     84	dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
     85		p[0], p[1], p[2], p[3]);
     86	return 0;
     87}
     88
     89static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
     90{
     91	__be32 *p;
     92
     93	p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
     94	if (unlikely(!p))
     95		return -ENOBUFS;
     96	memcpy(devid, p, NFS4_DEVICEID4_SIZE);
     97	nfs4_print_deviceid(devid);
     98	return 0;
     99}
    100
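        /* Decode an XDR opaque<> filehandle, rejecting sizes above NFS_MAXFHSIZE. */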
    101static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
    102{
    103	__be32 *p;
    104
    105	p = xdr_inline_decode(xdr, 4);
    106	if (unlikely(!p))
    107		return -ENOBUFS;
    108	fh->size = be32_to_cpup(p++);
    109	if (fh->size > NFS_MAXFHSIZE) {
    110		printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
    111		       fh->size);
    112		return -EOVERFLOW;
    113	}
    114	/* fh.data */
    115	p = xdr_inline_decode(xdr, fh->size);
    116	if (unlikely(!p))
    117		return -ENOBUFS;
    118	memcpy(&fh->data, p, fh->size);
    119	dprintk("%s: fh len %d\n", __func__, fh->size);
    120
    121	return 0;
    122}
    123
    124/*
    125 * Currently only stringified uids and gids are accepted.
     126 * I.e., Kerberos is not supported to the DSes, so no principals.
    127 *
    128 * That means that one common function will suffice, but when
     129 * principals are added, this should be split to accommodate
    130 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
    131 */
    132static int
    133decode_name(struct xdr_stream *xdr, u32 *id)
    134{
    135	__be32 *p;
    136	int len;
    137
     138	/* opaque_length(4) */
    139	p = xdr_inline_decode(xdr, 4);
    140	if (unlikely(!p))
    141		return -ENOBUFS;
    142	len = be32_to_cpup(p++);
    143	if (len < 0)
    144		return -EINVAL;
    145
    146	dprintk("%s: len %u\n", __func__, len);
    147
    148	/* opaque body */
    149	p = xdr_inline_decode(xdr, len);
    150	if (unlikely(!p))
    151		return -ENOBUFS;
    152
    153	if (!nfs_map_string_to_numeric((char *)p, len, id))
    154		return -EINVAL;
    155
    156	return 0;
    157}
    158
    159static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
    160		const struct nfs4_ff_layout_mirror *m2)
    161{
    162	int i, j;
    163
    164	if (m1->fh_versions_cnt != m2->fh_versions_cnt)
    165		return false;
    166	for (i = 0; i < m1->fh_versions_cnt; i++) {
    167		bool found_fh = false;
    168		for (j = 0; j < m2->fh_versions_cnt; j++) {
    169			if (nfs_compare_fh(&m1->fh_versions[i],
    170					&m2->fh_versions[j]) == 0) {
    171				found_fh = true;
    172				break;
    173			}
    174		}
    175		if (!found_fh)
    176			return false;
    177	}
    178	return true;
    179}
    180
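        /*
         * Deduplicate mirrors: if a mirror with the same deviceid and
         * filehandle list already exists on this layout, grab a reference
         * to it and return it; otherwise add the new mirror to the list.
         */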
    181static struct nfs4_ff_layout_mirror *
    182ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
    183		struct nfs4_ff_layout_mirror *mirror)
    184{
    185	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
    186	struct nfs4_ff_layout_mirror *pos;
    187	struct inode *inode = lo->plh_inode;
    188
    189	spin_lock(&inode->i_lock);
    190	list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
    191		if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0)
    192			continue;
    193		if (!ff_mirror_match_fh(mirror, pos))
    194			continue;
    195		if (refcount_inc_not_zero(&pos->ref)) {
    196			spin_unlock(&inode->i_lock);
    197			return pos;
    198		}
    199	}
    200	list_add(&mirror->mirrors, &ff_layout->mirrors);
    201	mirror->layout = lo;
    202	spin_unlock(&inode->i_lock);
    203	return mirror;
    204}
    205
    206static void
    207ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
    208{
    209	struct inode *inode;
    210	if (mirror->layout == NULL)
    211		return;
    212	inode = mirror->layout->plh_inode;
    213	spin_lock(&inode->i_lock);
    214	list_del(&mirror->mirrors);
    215	spin_unlock(&inode->i_lock);
    216	mirror->layout = NULL;
    217}
    218
    219static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
    220{
    221	struct nfs4_ff_layout_mirror *mirror;
    222
    223	mirror = kzalloc(sizeof(*mirror), gfp_flags);
    224	if (mirror != NULL) {
    225		spin_lock_init(&mirror->lock);
    226		refcount_set(&mirror->ref, 1);
    227		INIT_LIST_HEAD(&mirror->mirrors);
    228	}
    229	return mirror;
    230}
    231
    232static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
    233{
    234	const struct cred	*cred;
    235
    236	ff_layout_remove_mirror(mirror);
    237	kfree(mirror->fh_versions);
    238	cred = rcu_access_pointer(mirror->ro_cred);
    239	put_cred(cred);
    240	cred = rcu_access_pointer(mirror->rw_cred);
    241	put_cred(cred);
    242	nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
    243	kfree(mirror);
    244}
    245
    246static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
    247{
    248	if (mirror != NULL && refcount_dec_and_test(&mirror->ref))
    249		ff_layout_free_mirror(mirror);
    250}
    251
    252static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
    253{
    254	u32 i;
    255
    256	for (i = 0; i < fls->mirror_array_cnt; i++)
    257		ff_layout_put_mirror(fls->mirror_array[i]);
    258}
    259
    260static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
    261{
    262	if (fls) {
    263		ff_layout_free_mirror_array(fls);
    264		kfree(fls);
    265	}
    266}
    267
    268static bool
    269ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
    270		struct pnfs_layout_segment *l2)
    271{
    272	const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
     273	const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);
    274	u32 i;
    275
    276	if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
    277		return false;
    278	for (i = 0; i < fl1->mirror_array_cnt; i++) {
    279		if (fl1->mirror_array[i] != fl2->mirror_array[i])
    280			return false;
    281	}
    282	return true;
    283}
    284
    285static bool
    286ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
    287		const struct pnfs_layout_range *l2)
    288{
    289	u64 end1, end2;
    290
    291	if (l1->iomode != l2->iomode)
    292		return l1->iomode != IOMODE_READ;
    293	end1 = pnfs_calc_offset_end(l1->offset, l1->length);
    294	end2 = pnfs_calc_offset_end(l2->offset, l2->length);
    295	if (end1 < l2->offset)
    296		return false;
    297	if (end2 < l1->offset)
    298		return true;
    299	return l2->offset <= l1->offset;
    300}
    301
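        /*
         * Attempt to merge 'new' with 'old': only possible if the iomodes
         * match, the ranges overlap or abut, and both segments reference
         * the same mirrors. On success, 'new' absorbs the range and ROC
         * flag of 'old', which can then be dropped.
         */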
    302static bool
    303ff_lseg_merge(struct pnfs_layout_segment *new,
    304		struct pnfs_layout_segment *old)
    305{
    306	u64 new_end, old_end;
    307
    308	if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
    309		return false;
    310	if (new->pls_range.iomode != old->pls_range.iomode)
    311		return false;
    312	old_end = pnfs_calc_offset_end(old->pls_range.offset,
    313			old->pls_range.length);
    314	if (old_end < new->pls_range.offset)
    315		return false;
    316	new_end = pnfs_calc_offset_end(new->pls_range.offset,
    317			new->pls_range.length);
    318	if (new_end < old->pls_range.offset)
    319		return false;
    320	if (!ff_lseg_match_mirrors(new, old))
    321		return false;
    322
    323	/* Mergeable: copy info from 'old' to 'new' */
    324	if (new_end < old_end)
    325		new_end = old_end;
    326	if (new->pls_range.offset < old->pls_range.offset)
    327		new->pls_range.offset = old->pls_range.offset;
    328	new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
    329			new_end);
    330	if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
    331		set_bit(NFS_LSEG_ROC, &new->pls_flags);
    332	return true;
    333}
    334
    335static void
    336ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
    337		struct pnfs_layout_segment *lseg,
    338		struct list_head *free_me)
    339{
    340	pnfs_generic_layout_insert_lseg(lo, lseg,
    341			ff_lseg_range_is_after,
    342			ff_lseg_merge,
    343			free_me);
    344}
    345
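        /*
         * Selection sort of the mirror array by descending efficiency, so
         * that reads try the most efficient mirror first.
         */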
    346static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
    347{
    348	int i, j;
    349
    350	for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
    351		for (j = i + 1; j < fls->mirror_array_cnt; j++)
    352			if (fls->mirror_array[i]->efficiency <
    353			    fls->mirror_array[j]->efficiency)
    354				swap(fls->mirror_array[i],
    355				     fls->mirror_array[j]);
    356	}
    357}
    358
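        /*
         * Decode the XDR-encoded ff_layout4 body returned by LAYOUTGET:
         * the stripe unit and mirror count, then for each mirror its
         * deviceid, efficiency, stateid, filehandle versions and the
         * synthesized user/group credentials, followed by optional flags
         * and the stats report interval.
         */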
    359static struct pnfs_layout_segment *
    360ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
    361		     struct nfs4_layoutget_res *lgr,
    362		     gfp_t gfp_flags)
    363{
    364	struct pnfs_layout_segment *ret;
    365	struct nfs4_ff_layout_segment *fls = NULL;
    366	struct xdr_stream stream;
    367	struct xdr_buf buf;
    368	struct page *scratch;
    369	u64 stripe_unit;
    370	u32 mirror_array_cnt;
    371	__be32 *p;
    372	int i, rc;
    373
    374	dprintk("--> %s\n", __func__);
    375	scratch = alloc_page(gfp_flags);
    376	if (!scratch)
    377		return ERR_PTR(-ENOMEM);
    378
    379	xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
    380			      lgr->layoutp->len);
    381	xdr_set_scratch_page(&stream, scratch);
    382
    383	/* stripe unit and mirror_array_cnt */
    384	rc = -EIO;
    385	p = xdr_inline_decode(&stream, 8 + 4);
    386	if (!p)
    387		goto out_err_free;
    388
    389	p = xdr_decode_hyper(p, &stripe_unit);
    390	mirror_array_cnt = be32_to_cpup(p++);
    391	dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
    392		stripe_unit, mirror_array_cnt);
    393
    394	if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
    395	    mirror_array_cnt == 0)
    396		goto out_err_free;
    397
    398	rc = -ENOMEM;
    399	fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt),
    400			gfp_flags);
    401	if (!fls)
    402		goto out_err_free;
    403
    404	fls->mirror_array_cnt = mirror_array_cnt;
    405	fls->stripe_unit = stripe_unit;
    406
    407	for (i = 0; i < fls->mirror_array_cnt; i++) {
    408		struct nfs4_ff_layout_mirror *mirror;
    409		struct cred *kcred;
    410		const struct cred __rcu *cred;
    411		kuid_t uid;
    412		kgid_t gid;
    413		u32 ds_count, fh_count, id;
    414		int j;
    415
    416		rc = -EIO;
    417		p = xdr_inline_decode(&stream, 4);
    418		if (!p)
    419			goto out_err_free;
    420		ds_count = be32_to_cpup(p);
    421
    422		/* FIXME: allow for striping? */
    423		if (ds_count != 1)
    424			goto out_err_free;
    425
    426		fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
    427		if (fls->mirror_array[i] == NULL) {
    428			rc = -ENOMEM;
    429			goto out_err_free;
    430		}
    431
    432		fls->mirror_array[i]->ds_count = ds_count;
    433
    434		/* deviceid */
    435		rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid);
    436		if (rc)
    437			goto out_err_free;
    438
    439		/* efficiency */
    440		rc = -EIO;
    441		p = xdr_inline_decode(&stream, 4);
    442		if (!p)
    443			goto out_err_free;
    444		fls->mirror_array[i]->efficiency = be32_to_cpup(p);
    445
    446		/* stateid */
    447		rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid);
    448		if (rc)
    449			goto out_err_free;
    450
    451		/* fh */
    452		rc = -EIO;
    453		p = xdr_inline_decode(&stream, 4);
    454		if (!p)
    455			goto out_err_free;
    456		fh_count = be32_to_cpup(p);
    457
    458		fls->mirror_array[i]->fh_versions =
    459			kcalloc(fh_count, sizeof(struct nfs_fh),
    460				gfp_flags);
    461		if (fls->mirror_array[i]->fh_versions == NULL) {
    462			rc = -ENOMEM;
    463			goto out_err_free;
    464		}
    465
    466		for (j = 0; j < fh_count; j++) {
    467			rc = decode_nfs_fh(&stream,
    468					   &fls->mirror_array[i]->fh_versions[j]);
    469			if (rc)
    470				goto out_err_free;
    471		}
    472
    473		fls->mirror_array[i]->fh_versions_cnt = fh_count;
    474
    475		/* user */
    476		rc = decode_name(&stream, &id);
    477		if (rc)
    478			goto out_err_free;
    479
    480		uid = make_kuid(&init_user_ns, id);
    481
    482		/* group */
    483		rc = decode_name(&stream, &id);
    484		if (rc)
    485			goto out_err_free;
    486
    487		gid = make_kgid(&init_user_ns, id);
    488
    489		if (gfp_flags & __GFP_FS)
    490			kcred = prepare_kernel_cred(NULL);
    491		else {
    492			unsigned int nofs_flags = memalloc_nofs_save();
    493			kcred = prepare_kernel_cred(NULL);
    494			memalloc_nofs_restore(nofs_flags);
    495		}
    496		rc = -ENOMEM;
    497		if (!kcred)
    498			goto out_err_free;
    499		kcred->fsuid = uid;
    500		kcred->fsgid = gid;
    501		cred = RCU_INITIALIZER(kcred);
    502
    503		if (lgr->range.iomode == IOMODE_READ)
    504			rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
    505		else
    506			rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
    507
    508		mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
    509		if (mirror != fls->mirror_array[i]) {
    510			/* swap cred ptrs so free_mirror will clean up old */
    511			if (lgr->range.iomode == IOMODE_READ) {
    512				cred = xchg(&mirror->ro_cred, cred);
    513				rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred);
    514			} else {
    515				cred = xchg(&mirror->rw_cred, cred);
    516				rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred);
    517			}
    518			ff_layout_free_mirror(fls->mirror_array[i]);
    519			fls->mirror_array[i] = mirror;
    520		}
    521
    522		dprintk("%s: iomode %s uid %u gid %u\n", __func__,
    523			lgr->range.iomode == IOMODE_READ ? "READ" : "RW",
    524			from_kuid(&init_user_ns, uid),
    525			from_kgid(&init_user_ns, gid));
    526	}
    527
    528	p = xdr_inline_decode(&stream, 4);
    529	if (!p)
    530		goto out_sort_mirrors;
    531	fls->flags = be32_to_cpup(p);
    532
    533	p = xdr_inline_decode(&stream, 4);
    534	if (!p)
    535		goto out_sort_mirrors;
     536	for (i = 0; i < fls->mirror_array_cnt; i++)
    537		fls->mirror_array[i]->report_interval = be32_to_cpup(p);
    538
    539out_sort_mirrors:
    540	ff_layout_sort_mirrors(fls);
    541	ret = &fls->generic_hdr;
    542	dprintk("<-- %s (success)\n", __func__);
    543out_free_page:
    544	__free_page(scratch);
    545	return ret;
    546out_err_free:
    547	_ff_layout_free_lseg(fls);
    548	ret = ERR_PTR(rc);
    549	dprintk("<-- %s (%d)\n", __func__, rc);
    550	goto out_free_page;
    551}
    552
    553static void
    554ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
    555{
    556	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
    557
    558	dprintk("--> %s\n", __func__);
    559
    560	if (lseg->pls_range.iomode == IOMODE_RW) {
    561		struct nfs4_flexfile_layout *ffl;
    562		struct inode *inode;
    563
    564		ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
    565		inode = ffl->generic_hdr.plh_inode;
    566		spin_lock(&inode->i_lock);
    567		pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg);
    568		spin_unlock(&inode->i_lock);
    569	}
    570	_ff_layout_free_lseg(fls);
    571}
    572
    573static void
    574nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
    575{
    576	/* first IO request? */
    577	if (atomic_inc_return(&timer->n_ops) == 1) {
    578		timer->start_time = now;
    579	}
    580}
    581
    582static ktime_t
    583nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
    584{
    585	ktime_t start;
    586
    587	if (atomic_dec_return(&timer->n_ops) < 0)
    588		WARN_ON_ONCE(1);
    589
    590	start = timer->start_time;
    591	timer->start_time = now;
    592	return ktime_sub(now, start);
    593}
    594
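        /*
         * Start the per-mirror busy timer for this I/O and return true if
         * the layoutstats report interval has elapsed, signalling that the
         * caller should send a LAYOUTSTATS report.
         */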
    595static bool
    596nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
    597			    struct nfs4_ff_layoutstat *layoutstat,
    598			    ktime_t now)
    599{
    600	s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
    601	struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
    602
    603	nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
    604	if (!mirror->start_time)
    605		mirror->start_time = now;
    606	if (mirror->report_interval != 0)
    607		report_interval = (s64)mirror->report_interval * 1000LL;
    608	else if (layoutstats_timer != 0)
    609		report_interval = (s64)layoutstats_timer * 1000LL;
    610	if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >=
    611			report_interval) {
    612		ffl->last_report_time = now;
    613		return true;
    614	}
    615
    616	return false;
    617}
    618
    619static void
    620nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
    621		__u64 requested)
    622{
    623	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
    624
    625	iostat->ops_requested++;
    626	iostat->bytes_requested += requested;
    627}
    628
    629static void
    630nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
    631		__u64 requested,
    632		__u64 completed,
    633		ktime_t time_completed,
    634		ktime_t time_started)
    635{
    636	struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
    637	ktime_t completion_time = ktime_sub(time_completed, time_started);
    638	ktime_t timer;
    639
    640	iostat->ops_completed++;
    641	iostat->bytes_completed += completed;
    642	iostat->bytes_not_delivered += requested - completed;
    643
    644	timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
    645	iostat->total_busy_time =
    646			ktime_add(iostat->total_busy_time, timer);
    647	iostat->aggregate_completion_time =
    648			ktime_add(iostat->aggregate_completion_time,
    649					completion_time);
    650}
    651
    652static void
    653nfs4_ff_layout_stat_io_start_read(struct inode *inode,
    654		struct nfs4_ff_layout_mirror *mirror,
    655		__u64 requested, ktime_t now)
    656{
    657	bool report;
    658
    659	spin_lock(&mirror->lock);
    660	report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
    661	nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
    662	set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
    663	spin_unlock(&mirror->lock);
    664
    665	if (report)
    666		pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
    667}
    668
    669static void
    670nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
    671		struct nfs4_ff_layout_mirror *mirror,
    672		__u64 requested,
    673		__u64 completed)
    674{
    675	spin_lock(&mirror->lock);
    676	nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
    677			requested, completed,
    678			ktime_get(), task->tk_start);
    679	set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
    680	spin_unlock(&mirror->lock);
    681}
    682
    683static void
    684nfs4_ff_layout_stat_io_start_write(struct inode *inode,
    685		struct nfs4_ff_layout_mirror *mirror,
    686		__u64 requested, ktime_t now)
    687{
    688	bool report;
    689
    690	spin_lock(&mirror->lock);
     691	report = nfs4_ff_layoutstat_start_io(mirror, &mirror->write_stat, now);
    692	nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
    693	set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
    694	spin_unlock(&mirror->lock);
    695
    696	if (report)
    697		pnfs_report_layoutstat(inode, nfs_io_gfp_mask());
    698}
    699
    700static void
    701nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
    702		struct nfs4_ff_layout_mirror *mirror,
    703		__u64 requested,
    704		__u64 completed,
    705		enum nfs3_stable_how committed)
    706{
    707	if (committed == NFS_UNSTABLE)
    708		requested = completed = 0;
    709
    710	spin_lock(&mirror->lock);
    711	nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
    712			requested, completed, ktime_get(), task->tk_start);
    713	set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags);
    714	spin_unlock(&mirror->lock);
    715}
    716
    717static void
    718ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
    719{
    720	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
    721
    722	if (devid)
    723		nfs4_mark_deviceid_unavailable(devid);
    724}
    725
    726static void
    727ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
    728{
    729	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
    730
    731	if (devid)
    732		nfs4_mark_deviceid_available(devid);
    733}
    734
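        /*
         * Walk the mirror array (kept sorted by efficiency) from start_idx
         * and return the first data server that can be prepared; when
         * check_device is set, mirrors whose device has been marked
         * unavailable are skipped.
         */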
    735static struct nfs4_pnfs_ds *
    736ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
    737			     u32 start_idx, u32 *best_idx,
    738			     bool check_device)
    739{
    740	struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
    741	struct nfs4_ff_layout_mirror *mirror;
    742	struct nfs4_pnfs_ds *ds;
    743	u32 idx;
    744
    745	/* mirrors are initially sorted by efficiency */
    746	for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
    747		mirror = FF_LAYOUT_COMP(lseg, idx);
    748		ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
    749		if (!ds)
    750			continue;
    751
    752		if (check_device &&
    753		    nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node))
    754			continue;
    755
    756		*best_idx = idx;
    757		return ds;
    758	}
    759
    760	return NULL;
    761}
    762
    763static struct nfs4_pnfs_ds *
    764ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
    765				 u32 start_idx, u32 *best_idx)
    766{
    767	return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
    768}
    769
    770static struct nfs4_pnfs_ds *
    771ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
    772				   u32 start_idx, u32 *best_idx)
    773{
    774	return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
    775}
    776
    777static struct nfs4_pnfs_ds *
    778ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
    779				  u32 start_idx, u32 *best_idx)
    780{
    781	struct nfs4_pnfs_ds *ds;
    782
    783	ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx);
    784	if (ds)
    785		return ds;
    786	return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx);
    787}
    788
    789static struct nfs4_pnfs_ds *
    790ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
    791			  u32 *best_idx)
    792{
    793	struct pnfs_layout_segment *lseg = pgio->pg_lseg;
    794	struct nfs4_pnfs_ds *ds;
    795
    796	ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx,
    797					       best_idx);
    798	if (ds || !pgio->pg_mirror_idx)
    799		return ds;
    800	return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx);
    801}
    802
    803static void
    804ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
    805		      struct nfs_page *req,
    806		      bool strict_iomode)
    807{
    808	pnfs_put_lseg(pgio->pg_lseg);
    809	pgio->pg_lseg =
    810		pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
    811				   req_offset(req), req->wb_bytes, IOMODE_READ,
    812				   strict_iomode, nfs_io_gfp_mask());
    813	if (IS_ERR(pgio->pg_lseg)) {
    814		pgio->pg_error = PTR_ERR(pgio->pg_lseg);
    815		pgio->pg_lseg = NULL;
    816	}
    817}
    818
    819static void
    820ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio,
    821			  struct nfs_page *req)
    822{
    823	pnfs_generic_pg_check_layout(pgio);
    824	pnfs_generic_pg_check_range(pgio, req);
    825}
    826
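        /*
         * Prepare the pageio descriptor for a pNFS read: pin a layout
         * segment, pick a data server and size pg_bsize to its rsize.
         * When no DS is usable this either falls back to reading through
         * the MDS or, if fallback is disallowed, sleeps and retries.
         */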
    827static void
    828ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
    829			struct nfs_page *req)
    830{
    831	struct nfs_pgio_mirror *pgm;
    832	struct nfs4_ff_layout_mirror *mirror;
    833	struct nfs4_pnfs_ds *ds;
    834	u32 ds_idx;
    835
    836retry:
    837	ff_layout_pg_check_layout(pgio, req);
    838	/* Use full layout for now */
    839	if (!pgio->pg_lseg) {
    840		ff_layout_pg_get_read(pgio, req, false);
    841		if (!pgio->pg_lseg)
    842			goto out_nolseg;
    843	}
    844	if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) {
    845		ff_layout_pg_get_read(pgio, req, true);
    846		if (!pgio->pg_lseg)
    847			goto out_nolseg;
    848	}
    849
    850	ds = ff_layout_get_ds_for_read(pgio, &ds_idx);
    851	if (!ds) {
    852		if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
    853			goto out_mds;
    854		pnfs_generic_pg_cleanup(pgio);
    855		/* Sleep for 1 second before retrying */
    856		ssleep(1);
    857		goto retry;
    858	}
    859
    860	mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
    861	pgm = &pgio->pg_mirrors[0];
    862	pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
    863
    864	pgio->pg_mirror_idx = ds_idx;
    865
    866	if (NFS_SERVER(pgio->pg_inode)->flags &
    867			(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
    868		pgio->pg_maxretrans = io_maxretrans;
    869	return;
    870out_nolseg:
    871	if (pgio->pg_error < 0)
    872		return;
    873out_mds:
    874	trace_pnfs_mds_fallback_pg_init_read(pgio->pg_inode,
    875			0, NFS4_MAX_UINT64, IOMODE_READ,
    876			NFS_I(pgio->pg_inode)->layout,
    877			pgio->pg_lseg);
    878	pgio->pg_maxretrans = 0;
    879	nfs_pageio_reset_read_mds(pgio);
    880}
    881
    882static void
    883ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
    884			struct nfs_page *req)
    885{
    886	struct nfs4_ff_layout_mirror *mirror;
    887	struct nfs_pgio_mirror *pgm;
    888	struct nfs4_pnfs_ds *ds;
    889	u32 i;
    890
    891retry:
    892	ff_layout_pg_check_layout(pgio, req);
    893	if (!pgio->pg_lseg) {
    894		pgio->pg_lseg =
    895			pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
    896					   req_offset(req), req->wb_bytes,
    897					   IOMODE_RW, false, nfs_io_gfp_mask());
    898		if (IS_ERR(pgio->pg_lseg)) {
    899			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
    900			pgio->pg_lseg = NULL;
    901			return;
    902		}
    903	}
    904	/* If no lseg, fall back to write through mds */
    905	if (pgio->pg_lseg == NULL)
    906		goto out_mds;
    907
    908	/* Use a direct mapping of ds_idx to pgio mirror_idx */
    909	if (pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))
    910		goto out_eagain;
    911
    912	for (i = 0; i < pgio->pg_mirror_count; i++) {
    913		mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
    914		ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
    915		if (!ds) {
    916			if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
    917				goto out_mds;
    918			pnfs_generic_pg_cleanup(pgio);
    919			/* Sleep for 1 second before retrying */
    920			ssleep(1);
    921			goto retry;
    922		}
    923		pgm = &pgio->pg_mirrors[i];
    924		pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
    925	}
    926
    927	if (NFS_SERVER(pgio->pg_inode)->flags &
    928			(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
    929		pgio->pg_maxretrans = io_maxretrans;
    930	return;
    931out_eagain:
    932	pnfs_generic_pg_cleanup(pgio);
    933	pgio->pg_error = -EAGAIN;
    934	return;
    935out_mds:
    936	trace_pnfs_mds_fallback_pg_init_write(pgio->pg_inode,
    937			0, NFS4_MAX_UINT64, IOMODE_RW,
    938			NFS_I(pgio->pg_inode)->layout,
    939			pgio->pg_lseg);
    940	pgio->pg_maxretrans = 0;
    941	nfs_pageio_reset_write_mds(pgio);
    942	pgio->pg_error = -EAGAIN;
    943}
    944
    945static unsigned int
    946ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
    947				    struct nfs_page *req)
    948{
    949	if (!pgio->pg_lseg) {
    950		pgio->pg_lseg =
    951			pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
    952					   req_offset(req), req->wb_bytes,
    953					   IOMODE_RW, false, nfs_io_gfp_mask());
    954		if (IS_ERR(pgio->pg_lseg)) {
    955			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
    956			pgio->pg_lseg = NULL;
    957			goto out;
    958		}
    959	}
    960	if (pgio->pg_lseg)
    961		return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
    962
    963	trace_pnfs_mds_fallback_pg_get_mirror_count(pgio->pg_inode,
    964			0, NFS4_MAX_UINT64, IOMODE_RW,
    965			NFS_I(pgio->pg_inode)->layout,
    966			pgio->pg_lseg);
    967	/* no lseg means that pnfs is not in use, so no mirroring here */
    968	nfs_pageio_reset_write_mds(pgio);
    969out:
    970	return 1;
    971}
    972
    973static u32
    974ff_layout_pg_set_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
    975{
    976	u32 old = desc->pg_mirror_idx;
    977
    978	desc->pg_mirror_idx = idx;
    979	return old;
    980}
    981
    982static struct nfs_pgio_mirror *
    983ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx)
    984{
    985	return &desc->pg_mirrors[idx];
    986}
    987
    988static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
    989	.pg_init = ff_layout_pg_init_read,
    990	.pg_test = pnfs_generic_pg_test,
    991	.pg_doio = pnfs_generic_pg_readpages,
    992	.pg_cleanup = pnfs_generic_pg_cleanup,
    993};
    994
    995static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
    996	.pg_init = ff_layout_pg_init_write,
    997	.pg_test = pnfs_generic_pg_test,
    998	.pg_doio = pnfs_generic_pg_writepages,
    999	.pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
   1000	.pg_cleanup = pnfs_generic_pg_cleanup,
   1001	.pg_get_mirror = ff_layout_pg_get_mirror_write,
   1002	.pg_set_mirror = ff_layout_pg_set_mirror_write,
   1003};
   1004
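        /*
         * Kick off a layoutcommit, then requeue the failed write either
         * through pNFS (via the reschedule_io completion op) or through
         * the MDS.
         */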
   1005static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
   1006{
   1007	struct rpc_task *task = &hdr->task;
   1008
   1009	pnfs_layoutcommit_inode(hdr->inode, false);
   1010
   1011	if (retry_pnfs) {
   1012		dprintk("%s Reset task %5u for i/o through pNFS "
   1013			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
   1014			hdr->task.tk_pid,
   1015			hdr->inode->i_sb->s_id,
   1016			(unsigned long long)NFS_FILEID(hdr->inode),
   1017			hdr->args.count,
   1018			(unsigned long long)hdr->args.offset);
   1019
   1020		hdr->completion_ops->reschedule_io(hdr);
   1021		return;
   1022	}
   1023
   1024	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
   1025		dprintk("%s Reset task %5u for i/o through MDS "
   1026			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
   1027			hdr->task.tk_pid,
   1028			hdr->inode->i_sb->s_id,
   1029			(unsigned long long)NFS_FILEID(hdr->inode),
   1030			hdr->args.count,
   1031			(unsigned long long)hdr->args.offset);
   1032
   1033		trace_pnfs_mds_fallback_write_done(hdr->inode,
   1034				hdr->args.offset, hdr->args.count,
   1035				IOMODE_RW, NFS_I(hdr->inode)->layout,
   1036				hdr->lseg);
   1037		task->tk_status = pnfs_write_done_resend_to_mds(hdr);
   1038	}
   1039}
   1040
   1041static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
   1042{
   1043	u32 idx = hdr->pgio_mirror_idx + 1;
   1044	u32 new_idx = 0;
   1045
   1046	if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx))
   1047		ff_layout_send_layouterror(hdr->lseg);
   1048	else
   1049		pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
   1050	pnfs_read_resend_pnfs(hdr, new_idx);
   1051}
   1052
   1053static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
   1054{
   1055	struct rpc_task *task = &hdr->task;
   1056
   1057	pnfs_layoutcommit_inode(hdr->inode, false);
   1058	pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg);
   1059
   1060	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
   1061		dprintk("%s Reset task %5u for i/o through MDS "
   1062			"(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
   1063			hdr->task.tk_pid,
   1064			hdr->inode->i_sb->s_id,
   1065			(unsigned long long)NFS_FILEID(hdr->inode),
   1066			hdr->args.count,
   1067			(unsigned long long)hdr->args.offset);
   1068
   1069		trace_pnfs_mds_fallback_read_done(hdr->inode,
   1070				hdr->args.offset, hdr->args.count,
   1071				IOMODE_READ, NFS_I(hdr->inode)->layout,
   1072				hdr->lseg);
   1073		task->tk_status = pnfs_read_done_resend_to_mds(hdr);
   1074	}
   1075}
   1076
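        /*
         * NFSv4 DS error handling: session errors schedule session
         * recovery, DELAY/GRACE delay the RPC task, invalid-layout errors
         * destroy the layout and redirect I/O to the MDS, and connection
         * errors delete the deviceid before retrying via pNFS or the MDS.
         */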
   1077static int ff_layout_async_handle_error_v4(struct rpc_task *task,
   1078					   struct nfs4_state *state,
   1079					   struct nfs_client *clp,
   1080					   struct pnfs_layout_segment *lseg,
   1081					   u32 idx)
   1082{
   1083	struct pnfs_layout_hdr *lo = lseg->pls_layout;
   1084	struct inode *inode = lo->plh_inode;
   1085	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
   1086	struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
   1087
   1088	switch (task->tk_status) {
   1089	case -NFS4ERR_BADSESSION:
   1090	case -NFS4ERR_BADSLOT:
   1091	case -NFS4ERR_BAD_HIGH_SLOT:
   1092	case -NFS4ERR_DEADSESSION:
   1093	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
   1094	case -NFS4ERR_SEQ_FALSE_RETRY:
   1095	case -NFS4ERR_SEQ_MISORDERED:
   1096		dprintk("%s ERROR %d, Reset session. Exchangeid "
   1097			"flags 0x%x\n", __func__, task->tk_status,
   1098			clp->cl_exchange_flags);
   1099		nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
   1100		break;
   1101	case -NFS4ERR_DELAY:
   1102	case -NFS4ERR_GRACE:
   1103		rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
   1104		break;
   1105	case -NFS4ERR_RETRY_UNCACHED_REP:
   1106		break;
   1107	/* Invalidate Layout errors */
   1108	case -NFS4ERR_PNFS_NO_LAYOUT:
   1109	case -ESTALE:           /* mapped NFS4ERR_STALE */
   1110	case -EBADHANDLE:       /* mapped NFS4ERR_BADHANDLE */
   1111	case -EISDIR:           /* mapped NFS4ERR_ISDIR */
   1112	case -NFS4ERR_FHEXPIRED:
   1113	case -NFS4ERR_WRONG_TYPE:
   1114		dprintk("%s Invalid layout error %d\n", __func__,
   1115			task->tk_status);
   1116		/*
   1117		 * Destroy layout so new i/o will get a new layout.
   1118		 * Layout will not be destroyed until all current lseg
   1119		 * references are put. Mark layout as invalid to resend failed
   1120		 * i/o and all i/o waiting on the slot table to the MDS until
   1121		 * layout is destroyed and a new valid layout is obtained.
   1122		 */
   1123		pnfs_destroy_layout(NFS_I(inode));
   1124		rpc_wake_up(&tbl->slot_tbl_waitq);
   1125		goto reset;
   1126	/* RPC connection errors */
   1127	case -ECONNREFUSED:
   1128	case -EHOSTDOWN:
   1129	case -EHOSTUNREACH:
   1130	case -ENETUNREACH:
   1131	case -EIO:
   1132	case -ETIMEDOUT:
   1133	case -EPIPE:
   1134		dprintk("%s DS connection error %d\n", __func__,
   1135			task->tk_status);
   1136		nfs4_delete_deviceid(devid->ld, devid->nfs_client,
   1137				&devid->deviceid);
   1138		rpc_wake_up(&tbl->slot_tbl_waitq);
   1139		fallthrough;
   1140	default:
   1141		if (ff_layout_avoid_mds_available_ds(lseg))
   1142			return -NFS4ERR_RESET_TO_PNFS;
   1143reset:
   1144		dprintk("%s Retry through MDS. Error %d\n", __func__,
   1145			task->tk_status);
   1146		return -NFS4ERR_RESET_TO_MDS;
   1147	}
   1148	task->tk_status = 0;
   1149	return -EAGAIN;
   1150}
   1151
   1152/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
   1153static int ff_layout_async_handle_error_v3(struct rpc_task *task,
   1154					   struct pnfs_layout_segment *lseg,
   1155					   u32 idx)
   1156{
   1157	struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
   1158
   1159	switch (task->tk_status) {
   1160	/* File access problems. Don't mark the device as unavailable */
   1161	case -EACCES:
   1162	case -ESTALE:
   1163	case -EISDIR:
   1164	case -EBADHANDLE:
   1165	case -ELOOP:
   1166	case -ENOSPC:
   1167		break;
   1168	case -EJUKEBOX:
   1169		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
   1170		goto out_retry;
   1171	default:
   1172		dprintk("%s DS connection error %d\n", __func__,
   1173			task->tk_status);
   1174		nfs4_delete_deviceid(devid->ld, devid->nfs_client,
   1175				&devid->deviceid);
   1176	}
   1177	/* FIXME: Need to prevent infinite looping here. */
   1178	return -NFS4ERR_RESET_TO_PNFS;
   1179out_retry:
   1180	task->tk_status = 0;
   1181	rpc_restart_call_prepare(task);
   1182	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
   1183	return -EAGAIN;
   1184}
   1185
   1186static int ff_layout_async_handle_error(struct rpc_task *task,
   1187					struct nfs4_state *state,
   1188					struct nfs_client *clp,
   1189					struct pnfs_layout_segment *lseg,
   1190					u32 idx)
   1191{
   1192	int vers = clp->cl_nfs_mod->rpc_vers->number;
   1193
   1194	if (task->tk_status >= 0) {
   1195		ff_layout_mark_ds_reachable(lseg, idx);
   1196		return 0;
   1197	}
   1198
   1199	/* Handle the case of an invalid layout segment */
   1200	if (!pnfs_is_valid_lseg(lseg))
   1201		return -NFS4ERR_RESET_TO_PNFS;
   1202
   1203	switch (vers) {
   1204	case 3:
   1205		return ff_layout_async_handle_error_v3(task, lseg, idx);
   1206	case 4:
   1207		return ff_layout_async_handle_error_v4(task, state, clp,
   1208						       lseg, idx);
   1209	default:
   1210		/* should never happen */
   1211		WARN_ON_ONCE(1);
   1212		return 0;
   1213	}
   1214}
   1215
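        /*
         * Record a DS error for a later LAYOUTRETURN. Local transport
         * errnos are first mapped onto NFS4ERR_NXIO (or NFS4ERR_ACCESS for
         * -EACCES); NXIO additionally marks the device unreachable, and
         * all errors except DELAY/GRACE (and NXIO on reads) mark the
         * layout for return.
         */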
   1216static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
   1217					u32 idx, u64 offset, u64 length,
   1218					u32 *op_status, int opnum, int error)
   1219{
   1220	struct nfs4_ff_layout_mirror *mirror;
   1221	u32 status = *op_status;
   1222	int err;
   1223
   1224	if (status == 0) {
   1225		switch (error) {
   1226		case -ETIMEDOUT:
   1227		case -EPFNOSUPPORT:
   1228		case -EPROTONOSUPPORT:
   1229		case -EOPNOTSUPP:
   1230		case -ECONNREFUSED:
   1231		case -ECONNRESET:
   1232		case -EHOSTDOWN:
   1233		case -EHOSTUNREACH:
   1234		case -ENETUNREACH:
   1235		case -EADDRINUSE:
   1236		case -ENOBUFS:
   1237		case -EPIPE:
   1238		case -EPERM:
   1239			*op_status = status = NFS4ERR_NXIO;
   1240			break;
   1241		case -EACCES:
   1242			*op_status = status = NFS4ERR_ACCESS;
   1243			break;
   1244		default:
   1245			return;
   1246		}
   1247	}
   1248
   1249	mirror = FF_LAYOUT_COMP(lseg, idx);
   1250	err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
   1251				       mirror, offset, length, status, opnum,
   1252				       nfs_io_gfp_mask());
   1253
   1254	switch (status) {
   1255	case NFS4ERR_DELAY:
   1256	case NFS4ERR_GRACE:
   1257		break;
   1258	case NFS4ERR_NXIO:
   1259		ff_layout_mark_ds_unreachable(lseg, idx);
   1260		/*
   1261		 * Don't return the layout if this is a read and we still
   1262		 * have layouts to try
   1263		 */
   1264		if (opnum == OP_READ)
   1265			break;
   1266		fallthrough;
   1267	default:
   1268		pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
   1269						  lseg);
   1270	}
   1271
   1272	dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
   1273}
   1274
   1275/* NFS_PROTO call done callback routines */
   1276static int ff_layout_read_done_cb(struct rpc_task *task,
   1277				struct nfs_pgio_header *hdr)
   1278{
   1279	int err;
   1280
   1281	if (task->tk_status < 0) {
   1282		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
   1283					    hdr->args.offset, hdr->args.count,
   1284					    &hdr->res.op_status, OP_READ,
   1285					    task->tk_status);
   1286		trace_ff_layout_read_error(hdr);
   1287	}
   1288
   1289	err = ff_layout_async_handle_error(task, hdr->args.context->state,
   1290					   hdr->ds_clp, hdr->lseg,
   1291					   hdr->pgio_mirror_idx);
   1292
   1293	trace_nfs4_pnfs_read(hdr, err);
   1294	clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
   1295	clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
   1296	switch (err) {
   1297	case -NFS4ERR_RESET_TO_PNFS:
   1298		set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
   1299		return task->tk_status;
   1300	case -NFS4ERR_RESET_TO_MDS:
   1301		set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
   1302		return task->tk_status;
   1303	case -EAGAIN:
   1304		goto out_eagain;
   1305	}
   1306
   1307	return 0;
   1308out_eagain:
   1309	rpc_restart_call_prepare(task);
   1310	return -EAGAIN;
   1311}
   1312
   1313static bool
   1314ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
   1315{
   1316	return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
   1317}
   1318
   1319/*
   1320 * We reference the rpc_cred of the first WRITE that triggers the need for
   1321 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
   1322 * rfc5661 is not clear about which credential should be used.
   1323 *
    1324 * The flexfiles client should treat a FILE_SYNC reply from a DS as
    1325 * DATA_SYNC; following http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751,
    1326 * we therefore always send a layoutcommit after DS writes.
   1327 */
   1328static void
   1329ff_layout_set_layoutcommit(struct inode *inode,
   1330		struct pnfs_layout_segment *lseg,
   1331		loff_t end_offset)
   1332{
   1333	if (!ff_layout_need_layoutcommit(lseg))
   1334		return;
   1335
   1336	pnfs_set_layoutcommit(inode, lseg, end_offset);
   1337	dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
   1338		(unsigned long long) NFS_I(inode)->layout->plh_lwb);
   1339}
   1340
   1341static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
   1342		struct nfs_pgio_header *hdr)
   1343{
   1344	if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
   1345		return;
   1346	nfs4_ff_layout_stat_io_start_read(hdr->inode,
   1347			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
   1348			hdr->args.count,
   1349			task->tk_start);
   1350}
   1351
   1352static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
   1353		struct nfs_pgio_header *hdr)
   1354{
   1355	if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
   1356		return;
   1357	nfs4_ff_layout_stat_io_end_read(task,
   1358			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
   1359			hdr->args.count,
   1360			hdr->res.count);
   1361	set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
   1362}
   1363
   1364static int ff_layout_read_prepare_common(struct rpc_task *task,
   1365					 struct nfs_pgio_header *hdr)
   1366{
   1367	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
   1368		rpc_exit(task, -EIO);
   1369		return -EIO;
   1370	}
   1371
   1372	ff_layout_read_record_layoutstats_start(task, hdr);
   1373	return 0;
   1374}
   1375
   1376/*
   1377 * Call ops for the async read/write cases
   1378 * In the case of dense layouts, the offset needs to be reset to its
   1379 * original value.
   1380 */
   1381static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
   1382{
   1383	struct nfs_pgio_header *hdr = data;
   1384
   1385	if (ff_layout_read_prepare_common(task, hdr))
   1386		return;
   1387
   1388	rpc_call_start(task);
   1389}
   1390
   1391static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
   1392{
   1393	struct nfs_pgio_header *hdr = data;
   1394
   1395	if (nfs4_setup_sequence(hdr->ds_clp,
   1396				&hdr->args.seq_args,
   1397				&hdr->res.seq_res,
   1398				task))
   1399		return;
   1400
   1401	ff_layout_read_prepare_common(task, hdr);
   1402}
   1403
   1404static void ff_layout_read_call_done(struct rpc_task *task, void *data)
   1405{
   1406	struct nfs_pgio_header *hdr = data;
   1407
   1408	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
   1409	    task->tk_status == 0) {
   1410		nfs4_sequence_done(task, &hdr->res.seq_res);
   1411		return;
   1412	}
   1413
   1414	/* Note this may cause RPC to be resent */
   1415	hdr->mds_ops->rpc_call_done(task, hdr);
   1416}
   1417
   1418static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
   1419{
   1420	struct nfs_pgio_header *hdr = data;
   1421
   1422	ff_layout_read_record_layoutstats_done(task, hdr);
   1423	rpc_count_iostats_metrics(task,
   1424	    &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
   1425}
   1426
   1427static void ff_layout_read_release(void *data)
   1428{
   1429	struct nfs_pgio_header *hdr = data;
   1430
   1431	ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
   1432	if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags))
   1433		ff_layout_resend_pnfs_read(hdr);
   1434	else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
   1435		ff_layout_reset_read(hdr);
   1436	pnfs_generic_rw_release(data);
   1437}
   1438
   1439
   1440static int ff_layout_write_done_cb(struct rpc_task *task,
   1441				struct nfs_pgio_header *hdr)
   1442{
   1443	loff_t end_offs = 0;
   1444	int err;
   1445
   1446	if (task->tk_status < 0) {
   1447		ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
   1448					    hdr->args.offset, hdr->args.count,
   1449					    &hdr->res.op_status, OP_WRITE,
   1450					    task->tk_status);
   1451		trace_ff_layout_write_error(hdr);
   1452	}
   1453
   1454	err = ff_layout_async_handle_error(task, hdr->args.context->state,
   1455					   hdr->ds_clp, hdr->lseg,
   1456					   hdr->pgio_mirror_idx);
   1457
   1458	trace_nfs4_pnfs_write(hdr, err);
   1459	clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
   1460	clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
   1461	switch (err) {
   1462	case -NFS4ERR_RESET_TO_PNFS:
   1463		set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags);
   1464		return task->tk_status;
   1465	case -NFS4ERR_RESET_TO_MDS:
   1466		set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags);
   1467		return task->tk_status;
   1468	case -EAGAIN:
   1469		return -EAGAIN;
   1470	}
   1471
   1472	if (hdr->res.verf->committed == NFS_FILE_SYNC ||
   1473	    hdr->res.verf->committed == NFS_DATA_SYNC)
   1474		end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
   1475
   1476	/* Note: if the write is unstable, don't set end_offs until commit */
   1477	ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
   1478
    1479	/* zero out fattr since we don't care about DS attributes at all */
   1480	hdr->fattr.valid = 0;
   1481	if (task->tk_status >= 0)
   1482		nfs_writeback_update_inode(hdr);
   1483
   1484	return 0;
   1485}
   1486
   1487static int ff_layout_commit_done_cb(struct rpc_task *task,
   1488				     struct nfs_commit_data *data)
   1489{
   1490	int err;
   1491
   1492	if (task->tk_status < 0) {
   1493		ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
   1494					    data->args.offset, data->args.count,
   1495					    &data->res.op_status, OP_COMMIT,
   1496					    task->tk_status);
   1497		trace_ff_layout_commit_error(data);
   1498	}
   1499
   1500	err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
   1501					   data->lseg, data->ds_commit_index);
   1502
   1503	trace_nfs4_pnfs_commit_ds(data, err);
   1504	switch (err) {
   1505	case -NFS4ERR_RESET_TO_PNFS:
   1506		pnfs_generic_prepare_to_resend_writes(data);
   1507		return -EAGAIN;
   1508	case -NFS4ERR_RESET_TO_MDS:
   1509		pnfs_generic_prepare_to_resend_writes(data);
   1510		return -EAGAIN;
   1511	case -EAGAIN:
   1512		rpc_restart_call_prepare(task);
   1513		return -EAGAIN;
   1514	}
   1515
   1516	ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
   1517
   1518	return 0;
   1519}
   1520
   1521static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
   1522		struct nfs_pgio_header *hdr)
   1523{
   1524	if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
   1525		return;
   1526	nfs4_ff_layout_stat_io_start_write(hdr->inode,
   1527			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
   1528			hdr->args.count,
   1529			task->tk_start);
   1530}
   1531
   1532static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
   1533		struct nfs_pgio_header *hdr)
   1534{
   1535	if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
   1536		return;
   1537	nfs4_ff_layout_stat_io_end_write(task,
   1538			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
   1539			hdr->args.count, hdr->res.count,
   1540			hdr->res.verf->committed);
   1541	set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags);
   1542}
   1543
   1544static int ff_layout_write_prepare_common(struct rpc_task *task,
   1545					  struct nfs_pgio_header *hdr)
   1546{
   1547	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
   1548		rpc_exit(task, -EIO);
   1549		return -EIO;
   1550	}
   1551
   1552	ff_layout_write_record_layoutstats_start(task, hdr);
   1553	return 0;
   1554}
   1555
   1556static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
   1557{
   1558	struct nfs_pgio_header *hdr = data;
   1559
   1560	if (ff_layout_write_prepare_common(task, hdr))
   1561		return;
   1562
   1563	rpc_call_start(task);
   1564}
   1565
   1566static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
   1567{
   1568	struct nfs_pgio_header *hdr = data;
   1569
   1570	if (nfs4_setup_sequence(hdr->ds_clp,
   1571				&hdr->args.seq_args,
   1572				&hdr->res.seq_res,
   1573				task))
   1574		return;
   1575
   1576	ff_layout_write_prepare_common(task, hdr);
   1577}
   1578
   1579static void ff_layout_write_call_done(struct rpc_task *task, void *data)
   1580{
   1581	struct nfs_pgio_header *hdr = data;
   1582
   1583	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
   1584	    task->tk_status == 0) {
   1585		nfs4_sequence_done(task, &hdr->res.seq_res);
   1586		return;
   1587	}
   1588
   1589	/* Note this may cause RPC to be resent */
   1590	hdr->mds_ops->rpc_call_done(task, hdr);
   1591}
   1592
   1593static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
   1594{
   1595	struct nfs_pgio_header *hdr = data;
   1596
   1597	ff_layout_write_record_layoutstats_done(task, hdr);
   1598	rpc_count_iostats_metrics(task,
   1599	    &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
   1600}
   1601
   1602static void ff_layout_write_release(void *data)
   1603{
   1604	struct nfs_pgio_header *hdr = data;
   1605
   1606	ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
   1607	if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) {
   1608		ff_layout_send_layouterror(hdr->lseg);
   1609		ff_layout_reset_write(hdr, true);
   1610	} else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags))
   1611		ff_layout_reset_write(hdr, false);
   1612	pnfs_generic_rw_release(data);
   1613}
   1614
   1615static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
   1616		struct nfs_commit_data *cdata)
   1617{
   1618	if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
   1619		return;
   1620	nfs4_ff_layout_stat_io_start_write(cdata->inode,
   1621			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
   1622			0, task->tk_start);
   1623}
   1624
   1625static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
   1626		struct nfs_commit_data *cdata)
   1627{
   1628	struct nfs_page *req;
   1629	__u64 count = 0;
   1630
   1631	if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
   1632		return;
   1633
   1634	if (task->tk_status == 0) {
   1635		list_for_each_entry(req, &cdata->pages, wb_list)
   1636			count += req->wb_bytes;
   1637	}
   1638	nfs4_ff_layout_stat_io_end_write(task,
   1639			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
   1640			count, count, NFS_FILE_SYNC);
   1641	set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags);
   1642}
   1643
   1644static void ff_layout_commit_prepare_common(struct rpc_task *task,
   1645		struct nfs_commit_data *cdata)
   1646{
   1647	ff_layout_commit_record_layoutstats_start(task, cdata);
   1648}
   1649
   1650static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
   1651{
   1652	ff_layout_commit_prepare_common(task, data);
   1653	rpc_call_start(task);
   1654}
   1655
   1656static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
   1657{
   1658	struct nfs_commit_data *wdata = data;
   1659
   1660	if (nfs4_setup_sequence(wdata->ds_clp,
   1661				&wdata->args.seq_args,
   1662				&wdata->res.seq_res,
   1663				task))
   1664		return;
   1665	ff_layout_commit_prepare_common(task, data);
   1666}
   1667
   1668static void ff_layout_commit_done(struct rpc_task *task, void *data)
   1669{
   1670	pnfs_generic_write_commit_done(task, data);
   1671}
   1672
   1673static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
   1674{
   1675	struct nfs_commit_data *cdata = data;
   1676
   1677	ff_layout_commit_record_layoutstats_done(task, cdata);
   1678	rpc_count_iostats_metrics(task,
   1679	    &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
   1680}
   1681
   1682static void ff_layout_commit_release(void *data)
   1683{
   1684	struct nfs_commit_data *cdata = data;
   1685
   1686	ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
   1687	pnfs_generic_commit_release(data);
   1688}
   1689
   1690static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
   1691	.rpc_call_prepare = ff_layout_read_prepare_v3,
   1692	.rpc_call_done = ff_layout_read_call_done,
   1693	.rpc_count_stats = ff_layout_read_count_stats,
   1694	.rpc_release = ff_layout_read_release,
   1695};
   1696
   1697static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
   1698	.rpc_call_prepare = ff_layout_read_prepare_v4,
   1699	.rpc_call_done = ff_layout_read_call_done,
   1700	.rpc_count_stats = ff_layout_read_count_stats,
   1701	.rpc_release = ff_layout_read_release,
   1702};
   1703
   1704static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
   1705	.rpc_call_prepare = ff_layout_write_prepare_v3,
   1706	.rpc_call_done = ff_layout_write_call_done,
   1707	.rpc_count_stats = ff_layout_write_count_stats,
   1708	.rpc_release = ff_layout_write_release,
   1709};
   1710
   1711static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
   1712	.rpc_call_prepare = ff_layout_write_prepare_v4,
   1713	.rpc_call_done = ff_layout_write_call_done,
   1714	.rpc_count_stats = ff_layout_write_count_stats,
   1715	.rpc_release = ff_layout_write_release,
   1716};
   1717
   1718static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
   1719	.rpc_call_prepare = ff_layout_commit_prepare_v3,
   1720	.rpc_call_done = ff_layout_commit_done,
   1721	.rpc_count_stats = ff_layout_commit_count_stats,
   1722	.rpc_release = ff_layout_commit_release,
   1723};
   1724
   1725static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
   1726	.rpc_call_prepare = ff_layout_commit_prepare_v4,
   1727	.rpc_call_done = ff_layout_commit_done,
   1728	.rpc_count_stats = ff_layout_commit_count_stats,
   1729	.rpc_release = ff_layout_commit_release,
   1730};
   1731
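        /*
         * Read from a data server.  Returns PNFS_ATTEMPTED once the RPC has
         * been dispatched; on setup failure, returns PNFS_TRY_AGAIN if the
         * layout still prefers pNFS I/O, or PNFS_NOT_ATTEMPTED so that the
         * caller falls back to reading through the MDS.
         */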
   1732static enum pnfs_try_status
   1733ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
   1734{
   1735	struct pnfs_layout_segment *lseg = hdr->lseg;
   1736	struct nfs4_pnfs_ds *ds;
   1737	struct rpc_clnt *ds_clnt;
   1738	struct nfs4_ff_layout_mirror *mirror;
   1739	const struct cred *ds_cred;
   1740	loff_t offset = hdr->args.offset;
   1741	u32 idx = hdr->pgio_mirror_idx;
   1742	int vers;
   1743	struct nfs_fh *fh;
   1744
   1745	dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
   1746		__func__, hdr->inode->i_ino,
   1747		hdr->args.pgbase, (size_t)hdr->args.count, offset);
   1748
   1749	mirror = FF_LAYOUT_COMP(lseg, idx);
   1750	ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
   1751	if (!ds)
   1752		goto out_failed;
   1753
   1754	ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
   1755						   hdr->inode);
   1756	if (IS_ERR(ds_clnt))
   1757		goto out_failed;
   1758
   1759	ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
   1760	if (!ds_cred)
   1761		goto out_failed;
   1762
   1763	vers = nfs4_ff_layout_ds_version(mirror);
   1764
   1765	dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
   1766		ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers);
   1767
   1768	hdr->pgio_done_cb = ff_layout_read_done_cb;
   1769	refcount_inc(&ds->ds_clp->cl_count);
   1770	hdr->ds_clp = ds->ds_clp;
   1771	fh = nfs4_ff_layout_select_ds_fh(mirror);
   1772	if (fh)
   1773		hdr->args.fh = fh;
   1774
   1775	nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
   1776
   1777	/*
   1778	 * Note that if we ever decide to split across DSes,
   1779	 * then we may need to handle dense-like offsets.
   1780	 */
   1781	hdr->args.offset = offset;
   1782	hdr->mds_offset = offset;
   1783
   1784	/* Perform an asynchronous read to ds */
   1785	nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
   1786			  vers == 3 ? &ff_layout_read_call_ops_v3 :
   1787				      &ff_layout_read_call_ops_v4,
   1788			  0, RPC_TASK_SOFTCONN);
   1789	put_cred(ds_cred);
   1790	return PNFS_ATTEMPTED;
   1791
   1792out_failed:
   1793	if (ff_layout_avoid_mds_available_ds(lseg))
   1794		return PNFS_TRY_AGAIN;
   1795	trace_pnfs_mds_fallback_read_pagelist(hdr->inode,
   1796			hdr->args.offset, hdr->args.count,
   1797			IOMODE_READ, NFS_I(hdr->inode)->layout, lseg);
   1798	return PNFS_NOT_ATTEMPTED;
   1799}
   1800
   1801/* Perform async writes. */
   1802static enum pnfs_try_status
   1803ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
   1804{
   1805	struct pnfs_layout_segment *lseg = hdr->lseg;
   1806	struct nfs4_pnfs_ds *ds;
   1807	struct rpc_clnt *ds_clnt;
   1808	struct nfs4_ff_layout_mirror *mirror;
   1809	const struct cred *ds_cred;
   1810	loff_t offset = hdr->args.offset;
   1811	int vers;
   1812	struct nfs_fh *fh;
   1813	u32 idx = hdr->pgio_mirror_idx;
   1814
   1815	mirror = FF_LAYOUT_COMP(lseg, idx);
   1816	ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
   1817	if (!ds)
   1818		goto out_failed;
   1819
   1820	ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
   1821						   hdr->inode);
   1822	if (IS_ERR(ds_clnt))
   1823		goto out_failed;
   1824
   1825	ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred);
   1826	if (!ds_cred)
   1827		goto out_failed;
   1828
   1829	vers = nfs4_ff_layout_ds_version(mirror);
   1830
   1831	dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n",
   1832		__func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
   1833		offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count),
   1834		vers);
   1835
   1836	hdr->pgio_done_cb = ff_layout_write_done_cb;
   1837	refcount_inc(&ds->ds_clp->cl_count);
   1838	hdr->ds_clp = ds->ds_clp;
   1839	hdr->ds_commit_idx = idx;
   1840	fh = nfs4_ff_layout_select_ds_fh(mirror);
   1841	if (fh)
   1842		hdr->args.fh = fh;
   1843
   1844	nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid);
   1845
   1846	/*
   1847	 * Note that if we ever decide to split across DSes,
   1848	 * then we may need to handle dense-like offsets.
   1849	 */
   1850	hdr->args.offset = offset;
   1851
   1852	/* Perform an asynchronous write */
   1853	nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
   1854			  vers == 3 ? &ff_layout_write_call_ops_v3 :
   1855				      &ff_layout_write_call_ops_v4,
   1856			  sync, RPC_TASK_SOFTCONN);
   1857	put_cred(ds_cred);
   1858	return PNFS_ATTEMPTED;
   1859
   1860out_failed:
   1861	if (ff_layout_avoid_mds_available_ds(lseg))
   1862		return PNFS_TRY_AGAIN;
   1863	trace_pnfs_mds_fallback_write_pagelist(hdr->inode,
   1864			hdr->args.offset, hdr->args.count,
   1865			IOMODE_RW, NFS_I(hdr->inode)->layout, lseg);
   1866	return PNFS_NOT_ATTEMPTED;
   1867}
   1868
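        /*
         * Flexfiles commit buckets correspond 1:1 with mirrors, so the DS
         * index for a commit is simply the bucket index.
         */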
   1869static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
   1870{
   1871	return i;
   1872}
   1873
   1874static struct nfs_fh *
   1875select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
   1876{
   1877	struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
   1878
   1879	/* FIXME: Assume that there is only one NFS version available
   1880	 * for the DS.
   1881	 */
   1882	return &flseg->mirror_array[i]->fh_versions[0];
   1883}
   1884
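        /*
         * Send a COMMIT to a single data server.  On any setup failure the
         * queued requests are handed back for resending through the MDS and
         * -EAGAIN is returned.
         */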
   1885static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
   1886{
   1887	struct pnfs_layout_segment *lseg = data->lseg;
   1888	struct nfs4_pnfs_ds *ds;
   1889	struct rpc_clnt *ds_clnt;
   1890	struct nfs4_ff_layout_mirror *mirror;
   1891	const struct cred *ds_cred;
   1892	u32 idx;
   1893	int vers, ret;
   1894	struct nfs_fh *fh;
   1895
   1896	if (!lseg || !(pnfs_is_valid_lseg(lseg) ||
   1897	    test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)))
   1898		goto out_err;
   1899
   1900	idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
   1901	mirror = FF_LAYOUT_COMP(lseg, idx);
   1902	ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
   1903	if (!ds)
   1904		goto out_err;
   1905
   1906	ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
   1907						   data->inode);
   1908	if (IS_ERR(ds_clnt))
   1909		goto out_err;
   1910
   1911	ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred);
   1912	if (!ds_cred)
   1913		goto out_err;
   1914
   1915	vers = nfs4_ff_layout_ds_version(mirror);
   1916
   1917	dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
   1918		data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count),
   1919		vers);
   1920	data->commit_done_cb = ff_layout_commit_done_cb;
   1921	data->cred = ds_cred;
   1922	refcount_inc(&ds->ds_clp->cl_count);
   1923	data->ds_clp = ds->ds_clp;
   1924	fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
   1925	if (fh)
   1926		data->args.fh = fh;
   1927
   1928	ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
   1929				   vers == 3 ? &ff_layout_commit_call_ops_v3 :
   1930					       &ff_layout_commit_call_ops_v4,
   1931				   how, RPC_TASK_SOFTCONN);
   1932	put_cred(ds_cred);
   1933	return ret;
   1934out_err:
   1935	pnfs_generic_prepare_to_resend_writes(data);
   1936	pnfs_generic_commit_release(data);
   1937	return -EAGAIN;
   1938}
   1939
   1940static int
   1941ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
   1942			   int how, struct nfs_commit_info *cinfo)
   1943{
   1944	return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
   1945					    ff_layout_initiate_commit);
   1946}
   1947
   1948static struct pnfs_ds_commit_info *
   1949ff_layout_get_ds_info(struct inode *inode)
   1950{
   1951	struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
   1952
   1953	if (layout == NULL)
   1954		return NULL;
   1955
   1956	return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
   1957}
   1958
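        /*
         * The commit array is allocated outside the inode lock; if another
         * thread installed an array for this layout segment first,
         * pnfs_add_commit_array() returns the existing one and the fresh
         * allocation is freed.
         */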
   1959static void
   1960ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
   1961		struct pnfs_layout_segment *lseg)
   1962{
   1963	struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
   1964	struct inode *inode = lseg->pls_layout->plh_inode;
   1965	struct pnfs_commit_array *array, *new;
   1966
   1967	new = pnfs_alloc_commit_array(flseg->mirror_array_cnt,
   1968				      nfs_io_gfp_mask());
   1969	if (new) {
   1970		spin_lock(&inode->i_lock);
   1971		array = pnfs_add_commit_array(fl_cinfo, new, lseg);
   1972		spin_unlock(&inode->i_lock);
   1973		if (array != new)
   1974			pnfs_free_commit_array(new);
   1975	}
   1976}
   1977
   1978static void
   1979ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
   1980		struct inode *inode)
   1981{
   1982	spin_lock(&inode->i_lock);
   1983	pnfs_generic_ds_cinfo_destroy(fl_cinfo);
   1984	spin_unlock(&inode->i_lock);
   1985}
   1986
   1987static void
   1988ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
   1989{
   1990	nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
   1991						  id_node));
   1992}
   1993
   1994static int ff_layout_encode_ioerr(struct xdr_stream *xdr,
   1995				  const struct nfs4_layoutreturn_args *args,
   1996				  const struct nfs4_flexfile_layoutreturn_args *ff_args)
   1997{
   1998	__be32 *start;
   1999
   2000	start = xdr_reserve_space(xdr, 4);
   2001	if (unlikely(!start))
   2002		return -E2BIG;
   2003
   2004	*start = cpu_to_be32(ff_args->num_errors);
    2005	/* This assumes we always return _ALL_ layouts */
   2006	return ff_layout_encode_ds_ioerr(xdr, &ff_args->errors);
   2007}
   2008
   2009static void
   2010encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len)
   2011{
   2012	WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0);
   2013}
   2014
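        /*
         * Fixed-size head of an ff_iostats4 entry, encoded in order:
         *
         *	offset		8 bytes
         *	length		8 bytes
         *	stateid		NFS4_STATEID_SIZE (16) bytes
         *	read_count	8 bytes
         *	read_bytes	8 bytes
         *	write_count	8 bytes
         *	write_bytes	8 bytes
         *	deviceid	NFS4_DEVICEID4_SIZE (16) bytes
         */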
   2015static void
   2016ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr,
   2017			    const nfs4_stateid *stateid,
   2018			    const struct nfs42_layoutstat_devinfo *devinfo)
   2019{
   2020	__be32 *p;
   2021
   2022	p = xdr_reserve_space(xdr, 8 + 8);
   2023	p = xdr_encode_hyper(p, devinfo->offset);
   2024	p = xdr_encode_hyper(p, devinfo->length);
   2025	encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
   2026	p = xdr_reserve_space(xdr, 4*8);
   2027	p = xdr_encode_hyper(p, devinfo->read_count);
   2028	p = xdr_encode_hyper(p, devinfo->read_bytes);
   2029	p = xdr_encode_hyper(p, devinfo->write_count);
   2030	p = xdr_encode_hyper(p, devinfo->write_bytes);
   2031	encode_opaque_fixed(xdr, devinfo->dev_id.data, NFS4_DEVICEID4_SIZE);
   2032}
   2033
   2034static void
   2035ff_layout_encode_ff_iostat(struct xdr_stream *xdr,
   2036			    const nfs4_stateid *stateid,
   2037			    const struct nfs42_layoutstat_devinfo *devinfo)
   2038{
   2039	ff_layout_encode_ff_iostat_head(xdr, stateid, devinfo);
   2040	ff_layout_encode_ff_layoutupdate(xdr, devinfo,
   2041			devinfo->ld_private.data);
   2042}
   2043
    2044/* encode the iostats gathered for each device in this layout */
   2045static void ff_layout_encode_iostats_array(struct xdr_stream *xdr,
   2046		const struct nfs4_layoutreturn_args *args,
   2047		struct nfs4_flexfile_layoutreturn_args *ff_args)
   2048{
   2049	__be32 *p;
   2050	int i;
   2051
   2052	p = xdr_reserve_space(xdr, 4);
   2053	*p = cpu_to_be32(ff_args->num_dev);
   2054	for (i = 0; i < ff_args->num_dev; i++)
   2055		ff_layout_encode_ff_iostat(xdr,
   2056				&args->layout->plh_stateid,
   2057				&ff_args->devinfo[i]);
   2058}
   2059
   2060static void
   2061ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo *devinfo,
   2062		unsigned int num_entries)
   2063{
   2064	unsigned int i;
   2065
   2066	for (i = 0; i < num_entries; i++) {
   2067		if (!devinfo[i].ld_private.ops)
   2068			continue;
   2069		if (!devinfo[i].ld_private.ops->free)
   2070			continue;
   2071		devinfo[i].ld_private.ops->free(&devinfo[i].ld_private);
   2072	}
   2073}
   2074
   2075static struct nfs4_deviceid_node *
   2076ff_layout_alloc_deviceid_node(struct nfs_server *server,
   2077			      struct pnfs_device *pdev, gfp_t gfp_flags)
   2078{
   2079	struct nfs4_ff_layout_ds *dsaddr;
   2080
   2081	dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
   2082	if (!dsaddr)
   2083		return NULL;
   2084	return &dsaddr->id_node;
   2085}
   2086
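        /*
         * The LAYOUTRETURN body is an XDR opaque whose length must precede
         * its contents, so the ioerr and iostats arrays are encoded into a
         * scratch page first; the resulting length and page are then
         * spliced into the main stream with xdr_write_pages().
         */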
   2087static void
   2088ff_layout_encode_layoutreturn(struct xdr_stream *xdr,
   2089		const void *voidargs,
   2090		const struct nfs4_xdr_opaque_data *ff_opaque)
   2091{
   2092	const struct nfs4_layoutreturn_args *args = voidargs;
   2093	struct nfs4_flexfile_layoutreturn_args *ff_args = ff_opaque->data;
   2094	struct xdr_buf tmp_buf = {
   2095		.head = {
   2096			[0] = {
   2097				.iov_base = page_address(ff_args->pages[0]),
   2098			},
   2099		},
   2100		.buflen = PAGE_SIZE,
   2101	};
   2102	struct xdr_stream tmp_xdr;
   2103	__be32 *start;
   2104
   2105	dprintk("%s: Begin\n", __func__);
   2106
   2107	xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL);
   2108
   2109	ff_layout_encode_ioerr(&tmp_xdr, args, ff_args);
   2110	ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args);
   2111
   2112	start = xdr_reserve_space(xdr, 4);
   2113	*start = cpu_to_be32(tmp_buf.len);
   2114	xdr_write_pages(xdr, ff_args->pages, 0, tmp_buf.len);
   2115
   2116	dprintk("%s: Return\n", __func__);
   2117}
   2118
   2119static void
   2120ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args)
   2121{
   2122	struct nfs4_flexfile_layoutreturn_args *ff_args;
   2123
   2124	if (!args->data)
   2125		return;
   2126	ff_args = args->data;
   2127	args->data = NULL;
   2128
   2129	ff_layout_free_ds_ioerr(&ff_args->errors);
   2130	ff_layout_free_iostats_array(ff_args->devinfo, ff_args->num_dev);
   2131
   2132	put_page(ff_args->pages[0]);
   2133	kfree(ff_args);
   2134}
   2135
   2136static const struct nfs4_xdr_opaque_ops layoutreturn_ops = {
   2137	.encode = ff_layout_encode_layoutreturn,
   2138	.free = ff_layout_free_layoutreturn,
   2139};
   2140
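        /*
         * Gather up to FF_LAYOUTRETURN_MAXERR queued DS errors plus the
         * per-mirror iostats for this layout into ff_args; layoutreturn_ops
         * above encodes them into the LAYOUTRETURN and frees them when the
         * call completes.
         */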
   2141static int
   2142ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args)
   2143{
   2144	struct nfs4_flexfile_layoutreturn_args *ff_args;
   2145	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout);
   2146
   2147	ff_args = kmalloc(sizeof(*ff_args), nfs_io_gfp_mask());
   2148	if (!ff_args)
   2149		goto out_nomem;
   2150	ff_args->pages[0] = alloc_page(nfs_io_gfp_mask());
   2151	if (!ff_args->pages[0])
   2152		goto out_nomem_free;
   2153
   2154	INIT_LIST_HEAD(&ff_args->errors);
   2155	ff_args->num_errors = ff_layout_fetch_ds_ioerr(args->layout,
   2156			&args->range, &ff_args->errors,
   2157			FF_LAYOUTRETURN_MAXERR);
   2158
   2159	spin_lock(&args->inode->i_lock);
   2160	ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
   2161			&ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo));
   2162	spin_unlock(&args->inode->i_lock);
   2163
   2164	args->ld_private->ops = &layoutreturn_ops;
   2165	args->ld_private->data = ff_args;
   2166	return 0;
   2167out_nomem_free:
   2168	kfree(ff_args);
   2169out_nomem:
   2170	return -ENOMEM;
   2171}
   2172
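        /*
         * Flush queued DS I/O errors to the server in LAYOUTERROR calls of
         * at most NFS42_LAYOUTERROR_MAX entries each.  This is a no-op
         * unless the server advertises NFS_CAP_LAYOUTERROR.
         */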
   2173#ifdef CONFIG_NFS_V4_2
   2174void
   2175ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
   2176{
   2177	struct pnfs_layout_hdr *lo = lseg->pls_layout;
   2178	struct nfs42_layout_error *errors;
   2179	LIST_HEAD(head);
   2180
   2181	if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR))
   2182		return;
   2183	ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1);
   2184	if (list_empty(&head))
   2185		return;
   2186
   2187	errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, sizeof(*errors),
   2188			       nfs_io_gfp_mask());
   2189	if (errors != NULL) {
   2190		const struct nfs4_ff_layout_ds_err *pos;
   2191		size_t n = 0;
   2192
   2193		list_for_each_entry(pos, &head, list) {
   2194			errors[n].offset = pos->offset;
   2195			errors[n].length = pos->length;
   2196			nfs4_stateid_copy(&errors[n].stateid, &pos->stateid);
   2197			errors[n].errors[0].dev_id = pos->deviceid;
   2198			errors[n].errors[0].status = pos->status;
   2199			errors[n].errors[0].opnum = pos->opnum;
   2200			n++;
   2201			if (!list_is_last(&pos->list, &head) &&
   2202			    n < NFS42_LAYOUTERROR_MAX)
   2203				continue;
   2204			if (nfs42_proc_layouterror(lseg, errors, n) < 0)
   2205				break;
   2206			n = 0;
   2207		}
   2208		kfree(errors);
   2209	}
   2210	ff_layout_free_ds_ioerr(&head);
   2211}
   2212#else
   2213void
   2214ff_layout_send_layouterror(struct pnfs_layout_segment *lseg)
   2215{
   2216}
   2217#endif
   2218
   2219static int
   2220ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
   2221{
   2222	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
   2223
   2224	return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
   2225}
   2226
   2227static size_t
   2228ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
   2229			  const int buflen)
   2230{
   2231	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
   2232	const struct in6_addr *addr = &sin6->sin6_addr;
   2233
   2234	/*
   2235	 * RFC 4291, Section 2.2.2
   2236	 *
   2237	 * Shorthanded ANY address
   2238	 */
   2239	if (ipv6_addr_any(addr))
   2240		return snprintf(buf, buflen, "::");
   2241
   2242	/*
   2243	 * RFC 4291, Section 2.2.2
   2244	 *
   2245	 * Shorthanded loopback address
   2246	 */
   2247	if (ipv6_addr_loopback(addr))
   2248		return snprintf(buf, buflen, "::1");
   2249
   2250	/*
   2251	 * RFC 4291, Section 2.2.3
   2252	 *
   2253	 * Special presentation address format for mapped v4
   2254	 * addresses.
   2255	 */
   2256	if (ipv6_addr_v4mapped(addr))
   2257		return snprintf(buf, buflen, "::ffff:%pI4",
   2258					&addr->s6_addr32[3]);
   2259
   2260	/*
   2261	 * RFC 4291, Section 2.2.1
   2262	 */
   2263	return snprintf(buf, buflen, "%pI6c", addr);
   2264}
   2265
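        /*
         * The DS address goes out as a netid4/uaddr pair.  For example, a
         * hypothetical TCP data server at 192.0.2.53, port 2049, would be
         * encoded as:
         *
         *	netid: "tcp"
         *	uaddr: "192.0.2.53.8.1"	(2049 == 8 * 256 + 1)
         */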
   2266/* Derived from rpc_sockaddr2uaddr */
   2267static void
   2268ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
   2269{
   2270	struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
   2271	char portbuf[RPCBIND_MAXUADDRPLEN];
   2272	char addrbuf[RPCBIND_MAXUADDRLEN];
   2273	unsigned short port;
   2274	int len, netid_len;
   2275	__be32 *p;
   2276
   2277	switch (sap->sa_family) {
   2278	case AF_INET:
   2279		if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
   2280			return;
   2281		port = ntohs(((struct sockaddr_in *)sap)->sin_port);
   2282		break;
   2283	case AF_INET6:
   2284		if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
   2285			return;
   2286		port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
   2287		break;
   2288	default:
   2289		WARN_ON_ONCE(1);
   2290		return;
   2291	}
   2292
   2293	snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
   2294	len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
   2295
   2296	netid_len = strlen(da->da_netid);
   2297	p = xdr_reserve_space(xdr, 4 + netid_len);
   2298	xdr_encode_opaque(p, da->da_netid, netid_len);
   2299
   2300	p = xdr_reserve_space(xdr, 4 + len);
   2301	xdr_encode_opaque(p, addrbuf, len);
   2302}
   2303
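        /*
         * An nfstime4 occupies 12 bytes on the wire: a 64-bit seconds
         * field followed by a 32-bit nanoseconds field.
         */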
   2304static void
   2305ff_layout_encode_nfstime(struct xdr_stream *xdr,
   2306			 ktime_t t)
   2307{
   2308	struct timespec64 ts;
   2309	__be32 *p;
   2310
   2311	p = xdr_reserve_space(xdr, 12);
   2312	ts = ktime_to_timespec64(t);
   2313	p = xdr_encode_hyper(p, ts.tv_sec);
   2314	*p++ = cpu_to_be32(ts.tv_nsec);
   2315}
   2316
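        /*
         * An ff_io_latency4: five 64-bit counters followed by two nfstime4
         * fields (total busy time and aggregate completion time).
         */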
   2317static void
   2318ff_layout_encode_io_latency(struct xdr_stream *xdr,
   2319			    struct nfs4_ff_io_stat *stat)
   2320{
   2321	__be32 *p;
   2322
   2323	p = xdr_reserve_space(xdr, 5 * 8);
   2324	p = xdr_encode_hyper(p, stat->ops_requested);
   2325	p = xdr_encode_hyper(p, stat->bytes_requested);
   2326	p = xdr_encode_hyper(p, stat->ops_completed);
   2327	p = xdr_encode_hyper(p, stat->bytes_completed);
   2328	p = xdr_encode_hyper(p, stat->bytes_not_delivered);
   2329	ff_layout_encode_nfstime(xdr, stat->total_busy_time);
   2330	ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
   2331}
   2332
   2333static void
   2334ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr,
   2335			      const struct nfs42_layoutstat_devinfo *devinfo,
   2336			      struct nfs4_ff_layout_mirror *mirror)
   2337{
   2338	struct nfs4_pnfs_ds_addr *da;
   2339	struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
   2340	struct nfs_fh *fh = &mirror->fh_versions[0];
   2341	__be32 *p;
   2342
   2343	da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
   2344	dprintk("%s: DS %s: encoding address %s\n",
   2345		__func__, ds->ds_remotestr, da->da_remotestr);
   2346	/* netaddr4 */
   2347	ff_layout_encode_netaddr(xdr, da);
   2348	/* nfs_fh4 */
   2349	p = xdr_reserve_space(xdr, 4 + fh->size);
   2350	xdr_encode_opaque(p, fh->data, fh->size);
   2351	/* ff_io_latency4 read */
   2352	spin_lock(&mirror->lock);
   2353	ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
   2354	/* ff_io_latency4 write */
   2355	ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
   2356	spin_unlock(&mirror->lock);
   2357	/* nfstime4 */
   2358	ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
   2359	/* bool */
   2360	p = xdr_reserve_space(xdr, 4);
   2361	*p = cpu_to_be32(false);
   2362}
   2363
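        /*
         * Encode the layoutupdate4 body, then backfill its length: after
         * encoding, xdr->p points one word past the data, so the payload
         * size in bytes is (xdr->p - start - 1) * 4.
         */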
   2364static void
   2365ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args,
   2366			     const struct nfs4_xdr_opaque_data *opaque)
   2367{
   2368	struct nfs42_layoutstat_devinfo *devinfo = container_of(opaque,
   2369			struct nfs42_layoutstat_devinfo, ld_private);
   2370	__be32 *start;
   2371
   2372	/* layoutupdate length */
   2373	start = xdr_reserve_space(xdr, 4);
   2374	ff_layout_encode_ff_layoutupdate(xdr, devinfo, opaque->data);
   2375
   2376	*start = cpu_to_be32((xdr->p - start - 1) * 4);
   2377}
   2378
   2379static void
   2380ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque)
   2381{
   2382	struct nfs4_ff_layout_mirror *mirror = opaque->data;
   2383
   2384	ff_layout_put_mirror(mirror);
   2385}
   2386
   2387static const struct nfs4_xdr_opaque_ops layoutstat_ops = {
   2388	.encode = ff_layout_encode_layoutstats,
   2389	.free	= ff_layout_free_layoutstats,
   2390};
   2391
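        /*
         * Snapshot the iostat counters of every mirror with fresh data
         * (NFS4_FF_MIRROR_STAT_AVAIL) into @devinfo, filling at most
         * @dev_limit entries.  Each entry pins a mirror reference that
         * layoutstat_ops.free drops once the stats have been sent.
         */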
   2392static int
   2393ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
   2394			       struct nfs42_layoutstat_devinfo *devinfo,
   2395			       int dev_limit)
   2396{
   2397	struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
   2398	struct nfs4_ff_layout_mirror *mirror;
   2399	struct nfs4_deviceid_node *dev;
   2400	int i = 0;
   2401
   2402	list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
   2403		if (i >= dev_limit)
   2404			break;
   2405		if (IS_ERR_OR_NULL(mirror->mirror_ds))
   2406			continue;
   2407		if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags))
   2408			continue;
    2409		/* mirror refcount put in ff_layout_free_layoutstats */
   2410		if (!refcount_inc_not_zero(&mirror->ref))
   2411			continue;
    2412		dev = &mirror->mirror_ds->id_node;
   2413		memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
   2414		devinfo->offset = 0;
   2415		devinfo->length = NFS4_MAX_UINT64;
   2416		spin_lock(&mirror->lock);
   2417		devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
   2418		devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
   2419		devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
   2420		devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
   2421		spin_unlock(&mirror->lock);
   2422		devinfo->layout_type = LAYOUT_FLEX_FILES;
   2423		devinfo->ld_private.ops = &layoutstat_ops;
   2424		devinfo->ld_private.data = mirror;
   2425
   2426		devinfo++;
   2427		i++;
   2428	}
   2429	return i;
   2430}
   2431
   2432static int
   2433ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
   2434{
   2435	struct nfs4_flexfile_layout *ff_layout;
   2436	const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
   2437
   2438	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
   2439	args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo),
   2440				      nfs_io_gfp_mask());
   2441	if (!args->devinfo)
   2442		return -ENOMEM;
   2443
   2444	spin_lock(&args->inode->i_lock);
   2445	ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
   2446	args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
   2447			&args->devinfo[0], dev_count);
   2448	spin_unlock(&args->inode->i_lock);
   2449	if (!args->num_dev) {
   2450		kfree(args->devinfo);
   2451		args->devinfo = NULL;
   2452		return -ENOENT;
   2453	}
   2454
   2455	return 0;
   2456}
   2457
   2458static int
   2459ff_layout_set_layoutdriver(struct nfs_server *server,
   2460		const struct nfs_fh *dummy)
   2461{
   2462#if IS_ENABLED(CONFIG_NFS_V4_2)
   2463	server->caps |= NFS_CAP_LAYOUTSTATS;
   2464#endif
   2465	return 0;
   2466}
   2467
   2468static const struct pnfs_commit_ops ff_layout_commit_ops = {
   2469	.setup_ds_info		= ff_layout_setup_ds_info,
   2470	.release_ds_info	= ff_layout_release_ds_info,
   2471	.mark_request_commit	= pnfs_layout_mark_request_commit,
   2472	.clear_request_commit	= pnfs_generic_clear_request_commit,
   2473	.scan_commit_lists	= pnfs_generic_scan_commit_lists,
   2474	.recover_commit_reqs	= pnfs_generic_recover_commit_reqs,
   2475	.commit_pagelist	= ff_layout_commit_pagelist,
   2476};
   2477
   2478static struct pnfs_layoutdriver_type flexfilelayout_type = {
   2479	.id			= LAYOUT_FLEX_FILES,
   2480	.name			= "LAYOUT_FLEX_FILES",
   2481	.owner			= THIS_MODULE,
   2482	.flags			= PNFS_LAYOUTGET_ON_OPEN,
   2483	.max_layoutget_response	= 4096, /* 1 page or so... */
   2484	.set_layoutdriver	= ff_layout_set_layoutdriver,
   2485	.alloc_layout_hdr	= ff_layout_alloc_layout_hdr,
   2486	.free_layout_hdr	= ff_layout_free_layout_hdr,
   2487	.alloc_lseg		= ff_layout_alloc_lseg,
   2488	.free_lseg		= ff_layout_free_lseg,
   2489	.add_lseg		= ff_layout_add_lseg,
   2490	.pg_read_ops		= &ff_layout_pg_read_ops,
   2491	.pg_write_ops		= &ff_layout_pg_write_ops,
   2492	.get_ds_info		= ff_layout_get_ds_info,
   2493	.free_deviceid_node	= ff_layout_free_deviceid_node,
   2494	.read_pagelist		= ff_layout_read_pagelist,
   2495	.write_pagelist		= ff_layout_write_pagelist,
   2496	.alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
   2497	.prepare_layoutreturn   = ff_layout_prepare_layoutreturn,
   2498	.sync			= pnfs_nfs_generic_sync,
   2499	.prepare_layoutstats	= ff_layout_prepare_layoutstats,
   2500};
   2501
   2502static int __init nfs4flexfilelayout_init(void)
   2503{
   2504	printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
   2505	       __func__);
   2506	return pnfs_register_layoutdriver(&flexfilelayout_type);
   2507}
   2508
   2509static void __exit nfs4flexfilelayout_exit(void)
   2510{
   2511	printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
   2512	       __func__);
   2513	pnfs_unregister_layoutdriver(&flexfilelayout_type);
   2514}
   2515
   2516MODULE_ALIAS("nfs-layouttype4-4");
   2517
   2518MODULE_LICENSE("GPL");
   2519MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
   2520
   2521module_init(nfs4flexfilelayout_init);
   2522module_exit(nfs4flexfilelayout_exit);
   2523
   2524module_param(io_maxretrans, ushort, 0644);
    2525MODULE_PARM_DESC(io_maxretrans, "The number of times the NFSv4.1 client "
    2526			"retries an I/O request before returning an error.");