cachepc-linux

Fork of AMDESE/linux with modifications for the CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

inode.c (70255B)


      1// SPDX-License-Identifier: GPL-2.0
      2#include <linux/ceph/ceph_debug.h>
      3
      4#include <linux/module.h>
      5#include <linux/fs.h>
      6#include <linux/slab.h>
      7#include <linux/string.h>
      8#include <linux/uaccess.h>
      9#include <linux/kernel.h>
     10#include <linux/writeback.h>
     11#include <linux/vmalloc.h>
     12#include <linux/xattr.h>
     13#include <linux/posix_acl.h>
     14#include <linux/random.h>
     15#include <linux/sort.h>
     16#include <linux/iversion.h>
     17
     18#include "super.h"
     19#include "mds_client.h"
     20#include "cache.h"
     21#include <linux/ceph/decode.h>
     22
     23/*
     24 * Ceph inode operations
     25 *
     26 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
     27 * setattr, etc.), xattr helpers, and helpers for assimilating
     28 * metadata returned by the MDS into our cache.
     29 *
     30 * Also define helpers for doing asynchronous writeback, invalidation,
     31 * and truncation for the benefit of those who can't afford to block
     32 * (typically because they are in the message handler path).
     33 */
     34
     35static const struct inode_operations ceph_symlink_iops;
     36
     37static void ceph_inode_work(struct work_struct *work);
     38
     39/*
     40 * find or create an inode, given the ceph ino number
     41 */
     42static int ceph_set_ino_cb(struct inode *inode, void *data)
     43{
     44	struct ceph_inode_info *ci = ceph_inode(inode);
     45	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
     46
     47	ci->i_vino = *(struct ceph_vino *)data;
     48	inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
     49	inode_set_iversion_raw(inode, 0);
     50	percpu_counter_inc(&mdsc->metric.total_inodes);
     51
     52	return 0;
     53}
     54
     55struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
     56{
     57	struct inode *inode;
     58
     59	if (ceph_vino_is_reserved(vino))
     60		return ERR_PTR(-EREMOTEIO);
     61
     62	inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
     63			     ceph_set_ino_cb, &vino);
     64	if (!inode)
     65		return ERR_PTR(-ENOMEM);
     66
     67	dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
     68	     ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
     69	return inode;
     70}
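/*
 * Aside: ceph_get_inode() above is a find-or-create lookup.  Roughly,
 * iget5_locked() consults a compare callback (ceph_ino_compare, matching
 * the full ceph_vino) and an init callback (ceph_set_ino_cb) that runs
 * once for a freshly allocated inode before I_NEW is cleared.  A minimal
 * userspace sketch of that callback pattern; all demo_* names are
 * hypothetical and this is not kernel code:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct demo_vino { unsigned long long ino, snap; };
struct demo_inode { struct demo_vino vino; int is_new; };

#define DEMO_CACHE_SIZE 8
static struct demo_inode *demo_cache[DEMO_CACHE_SIZE];

/* find-or-create: cmp decides a match, set initializes a new entry */
static struct demo_inode *demo_iget(struct demo_vino *key,
		int (*cmp)(struct demo_inode *, void *),
		int (*set)(struct demo_inode *, void *))
{
	int i;

	for (i = 0; i < DEMO_CACHE_SIZE; i++)
		if (demo_cache[i] && cmp(demo_cache[i], key))
			return demo_cache[i];	/* existing, not "new" */
	for (i = 0; i < DEMO_CACHE_SIZE; i++) {
		if (demo_cache[i])
			continue;
		demo_cache[i] = calloc(1, sizeof(*demo_cache[i]));
		if (!demo_cache[i])
			return NULL;		/* cf. -ENOMEM */
		if (set(demo_cache[i], key)) {
			free(demo_cache[i]);
			demo_cache[i] = NULL;
			return NULL;
		}
		demo_cache[i]->is_new = 1;	/* cf. I_NEW */
		return demo_cache[i];
	}
	return NULL;
}

static int demo_cmp(struct demo_inode *in, void *data)
{
	return !memcmp(&in->vino, data, sizeof(struct demo_vino));
}

static int demo_set(struct demo_inode *in, void *data)
{
	in->vino = *(struct demo_vino *)data;	/* cf. ceph_set_ino_cb */
	return 0;
}

int main(void)
{
	struct demo_vino v = { .ino = 0x1000ULL, .snap = ~0ULL };
	struct demo_inode *in = demo_iget(&v, demo_cmp, demo_set);

	printf("got %p new %d\n", (void *)in, in ? in->is_new : 0);
	return 0;
}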
     71
     72/*
      73 * get/construct snapdir inode for a given directory
     74 */
     75struct inode *ceph_get_snapdir(struct inode *parent)
     76{
     77	struct ceph_vino vino = {
     78		.ino = ceph_ino(parent),
     79		.snap = CEPH_SNAPDIR,
     80	};
     81	struct inode *inode = ceph_get_inode(parent->i_sb, vino);
     82	struct ceph_inode_info *ci = ceph_inode(inode);
     83
     84	if (IS_ERR(inode))
     85		return inode;
     86
     87	if (!S_ISDIR(parent->i_mode)) {
     88		pr_warn_once("bad snapdir parent type (mode=0%o)\n",
     89			     parent->i_mode);
     90		goto err;
     91	}
     92
     93	if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
     94		pr_warn_once("bad snapdir inode type (mode=0%o)\n",
     95			     inode->i_mode);
     96		goto err;
     97	}
     98
     99	inode->i_mode = parent->i_mode;
    100	inode->i_uid = parent->i_uid;
    101	inode->i_gid = parent->i_gid;
    102	inode->i_mtime = parent->i_mtime;
    103	inode->i_ctime = parent->i_ctime;
    104	inode->i_atime = parent->i_atime;
    105	ci->i_rbytes = 0;
    106	ci->i_btime = ceph_inode(parent)->i_btime;
    107
    108	if (inode->i_state & I_NEW) {
    109		inode->i_op = &ceph_snapdir_iops;
    110		inode->i_fop = &ceph_snapdir_fops;
    111		ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
    112		unlock_new_inode(inode);
    113	}
    114
    115	return inode;
    116err:
    117	if ((inode->i_state & I_NEW))
    118		discard_new_inode(inode);
    119	else
    120		iput(inode);
    121	return ERR_PTR(-ENOTDIR);
    122}
    123
    124const struct inode_operations ceph_file_iops = {
    125	.permission = ceph_permission,
    126	.setattr = ceph_setattr,
    127	.getattr = ceph_getattr,
    128	.listxattr = ceph_listxattr,
    129	.get_acl = ceph_get_acl,
    130	.set_acl = ceph_set_acl,
    131};
    132
    133
    134/*
    135 * We use a 'frag tree' to keep track of the MDS's directory fragments
    136 * for a given inode (usually there is just a single fragment).  We
    137 * need to know when a child frag is delegated to a new MDS, or when
    138 * it is flagged as replicated, so we can direct our requests
    139 * accordingly.
    140 */
    141
    142/*
    143 * find/create a frag in the tree
    144 */
    145static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
    146						    u32 f)
    147{
    148	struct rb_node **p;
    149	struct rb_node *parent = NULL;
    150	struct ceph_inode_frag *frag;
    151	int c;
    152
    153	p = &ci->i_fragtree.rb_node;
    154	while (*p) {
    155		parent = *p;
    156		frag = rb_entry(parent, struct ceph_inode_frag, node);
    157		c = ceph_frag_compare(f, frag->frag);
    158		if (c < 0)
    159			p = &(*p)->rb_left;
    160		else if (c > 0)
    161			p = &(*p)->rb_right;
    162		else
    163			return frag;
    164	}
    165
    166	frag = kmalloc(sizeof(*frag), GFP_NOFS);
    167	if (!frag)
    168		return ERR_PTR(-ENOMEM);
    169
    170	frag->frag = f;
    171	frag->split_by = 0;
    172	frag->mds = -1;
    173	frag->ndist = 0;
    174
    175	rb_link_node(&frag->node, parent, p);
    176	rb_insert_color(&frag->node, &ci->i_fragtree);
    177
    178	dout("get_or_create_frag added %llx.%llx frag %x\n",
    179	     ceph_vinop(&ci->netfs.inode), f);
    180	return frag;
    181}
    182
    183/*
    184 * find a specific frag @f
    185 */
    186struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
    187{
    188	struct rb_node *n = ci->i_fragtree.rb_node;
    189
    190	while (n) {
    191		struct ceph_inode_frag *frag =
    192			rb_entry(n, struct ceph_inode_frag, node);
    193		int c = ceph_frag_compare(f, frag->frag);
    194		if (c < 0)
    195			n = n->rb_left;
    196		else if (c > 0)
    197			n = n->rb_right;
    198		else
    199			return frag;
    200	}
    201	return NULL;
    202}
    203
    204/*
    205 * Choose frag containing the given value @v.  If @pfrag is
    206 * specified, copy the frag delegation info to the caller if
    207 * it is present.
    208 */
    209static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
    210			      struct ceph_inode_frag *pfrag, int *found)
    211{
    212	u32 t = ceph_frag_make(0, 0);
    213	struct ceph_inode_frag *frag;
    214	unsigned nway, i;
    215	u32 n;
    216
    217	if (found)
    218		*found = 0;
    219
    220	while (1) {
    221		WARN_ON(!ceph_frag_contains_value(t, v));
    222		frag = __ceph_find_frag(ci, t);
    223		if (!frag)
    224			break; /* t is a leaf */
    225		if (frag->split_by == 0) {
    226			if (pfrag)
    227				memcpy(pfrag, frag, sizeof(*pfrag));
    228			if (found)
    229				*found = 1;
    230			break;
    231		}
    232
    233		/* choose child */
    234		nway = 1 << frag->split_by;
    235		dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
    236		     frag->split_by, nway);
    237		for (i = 0; i < nway; i++) {
    238			n = ceph_frag_make_child(t, frag->split_by, i);
    239			if (ceph_frag_contains_value(n, v)) {
    240				t = n;
    241				break;
    242			}
    243		}
    244		BUG_ON(i == nway);
    245	}
    246	dout("choose_frag(%x) = %x\n", v, t);
    247
    248	return t;
    249}
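/*
 * Aside: a frag packs a split depth and a value prefix into a single
 * u32.  Assuming the 8/24-bit layout from include/linux/ceph/ceph_frag.h
 * (top 8 bits = "bits", low 24 bits = value prefix), __ceph_choose_frag()
 * above walks from the root frag (0 bits) toward the leaf whose prefix
 * contains v, via frag_make_child()/frag_contains_value().  A standalone
 * userspace sketch of that encoding:
 */
#include <stdio.h>

typedef unsigned int u32;

static u32 frag_make(u32 b, u32 v)
{
	return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
}
static u32 frag_bits(u32 f)  { return f >> 24; }
static u32 frag_value(u32 f) { return f & 0xffffffu; }
static u32 frag_mask(u32 f)
{
	return (0xffffffu << (24 - frag_bits(f))) & 0xffffffu;
}
static int frag_contains_value(u32 f, u32 v)
{
	return (v & frag_mask(f)) == frag_value(f);
}
static u32 frag_make_child(u32 f, int by, int i)
{
	int bits = frag_bits(f) + by;
	return frag_make(bits, frag_value(f) | (i << (24 - bits)));
}

int main(void)
{
	u32 root = frag_make(0, 0);	/* covers the whole 24-bit space */
	u32 v = 0xabcdef;		/* e.g. a hashed dentry name */
	int i;

	/* one 2-way split of the root: children 0* and 1* */
	for (i = 0; i < 2; i++) {
		u32 child = frag_make_child(root, 1, i);
		printf("child %08x contains %06x: %d\n",
		       child, v, frag_contains_value(child, v));
	}
	return 0;	/* prints 0 for child 0, 1 for child 1 */
}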
    250
    251u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
    252		     struct ceph_inode_frag *pfrag, int *found)
    253{
    254	u32 ret;
    255	mutex_lock(&ci->i_fragtree_mutex);
    256	ret = __ceph_choose_frag(ci, v, pfrag, found);
    257	mutex_unlock(&ci->i_fragtree_mutex);
    258	return ret;
    259}
    260
    261/*
    262 * Process dirfrag (delegation) info from the mds.  Include leaf
    263 * fragment in tree ONLY if ndist > 0.  Otherwise, only
     264 * branches/splits are included in i_fragtree.
    265 */
    266static int ceph_fill_dirfrag(struct inode *inode,
    267			     struct ceph_mds_reply_dirfrag *dirinfo)
    268{
    269	struct ceph_inode_info *ci = ceph_inode(inode);
    270	struct ceph_inode_frag *frag;
    271	u32 id = le32_to_cpu(dirinfo->frag);
    272	int mds = le32_to_cpu(dirinfo->auth);
    273	int ndist = le32_to_cpu(dirinfo->ndist);
    274	int diri_auth = -1;
    275	int i;
    276	int err = 0;
    277
    278	spin_lock(&ci->i_ceph_lock);
    279	if (ci->i_auth_cap)
    280		diri_auth = ci->i_auth_cap->mds;
    281	spin_unlock(&ci->i_ceph_lock);
    282
    283	if (mds == -1) /* CDIR_AUTH_PARENT */
    284		mds = diri_auth;
    285
    286	mutex_lock(&ci->i_fragtree_mutex);
    287	if (ndist == 0 && mds == diri_auth) {
    288		/* no delegation info needed. */
    289		frag = __ceph_find_frag(ci, id);
    290		if (!frag)
    291			goto out;
    292		if (frag->split_by == 0) {
    293			/* tree leaf, remove */
    294			dout("fill_dirfrag removed %llx.%llx frag %x"
    295			     " (no ref)\n", ceph_vinop(inode), id);
    296			rb_erase(&frag->node, &ci->i_fragtree);
    297			kfree(frag);
    298		} else {
    299			/* tree branch, keep and clear */
    300			dout("fill_dirfrag cleared %llx.%llx frag %x"
    301			     " referral\n", ceph_vinop(inode), id);
    302			frag->mds = -1;
    303			frag->ndist = 0;
    304		}
    305		goto out;
    306	}
    307
    308
    309	/* find/add this frag to store mds delegation info */
    310	frag = __get_or_create_frag(ci, id);
    311	if (IS_ERR(frag)) {
    312		/* this is not the end of the world; we can continue
    313		   with bad/inaccurate delegation info */
    314		pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
    315		       ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
    316		err = -ENOMEM;
    317		goto out;
    318	}
    319
    320	frag->mds = mds;
    321	frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
    322	for (i = 0; i < frag->ndist; i++)
    323		frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
    324	dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
    325	     ceph_vinop(inode), frag->frag, frag->ndist);
    326
    327out:
    328	mutex_unlock(&ci->i_fragtree_mutex);
    329	return err;
    330}
    331
    332static int frag_tree_split_cmp(const void *l, const void *r)
    333{
    334	struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
    335	struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
    336	return ceph_frag_compare(le32_to_cpu(ls->frag),
    337				 le32_to_cpu(rs->frag));
    338}
    339
    340static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
    341{
    342	if (!frag)
    343		return f == ceph_frag_make(0, 0);
    344	if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by)
    345		return false;
    346	return ceph_frag_contains_value(frag->frag, ceph_frag_value(f));
    347}
    348
    349static int ceph_fill_fragtree(struct inode *inode,
    350			      struct ceph_frag_tree_head *fragtree,
    351			      struct ceph_mds_reply_dirfrag *dirinfo)
    352{
    353	struct ceph_inode_info *ci = ceph_inode(inode);
    354	struct ceph_inode_frag *frag, *prev_frag = NULL;
    355	struct rb_node *rb_node;
    356	unsigned i, split_by, nsplits;
    357	u32 id;
    358	bool update = false;
    359
    360	mutex_lock(&ci->i_fragtree_mutex);
    361	nsplits = le32_to_cpu(fragtree->nsplits);
    362	if (nsplits != ci->i_fragtree_nsplits) {
    363		update = true;
    364	} else if (nsplits) {
    365		i = prandom_u32() % nsplits;
    366		id = le32_to_cpu(fragtree->splits[i].frag);
    367		if (!__ceph_find_frag(ci, id))
    368			update = true;
    369	} else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
    370		rb_node = rb_first(&ci->i_fragtree);
    371		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
    372		if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
    373			update = true;
    374	}
    375	if (!update && dirinfo) {
    376		id = le32_to_cpu(dirinfo->frag);
    377		if (id != __ceph_choose_frag(ci, id, NULL, NULL))
    378			update = true;
    379	}
    380	if (!update)
    381		goto out_unlock;
    382
    383	if (nsplits > 1) {
    384		sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]),
    385		     frag_tree_split_cmp, NULL);
    386	}
    387
    388	dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
    389	rb_node = rb_first(&ci->i_fragtree);
    390	for (i = 0; i < nsplits; i++) {
    391		id = le32_to_cpu(fragtree->splits[i].frag);
    392		split_by = le32_to_cpu(fragtree->splits[i].by);
    393		if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) {
    394			pr_err("fill_fragtree %llx.%llx invalid split %d/%u, "
    395			       "frag %x split by %d\n", ceph_vinop(inode),
    396			       i, nsplits, id, split_by);
    397			continue;
    398		}
    399		frag = NULL;
    400		while (rb_node) {
    401			frag = rb_entry(rb_node, struct ceph_inode_frag, node);
    402			if (ceph_frag_compare(frag->frag, id) >= 0) {
    403				if (frag->frag != id)
    404					frag = NULL;
    405				else
    406					rb_node = rb_next(rb_node);
    407				break;
    408			}
    409			rb_node = rb_next(rb_node);
    410			/* delete stale split/leaf node */
    411			if (frag->split_by > 0 ||
    412			    !is_frag_child(frag->frag, prev_frag)) {
    413				rb_erase(&frag->node, &ci->i_fragtree);
    414				if (frag->split_by > 0)
    415					ci->i_fragtree_nsplits--;
    416				kfree(frag);
    417			}
    418			frag = NULL;
    419		}
    420		if (!frag) {
    421			frag = __get_or_create_frag(ci, id);
    422			if (IS_ERR(frag))
    423				continue;
    424		}
    425		if (frag->split_by == 0)
    426			ci->i_fragtree_nsplits++;
    427		frag->split_by = split_by;
    428		dout(" frag %x split by %d\n", frag->frag, frag->split_by);
    429		prev_frag = frag;
    430	}
    431	while (rb_node) {
    432		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
    433		rb_node = rb_next(rb_node);
    434		/* delete stale split/leaf node */
    435		if (frag->split_by > 0 ||
    436		    !is_frag_child(frag->frag, prev_frag)) {
    437			rb_erase(&frag->node, &ci->i_fragtree);
    438			if (frag->split_by > 0)
    439				ci->i_fragtree_nsplits--;
    440			kfree(frag);
    441		}
    442	}
    443out_unlock:
    444	mutex_unlock(&ci->i_fragtree_mutex);
    445	return 0;
    446}
    447
    448/*
    449 * initialize a newly allocated inode.
    450 */
    451struct inode *ceph_alloc_inode(struct super_block *sb)
    452{
    453	struct ceph_inode_info *ci;
    454	int i;
    455
    456	ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
    457	if (!ci)
    458		return NULL;
    459
    460	dout("alloc_inode %p\n", &ci->netfs.inode);
    461
    462	/* Set parameters for the netfs library */
    463	netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
    464
    465	spin_lock_init(&ci->i_ceph_lock);
    466
    467	ci->i_version = 0;
    468	ci->i_inline_version = 0;
    469	ci->i_time_warp_seq = 0;
    470	ci->i_ceph_flags = 0;
    471	atomic64_set(&ci->i_ordered_count, 1);
    472	atomic64_set(&ci->i_release_count, 1);
    473	atomic64_set(&ci->i_complete_seq[0], 0);
    474	atomic64_set(&ci->i_complete_seq[1], 0);
    475	ci->i_symlink = NULL;
    476
    477	ci->i_max_bytes = 0;
    478	ci->i_max_files = 0;
    479
    480	memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
    481	memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
    482	RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
    483
    484	ci->i_fragtree = RB_ROOT;
    485	mutex_init(&ci->i_fragtree_mutex);
    486
    487	ci->i_xattrs.blob = NULL;
    488	ci->i_xattrs.prealloc_blob = NULL;
    489	ci->i_xattrs.dirty = false;
    490	ci->i_xattrs.index = RB_ROOT;
    491	ci->i_xattrs.count = 0;
    492	ci->i_xattrs.names_size = 0;
    493	ci->i_xattrs.vals_size = 0;
    494	ci->i_xattrs.version = 0;
    495	ci->i_xattrs.index_version = 0;
    496
    497	ci->i_caps = RB_ROOT;
    498	ci->i_auth_cap = NULL;
    499	ci->i_dirty_caps = 0;
    500	ci->i_flushing_caps = 0;
    501	INIT_LIST_HEAD(&ci->i_dirty_item);
    502	INIT_LIST_HEAD(&ci->i_flushing_item);
    503	ci->i_prealloc_cap_flush = NULL;
    504	INIT_LIST_HEAD(&ci->i_cap_flush_list);
    505	init_waitqueue_head(&ci->i_cap_wq);
    506	ci->i_hold_caps_max = 0;
    507	INIT_LIST_HEAD(&ci->i_cap_delay_list);
    508	INIT_LIST_HEAD(&ci->i_cap_snaps);
    509	ci->i_head_snapc = NULL;
    510	ci->i_snap_caps = 0;
    511
    512	ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
    513	for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
    514		ci->i_nr_by_mode[i] = 0;
    515
    516	mutex_init(&ci->i_truncate_mutex);
    517	ci->i_truncate_seq = 0;
    518	ci->i_truncate_size = 0;
    519	ci->i_truncate_pending = 0;
    520
    521	ci->i_max_size = 0;
    522	ci->i_reported_size = 0;
    523	ci->i_wanted_max_size = 0;
    524	ci->i_requested_max_size = 0;
    525
    526	ci->i_pin_ref = 0;
    527	ci->i_rd_ref = 0;
    528	ci->i_rdcache_ref = 0;
    529	ci->i_wr_ref = 0;
    530	ci->i_wb_ref = 0;
    531	ci->i_fx_ref = 0;
    532	ci->i_wrbuffer_ref = 0;
    533	ci->i_wrbuffer_ref_head = 0;
    534	atomic_set(&ci->i_filelock_ref, 0);
    535	atomic_set(&ci->i_shared_gen, 1);
    536	ci->i_rdcache_gen = 0;
    537	ci->i_rdcache_revoking = 0;
    538
    539	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
    540	INIT_LIST_HEAD(&ci->i_unsafe_iops);
    541	spin_lock_init(&ci->i_unsafe_lock);
    542
    543	ci->i_snap_realm = NULL;
    544	INIT_LIST_HEAD(&ci->i_snap_realm_item);
    545	INIT_LIST_HEAD(&ci->i_snap_flush_item);
    546
    547	INIT_WORK(&ci->i_work, ceph_inode_work);
    548	ci->i_work_mask = 0;
    549	memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
    550	return &ci->netfs.inode;
    551}
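/*
 * Aside: ceph_alloc_inode() hands the VFS a struct inode that is
 * embedded inside ceph_inode_info (as ci->netfs.inode), so ceph_inode()
 * can recover the container from a plain inode pointer with
 * container_of().  A userspace sketch of that embedding trick; the
 * demo_* names are hypothetical:
 */
#include <stddef.h>
#include <stdio.h>

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_vfs_inode { unsigned long i_ino; };
struct demo_ceph_inode {
	unsigned long long i_version;	/* fs-private state ... */
	struct demo_vfs_inode vfs;	/* ... plus the embedded inode */
};

int main(void)
{
	struct demo_ceph_inode ci = { .i_version = 7, .vfs.i_ino = 42 };
	struct demo_vfs_inode *inode = &ci.vfs;	/* what the VFS hands us */

	/* the equivalent of ceph_inode(inode) */
	struct demo_ceph_inode *back =
		demo_container_of(inode, struct demo_ceph_inode, vfs);

	printf("ino %lu version %llu\n", inode->i_ino, back->i_version);
	return 0;
}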
    552
    553void ceph_free_inode(struct inode *inode)
    554{
    555	struct ceph_inode_info *ci = ceph_inode(inode);
    556
    557	kfree(ci->i_symlink);
    558	kmem_cache_free(ceph_inode_cachep, ci);
    559}
    560
    561void ceph_evict_inode(struct inode *inode)
    562{
    563	struct ceph_inode_info *ci = ceph_inode(inode);
    564	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
    565	struct ceph_inode_frag *frag;
    566	struct rb_node *n;
    567
    568	dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
    569
    570	percpu_counter_dec(&mdsc->metric.total_inodes);
    571
    572	truncate_inode_pages_final(&inode->i_data);
    573	if (inode->i_state & I_PINNING_FSCACHE_WB)
    574		ceph_fscache_unuse_cookie(inode, true);
    575	clear_inode(inode);
    576
    577	ceph_fscache_unregister_inode_cookie(ci);
    578
    579	__ceph_remove_caps(ci);
    580
    581	if (__ceph_has_quota(ci, QUOTA_GET_ANY))
    582		ceph_adjust_quota_realms_count(inode, false);
    583
    584	/*
    585	 * we may still have a snap_realm reference if there are stray
    586	 * caps in i_snap_caps.
    587	 */
    588	if (ci->i_snap_realm) {
    589		if (ceph_snap(inode) == CEPH_NOSNAP) {
    590			dout(" dropping residual ref to snap realm %p\n",
    591			     ci->i_snap_realm);
    592			ceph_change_snap_realm(inode, NULL);
    593		} else {
    594			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
    595			ci->i_snap_realm = NULL;
    596		}
    597	}
    598
    599	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
    600		frag = rb_entry(n, struct ceph_inode_frag, node);
    601		rb_erase(n, &ci->i_fragtree);
    602		kfree(frag);
    603	}
    604	ci->i_fragtree_nsplits = 0;
    605
    606	__ceph_destroy_xattrs(ci);
    607	if (ci->i_xattrs.blob)
    608		ceph_buffer_put(ci->i_xattrs.blob);
    609	if (ci->i_xattrs.prealloc_blob)
    610		ceph_buffer_put(ci->i_xattrs.prealloc_blob);
    611
    612	ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
    613	ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
    614}
    615
    616static inline blkcnt_t calc_inode_blocks(u64 size)
    617{
    618	return (size + (1<<9) - 1) >> 9;
    619}
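/*
 * Aside: i_blocks counts 512-byte sectors, so calc_inode_blocks() just
 * rounds the byte size up to the next sector.  A tiny userspace check
 * of that arithmetic (demo_* name is hypothetical):
 */
#include <assert.h>

static unsigned long long demo_inode_blocks(unsigned long long size)
{
	return (size + (1 << 9) - 1) >> 9;	/* round up to 512B units */
}

int main(void)
{
	assert(demo_inode_blocks(0) == 0);
	assert(demo_inode_blocks(1) == 1);
	assert(demo_inode_blocks(512) == 1);
	assert(demo_inode_blocks(513) == 2);
	return 0;
}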
    620
    621/*
    622 * Helpers to fill in size, ctime, mtime, and atime.  We have to be
    623 * careful because either the client or MDS may have more up to date
    624 * info, depending on which capabilities are held, and whether
    625 * time_warp_seq or truncate_seq have increased.  (Ordinarily, mtime
    626 * and size are monotonically increasing, except when utimes() or
    627 * truncate() increments the corresponding _seq values.)
    628 */
    629int ceph_fill_file_size(struct inode *inode, int issued,
    630			u32 truncate_seq, u64 truncate_size, u64 size)
    631{
    632	struct ceph_inode_info *ci = ceph_inode(inode);
    633	int queue_trunc = 0;
    634	loff_t isize = i_size_read(inode);
    635
    636	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
    637	    (truncate_seq == ci->i_truncate_seq && size > isize)) {
    638		dout("size %lld -> %llu\n", isize, size);
    639		if (size > 0 && S_ISDIR(inode->i_mode)) {
    640			pr_err("fill_file_size non-zero size for directory\n");
    641			size = 0;
    642		}
    643		i_size_write(inode, size);
    644		inode->i_blocks = calc_inode_blocks(size);
    645		/*
    646		 * If we're expanding, then we should be able to just update
    647		 * the existing cookie.
    648		 */
    649		if (size > isize)
    650			ceph_fscache_update(inode);
    651		ci->i_reported_size = size;
    652		if (truncate_seq != ci->i_truncate_seq) {
    653			dout("truncate_seq %u -> %u\n",
    654			     ci->i_truncate_seq, truncate_seq);
    655			ci->i_truncate_seq = truncate_seq;
    656
    657			/* the MDS should have revoked these caps */
    658			WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
    659					       CEPH_CAP_FILE_RD |
    660					       CEPH_CAP_FILE_WR |
    661					       CEPH_CAP_FILE_LAZYIO));
    662			/*
    663			 * If we hold relevant caps, or in the case where we're
    664			 * not the only client referencing this file and we
    665			 * don't hold those caps, then we need to check whether
    666			 * the file is either opened or mmaped
    667			 */
    668			if ((issued & (CEPH_CAP_FILE_CACHE|
    669				       CEPH_CAP_FILE_BUFFER)) ||
    670			    mapping_mapped(inode->i_mapping) ||
    671			    __ceph_is_file_opened(ci)) {
    672				ci->i_truncate_pending++;
    673				queue_trunc = 1;
    674			}
    675		}
    676	}
    677	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
    678	    ci->i_truncate_size != truncate_size) {
    679		dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
    680		     truncate_size);
    681		ci->i_truncate_size = truncate_size;
    682	}
    683	return queue_trunc;
    684}
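/*
 * Aside: truncate_seq (like the other seq counters here) is a u32 that
 * may wrap, so "newer" is decided by ceph_seq_cmp() rather than a plain
 * '>'.  Assuming the usual signed-difference idiom for such counters, a
 * userspace sketch:
 */
#include <stdio.h>

static int demo_seq_cmp(unsigned int a, unsigned int b)
{
	return (int)(a - b);	/* >0: a newer, 0: equal, <0: a older */
}

int main(void)
{
	/* 0x00000001 is "newer" than 0xffffffff despite being smaller */
	printf("%d\n", demo_seq_cmp(1, 0xffffffffu) > 0);	/* 1 */
	printf("%d\n", demo_seq_cmp(5, 7) > 0);			/* 0 */
	return 0;
}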
    685
    686void ceph_fill_file_time(struct inode *inode, int issued,
    687			 u64 time_warp_seq, struct timespec64 *ctime,
    688			 struct timespec64 *mtime, struct timespec64 *atime)
    689{
    690	struct ceph_inode_info *ci = ceph_inode(inode);
    691	int warn = 0;
    692
    693	if (issued & (CEPH_CAP_FILE_EXCL|
    694		      CEPH_CAP_FILE_WR|
    695		      CEPH_CAP_FILE_BUFFER|
    696		      CEPH_CAP_AUTH_EXCL|
    697		      CEPH_CAP_XATTR_EXCL)) {
    698		if (ci->i_version == 0 ||
    699		    timespec64_compare(ctime, &inode->i_ctime) > 0) {
    700			dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
    701			     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
    702			     ctime->tv_sec, ctime->tv_nsec);
    703			inode->i_ctime = *ctime;
    704		}
    705		if (ci->i_version == 0 ||
    706		    ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
    707			/* the MDS did a utimes() */
    708			dout("mtime %lld.%09ld -> %lld.%09ld "
    709			     "tw %d -> %d\n",
    710			     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
    711			     mtime->tv_sec, mtime->tv_nsec,
    712			     ci->i_time_warp_seq, (int)time_warp_seq);
    713
    714			inode->i_mtime = *mtime;
    715			inode->i_atime = *atime;
    716			ci->i_time_warp_seq = time_warp_seq;
    717		} else if (time_warp_seq == ci->i_time_warp_seq) {
    718			/* nobody did utimes(); take the max */
    719			if (timespec64_compare(mtime, &inode->i_mtime) > 0) {
    720				dout("mtime %lld.%09ld -> %lld.%09ld inc\n",
    721				     inode->i_mtime.tv_sec,
    722				     inode->i_mtime.tv_nsec,
    723				     mtime->tv_sec, mtime->tv_nsec);
    724				inode->i_mtime = *mtime;
    725			}
    726			if (timespec64_compare(atime, &inode->i_atime) > 0) {
    727				dout("atime %lld.%09ld -> %lld.%09ld inc\n",
    728				     inode->i_atime.tv_sec,
    729				     inode->i_atime.tv_nsec,
    730				     atime->tv_sec, atime->tv_nsec);
    731				inode->i_atime = *atime;
    732			}
    733		} else if (issued & CEPH_CAP_FILE_EXCL) {
    734			/* we did a utimes(); ignore mds values */
    735		} else {
    736			warn = 1;
    737		}
    738	} else {
    739		/* we have no write|excl caps; whatever the MDS says is true */
    740		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
    741			inode->i_ctime = *ctime;
    742			inode->i_mtime = *mtime;
    743			inode->i_atime = *atime;
    744			ci->i_time_warp_seq = time_warp_seq;
    745		} else {
    746			warn = 1;
    747		}
    748	}
    749	if (warn) /* time_warp_seq shouldn't go backwards */
    750		dout("%p mds time_warp_seq %llu < %u\n",
    751		     inode, time_warp_seq, ci->i_time_warp_seq);
    752}
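/*
 * Aside: when we hold write/excl caps, the mtime merge above has three
 * cases keyed by time_warp_seq.  A compressed userspace sketch of that
 * decision (the fresh-inode i_version == 0 special case is omitted;
 * demo_* names are hypothetical):
 */
#include <stdio.h>

enum { DEMO_TAKE_MDS, DEMO_TAKE_MAX, DEMO_KEEP_OURS, DEMO_WARN };

static int demo_mtime_merge(unsigned int mds_tw, unsigned int our_tw,
			    int we_hold_excl)
{
	if ((int)(mds_tw - our_tw) > 0)
		return DEMO_TAKE_MDS;	/* the MDS did a utimes() */
	if (mds_tw == our_tw)
		return DEMO_TAKE_MAX;	/* nobody warped; keep the max */
	if (we_hold_excl)
		return DEMO_KEEP_OURS;	/* we did a utimes(); ignore MDS */
	return DEMO_WARN;		/* time_warp_seq went backwards */
}

int main(void)
{
	printf("%d %d %d %d\n",
	       demo_mtime_merge(3, 2, 0),	/* 0: take mds values */
	       demo_mtime_merge(2, 2, 0),	/* 1: take max(mtime) */
	       demo_mtime_merge(1, 2, 1),	/* 2: keep ours */
	       demo_mtime_merge(1, 2, 0));	/* 3: warn */
	return 0;
}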
    753
    754/*
    755 * Populate an inode based on info from mds.  May be called on new or
    756 * existing inodes.
    757 */
    758int ceph_fill_inode(struct inode *inode, struct page *locked_page,
    759		    struct ceph_mds_reply_info_in *iinfo,
    760		    struct ceph_mds_reply_dirfrag *dirinfo,
    761		    struct ceph_mds_session *session, int cap_fmode,
    762		    struct ceph_cap_reservation *caps_reservation)
    763{
    764	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
    765	struct ceph_mds_reply_inode *info = iinfo->in;
    766	struct ceph_inode_info *ci = ceph_inode(inode);
    767	int issued, new_issued, info_caps;
    768	struct timespec64 mtime, atime, ctime;
    769	struct ceph_buffer *xattr_blob = NULL;
    770	struct ceph_buffer *old_blob = NULL;
    771	struct ceph_string *pool_ns = NULL;
    772	struct ceph_cap *new_cap = NULL;
    773	int err = 0;
    774	bool wake = false;
    775	bool queue_trunc = false;
    776	bool new_version = false;
    777	bool fill_inline = false;
    778	umode_t mode = le32_to_cpu(info->mode);
    779	dev_t rdev = le32_to_cpu(info->rdev);
    780
    781	lockdep_assert_held(&mdsc->snap_rwsem);
    782
    783	dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__,
    784	     inode, ceph_vinop(inode), le64_to_cpu(info->version),
    785	     ci->i_version);
    786
    787	/* Once I_NEW is cleared, we can't change type or dev numbers */
    788	if (inode->i_state & I_NEW) {
    789		inode->i_mode = mode;
    790	} else {
    791		if (inode_wrong_type(inode, mode)) {
    792			pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n",
    793				     ceph_vinop(inode), inode->i_mode, mode);
    794			return -ESTALE;
    795		}
    796
    797		if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) {
    798			pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n",
    799				     ceph_vinop(inode), MAJOR(inode->i_rdev),
    800				     MINOR(inode->i_rdev), MAJOR(rdev),
    801				     MINOR(rdev));
    802			return -ESTALE;
    803		}
    804	}
    805
    806	info_caps = le32_to_cpu(info->cap.caps);
    807
    808	/* prealloc new cap struct */
    809	if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
    810		new_cap = ceph_get_cap(mdsc, caps_reservation);
    811		if (!new_cap)
    812			return -ENOMEM;
    813	}
    814
    815	/*
    816	 * prealloc xattr data, if it looks like we'll need it.  only
    817	 * if len > 4 (meaning there are actually xattrs; the first 4
    818	 * bytes are the xattr count).
    819	 */
    820	if (iinfo->xattr_len > 4) {
    821		xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
    822		if (!xattr_blob)
    823			pr_err("%s ENOMEM xattr blob %d bytes\n", __func__,
    824			       iinfo->xattr_len);
    825	}
    826
    827	if (iinfo->pool_ns_len > 0)
    828		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
    829						     iinfo->pool_ns_len);
    830
    831	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
    832		ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
    833
    834	spin_lock(&ci->i_ceph_lock);
    835
    836	/*
     837	 * the version provided by the mds is odd if the inode value is
     838	 * projected (still in flux), and even if it is stable.  skip the
     839	 * update if we have newer stable info (ours>=theirs, e.g. due to
     840	 * racing mds replies), unless we are getting projected (unstable)
     841	 * info (in which case the version is odd, and we want ours>theirs).
    842	 *   us   them
    843	 *   2    2     skip
    844	 *   3    2     skip
    845	 *   3    3     update
    846	 */
    847	if (ci->i_version == 0 ||
    848	    ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
    849	     le64_to_cpu(info->version) > (ci->i_version & ~1)))
    850		new_version = true;
    851
    852	/* Update change_attribute */
    853	inode_set_max_iversion_raw(inode, iinfo->change_attr);
    854
    855	__ceph_caps_issued(ci, &issued);
    856	issued |= __ceph_caps_dirty(ci);
    857	new_issued = ~issued & info_caps;
    858
    859	/* directories have fl_stripe_unit set to zero */
    860	if (le32_to_cpu(info->layout.fl_stripe_unit))
    861		inode->i_blkbits =
    862			fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
    863	else
    864		inode->i_blkbits = CEPH_BLOCK_SHIFT;
    865
    866	__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
    867
    868	if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
    869	    (issued & CEPH_CAP_AUTH_EXCL) == 0) {
    870		inode->i_mode = mode;
    871		inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
    872		inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
    873		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
    874		     from_kuid(&init_user_ns, inode->i_uid),
    875		     from_kgid(&init_user_ns, inode->i_gid));
    876		ceph_decode_timespec64(&ci->i_btime, &iinfo->btime);
    877		ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
    878	}
    879
    880	if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
    881	    (issued & CEPH_CAP_LINK_EXCL) == 0)
    882		set_nlink(inode, le32_to_cpu(info->nlink));
    883
    884	if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    885		/* be careful with mtime, atime, size */
    886		ceph_decode_timespec64(&atime, &info->atime);
    887		ceph_decode_timespec64(&mtime, &info->mtime);
    888		ceph_decode_timespec64(&ctime, &info->ctime);
    889		ceph_fill_file_time(inode, issued,
    890				le32_to_cpu(info->time_warp_seq),
    891				&ctime, &mtime, &atime);
    892	}
    893
    894	if (new_version || (info_caps & CEPH_CAP_FILE_SHARED)) {
    895		ci->i_files = le64_to_cpu(info->files);
    896		ci->i_subdirs = le64_to_cpu(info->subdirs);
    897	}
    898
    899	if (new_version ||
    900	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    901		s64 old_pool = ci->i_layout.pool_id;
    902		struct ceph_string *old_ns;
    903
    904		ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
    905		old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
    906					lockdep_is_held(&ci->i_ceph_lock));
    907		rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
    908
    909		if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
    910			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
    911
    912		pool_ns = old_ns;
    913
    914		queue_trunc = ceph_fill_file_size(inode, issued,
    915					le32_to_cpu(info->truncate_seq),
    916					le64_to_cpu(info->truncate_size),
    917					le64_to_cpu(info->size));
    918		/* only update max_size on auth cap */
    919		if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
    920		    ci->i_max_size != le64_to_cpu(info->max_size)) {
    921			dout("max_size %lld -> %llu\n", ci->i_max_size,
    922					le64_to_cpu(info->max_size));
    923			ci->i_max_size = le64_to_cpu(info->max_size);
    924		}
    925	}
    926
    927	/* layout and rstat are not tracked by capability, update them if
    928	 * the inode info is from auth mds */
    929	if (new_version || (info->cap.flags & CEPH_CAP_FLAG_AUTH)) {
    930		if (S_ISDIR(inode->i_mode)) {
    931			ci->i_dir_layout = iinfo->dir_layout;
    932			ci->i_rbytes = le64_to_cpu(info->rbytes);
    933			ci->i_rfiles = le64_to_cpu(info->rfiles);
    934			ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
    935			ci->i_dir_pin = iinfo->dir_pin;
    936			ci->i_rsnaps = iinfo->rsnaps;
    937			ceph_decode_timespec64(&ci->i_rctime, &info->rctime);
    938		}
    939	}
    940
    941	/* xattrs */
    942	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
    943	if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))  &&
    944	    le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
    945		if (ci->i_xattrs.blob)
    946			old_blob = ci->i_xattrs.blob;
    947		ci->i_xattrs.blob = xattr_blob;
    948		if (xattr_blob)
    949			memcpy(ci->i_xattrs.blob->vec.iov_base,
    950			       iinfo->xattr_data, iinfo->xattr_len);
    951		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
    952		ceph_forget_all_cached_acls(inode);
    953		ceph_security_invalidate_secctx(inode);
    954		xattr_blob = NULL;
    955	}
    956
    957	/* finally update i_version */
    958	if (le64_to_cpu(info->version) > ci->i_version)
    959		ci->i_version = le64_to_cpu(info->version);
    960
    961	inode->i_mapping->a_ops = &ceph_aops;
    962
    963	switch (inode->i_mode & S_IFMT) {
    964	case S_IFIFO:
    965	case S_IFBLK:
    966	case S_IFCHR:
    967	case S_IFSOCK:
    968		inode->i_blkbits = PAGE_SHIFT;
    969		init_special_inode(inode, inode->i_mode, rdev);
    970		inode->i_op = &ceph_file_iops;
    971		break;
    972	case S_IFREG:
    973		inode->i_op = &ceph_file_iops;
    974		inode->i_fop = &ceph_file_fops;
    975		break;
    976	case S_IFLNK:
    977		inode->i_op = &ceph_symlink_iops;
    978		if (!ci->i_symlink) {
    979			u32 symlen = iinfo->symlink_len;
    980			char *sym;
    981
    982			spin_unlock(&ci->i_ceph_lock);
    983
    984			if (symlen != i_size_read(inode)) {
    985				pr_err("%s %llx.%llx BAD symlink "
    986					"size %lld\n", __func__,
    987					ceph_vinop(inode),
    988					i_size_read(inode));
    989				i_size_write(inode, symlen);
    990				inode->i_blocks = calc_inode_blocks(symlen);
    991			}
    992
    993			err = -ENOMEM;
    994			sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
    995			if (!sym)
    996				goto out;
    997
    998			spin_lock(&ci->i_ceph_lock);
    999			if (!ci->i_symlink)
   1000				ci->i_symlink = sym;
   1001			else
   1002				kfree(sym); /* lost a race */
   1003		}
   1004		inode->i_link = ci->i_symlink;
   1005		break;
   1006	case S_IFDIR:
   1007		inode->i_op = &ceph_dir_iops;
   1008		inode->i_fop = &ceph_dir_fops;
   1009		break;
   1010	default:
   1011		pr_err("%s %llx.%llx BAD mode 0%o\n", __func__,
   1012		       ceph_vinop(inode), inode->i_mode);
   1013	}
   1014
   1015	/* were we issued a capability? */
   1016	if (info_caps) {
   1017		if (ceph_snap(inode) == CEPH_NOSNAP) {
   1018			ceph_add_cap(inode, session,
   1019				     le64_to_cpu(info->cap.cap_id),
   1020				     info_caps,
   1021				     le32_to_cpu(info->cap.wanted),
   1022				     le32_to_cpu(info->cap.seq),
   1023				     le32_to_cpu(info->cap.mseq),
   1024				     le64_to_cpu(info->cap.realm),
   1025				     info->cap.flags, &new_cap);
   1026
   1027			/* set dir completion flag? */
   1028			if (S_ISDIR(inode->i_mode) &&
   1029			    ci->i_files == 0 && ci->i_subdirs == 0 &&
   1030			    (info_caps & CEPH_CAP_FILE_SHARED) &&
   1031			    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
   1032			    !__ceph_dir_is_complete(ci)) {
   1033				dout(" marking %p complete (empty)\n", inode);
   1034				i_size_write(inode, 0);
   1035				__ceph_dir_set_complete(ci,
   1036					atomic64_read(&ci->i_release_count),
   1037					atomic64_read(&ci->i_ordered_count));
   1038			}
   1039
   1040			wake = true;
   1041		} else {
   1042			dout(" %p got snap_caps %s\n", inode,
   1043			     ceph_cap_string(info_caps));
   1044			ci->i_snap_caps |= info_caps;
   1045		}
   1046	}
   1047
   1048	if (iinfo->inline_version > 0 &&
   1049	    iinfo->inline_version >= ci->i_inline_version) {
   1050		int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
   1051		ci->i_inline_version = iinfo->inline_version;
   1052		if (ci->i_inline_version != CEPH_INLINE_NONE &&
   1053		    (locked_page || (info_caps & cache_caps)))
   1054			fill_inline = true;
   1055	}
   1056
   1057	if (cap_fmode >= 0) {
   1058		if (!info_caps)
   1059			pr_warn("mds issued no caps on %llx.%llx\n",
   1060				ceph_vinop(inode));
   1061		__ceph_touch_fmode(ci, mdsc, cap_fmode);
   1062	}
   1063
   1064	spin_unlock(&ci->i_ceph_lock);
   1065
   1066	ceph_fscache_register_inode_cookie(inode);
   1067
   1068	if (fill_inline)
   1069		ceph_fill_inline_data(inode, locked_page,
   1070				      iinfo->inline_data, iinfo->inline_len);
   1071
   1072	if (wake)
   1073		wake_up_all(&ci->i_cap_wq);
   1074
   1075	/* queue truncate if we saw i_size decrease */
   1076	if (queue_trunc)
   1077		ceph_queue_vmtruncate(inode);
   1078
   1079	/* populate frag tree */
   1080	if (S_ISDIR(inode->i_mode))
   1081		ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
   1082
   1083	/* update delegation info? */
   1084	if (dirinfo)
   1085		ceph_fill_dirfrag(inode, dirinfo);
   1086
   1087	err = 0;
   1088out:
   1089	if (new_cap)
   1090		ceph_put_cap(mdsc, new_cap);
   1091	ceph_buffer_put(old_blob);
   1092	ceph_buffer_put(xattr_blob);
   1093	ceph_put_string(pool_ns);
   1094	return err;
   1095}
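/*
 * Aside: the i_version acceptance rule from the comment in
 * ceph_fill_inode() above -- stable versions are even, projected ones
 * odd, and "them" is accepted only if strictly newer than our stable
 * version.  As a userspace predicate (demo_* name is hypothetical):
 */
#include <stdio.h>

static int demo_accept_version(unsigned long long ours,
			       unsigned long long theirs)
{
	/* theirs > (ours & ~1): 2 vs 2 skip, 3 vs 2 skip, 3 vs 3 take */
	return theirs > (ours & ~1ULL);
}

int main(void)
{
	printf("%d %d %d\n",
	       demo_accept_version(2, 2),	/* 0: skip */
	       demo_accept_version(3, 2),	/* 0: skip */
	       demo_accept_version(3, 3));	/* 1: update */
	return 0;
}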
   1096
   1097/*
   1098 * caller should hold session s_mutex and dentry->d_lock.
   1099 */
   1100static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
   1101				  struct ceph_mds_reply_lease *lease,
   1102				  struct ceph_mds_session *session,
   1103				  unsigned long from_time,
   1104				  struct ceph_mds_session **old_lease_session)
   1105{
   1106	struct ceph_dentry_info *di = ceph_dentry(dentry);
   1107	unsigned mask = le16_to_cpu(lease->mask);
   1108	long unsigned duration = le32_to_cpu(lease->duration_ms);
   1109	long unsigned ttl = from_time + (duration * HZ) / 1000;
   1110	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
   1111
   1112	dout("update_dentry_lease %p duration %lu ms ttl %lu\n",
   1113	     dentry, duration, ttl);
   1114
   1115	/* only track leases on regular dentries */
   1116	if (ceph_snap(dir) != CEPH_NOSNAP)
   1117		return;
   1118
   1119	if (mask & CEPH_LEASE_PRIMARY_LINK)
   1120		di->flags |= CEPH_DENTRY_PRIMARY_LINK;
   1121	else
   1122		di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
   1123
   1124	di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
   1125	if (!(mask & CEPH_LEASE_VALID)) {
   1126		__ceph_dentry_dir_lease_touch(di);
   1127		return;
   1128	}
   1129
   1130	if (di->lease_gen == atomic_read(&session->s_cap_gen) &&
   1131	    time_before(ttl, di->time))
   1132		return;  /* we already have a newer lease. */
   1133
   1134	if (di->lease_session && di->lease_session != session) {
   1135		*old_lease_session = di->lease_session;
   1136		di->lease_session = NULL;
   1137	}
   1138
   1139	if (!di->lease_session)
   1140		di->lease_session = ceph_get_mds_session(session);
   1141	di->lease_gen = atomic_read(&session->s_cap_gen);
   1142	di->lease_seq = le32_to_cpu(lease->seq);
   1143	di->lease_renew_after = half_ttl;
   1144	di->lease_renew_from = 0;
   1145	di->time = ttl;
   1146
   1147	__ceph_dentry_lease_touch(di);
   1148}
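/*
 * Aside: lease lifetimes arrive from the MDS as milliseconds and are
 * converted to jiffies relative to when the request was started;
 * renewal is attempted at half the TTL (di->lease_renew_after).  A
 * userspace sketch of the arithmetic with a mock tick rate (DEMO_HZ
 * and the sample values are hypothetical):
 */
#include <stdio.h>

#define DEMO_HZ 250	/* stand-in for the kernel's HZ */

int main(void)
{
	unsigned long from_time = 100000;	/* jiffies at request start */
	unsigned long duration_ms = 30000;	/* lease->duration_ms */

	unsigned long ttl = from_time + (duration_ms * DEMO_HZ) / 1000;
	unsigned long half_ttl =
		from_time + (duration_ms * DEMO_HZ / 2) / 1000;

	printf("ttl %lu renew_after %lu\n", ttl, half_ttl);
	/* ttl 107500 renew_after 103750 */
	return 0;
}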
   1149
   1150static inline void update_dentry_lease(struct inode *dir, struct dentry *dentry,
   1151					struct ceph_mds_reply_lease *lease,
   1152					struct ceph_mds_session *session,
   1153					unsigned long from_time)
   1154{
   1155	struct ceph_mds_session *old_lease_session = NULL;
   1156	spin_lock(&dentry->d_lock);
   1157	__update_dentry_lease(dir, dentry, lease, session, from_time,
   1158			      &old_lease_session);
   1159	spin_unlock(&dentry->d_lock);
   1160	ceph_put_mds_session(old_lease_session);
   1161}
   1162
   1163/*
   1164 * update dentry lease without having parent inode locked
   1165 */
   1166static void update_dentry_lease_careful(struct dentry *dentry,
   1167					struct ceph_mds_reply_lease *lease,
   1168					struct ceph_mds_session *session,
   1169					unsigned long from_time,
   1170					char *dname, u32 dname_len,
   1171					struct ceph_vino *pdvino,
   1172					struct ceph_vino *ptvino)
   1173
   1174{
   1175	struct inode *dir;
   1176	struct ceph_mds_session *old_lease_session = NULL;
   1177
   1178	spin_lock(&dentry->d_lock);
   1179	/* make sure dentry's name matches target */
   1180	if (dentry->d_name.len != dname_len ||
   1181	    memcmp(dentry->d_name.name, dname, dname_len))
   1182		goto out_unlock;
   1183
   1184	dir = d_inode(dentry->d_parent);
   1185	/* make sure parent matches dvino */
   1186	if (!ceph_ino_compare(dir, pdvino))
   1187		goto out_unlock;
   1188
   1189	/* make sure dentry's inode matches target. NULL ptvino means that
   1190	 * we expect a negative dentry */
   1191	if (ptvino) {
   1192		if (d_really_is_negative(dentry))
   1193			goto out_unlock;
   1194		if (!ceph_ino_compare(d_inode(dentry), ptvino))
   1195			goto out_unlock;
   1196	} else {
   1197		if (d_really_is_positive(dentry))
   1198			goto out_unlock;
   1199	}
   1200
   1201	__update_dentry_lease(dir, dentry, lease, session,
   1202			      from_time, &old_lease_session);
   1203out_unlock:
   1204	spin_unlock(&dentry->d_lock);
   1205	ceph_put_mds_session(old_lease_session);
   1206}
   1207
   1208/*
   1209 * splice a dentry to an inode.
   1210 * caller must hold directory i_rwsem for this to be safe.
   1211 */
   1212static int splice_dentry(struct dentry **pdn, struct inode *in)
   1213{
   1214	struct dentry *dn = *pdn;
   1215	struct dentry *realdn;
   1216
   1217	BUG_ON(d_inode(dn));
   1218
   1219	if (S_ISDIR(in->i_mode)) {
   1220		/* If inode is directory, d_splice_alias() below will remove
   1221		 * 'realdn' from its origin parent. We need to ensure that
   1222		 * origin parent's readdir cache will not reference 'realdn'
   1223		 */
   1224		realdn = d_find_any_alias(in);
   1225		if (realdn) {
   1226			struct ceph_dentry_info *di = ceph_dentry(realdn);
   1227			spin_lock(&realdn->d_lock);
   1228
   1229			realdn->d_op->d_prune(realdn);
   1230
   1231			di->time = jiffies;
   1232			di->lease_shared_gen = 0;
   1233			di->offset = 0;
   1234
   1235			spin_unlock(&realdn->d_lock);
   1236			dput(realdn);
   1237		}
   1238	}
   1239
   1240	/* dn must be unhashed */
   1241	if (!d_unhashed(dn))
   1242		d_drop(dn);
   1243	realdn = d_splice_alias(in, dn);
   1244	if (IS_ERR(realdn)) {
   1245		pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
   1246		       PTR_ERR(realdn), dn, in, ceph_vinop(in));
   1247		return PTR_ERR(realdn);
   1248	}
   1249
   1250	if (realdn) {
   1251		dout("dn %p (%d) spliced with %p (%d) "
   1252		     "inode %p ino %llx.%llx\n",
   1253		     dn, d_count(dn),
   1254		     realdn, d_count(realdn),
   1255		     d_inode(realdn), ceph_vinop(d_inode(realdn)));
   1256		dput(dn);
   1257		*pdn = realdn;
   1258	} else {
   1259		BUG_ON(!ceph_dentry(dn));
   1260		dout("dn %p attached to %p ino %llx.%llx\n",
   1261		     dn, d_inode(dn), ceph_vinop(d_inode(dn)));
   1262	}
   1263	return 0;
   1264}
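/*
 * Aside: d_splice_alias() may return NULL (the dentry we passed in was
 * used as-is), an existing alias that must replace the one we passed
 * (the realdn branch above, which swaps *pdn), or an ERR_PTR on
 * failure -- which is why splice_dentry() takes a struct dentry **
 * rather than a plain dentry pointer.
 */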
   1265
   1266/*
   1267 * Incorporate results into the local cache.  This is either just
   1268 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
   1269 * after a lookup).
   1270 *
   1271 * A reply may contain
   1272 *         a directory inode along with a dentry.
   1273 *  and/or a target inode
   1274 *
   1275 * Called with snap_rwsem (read).
   1276 */
   1277int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
   1278{
   1279	struct ceph_mds_session *session = req->r_session;
   1280	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
   1281	struct inode *in = NULL;
   1282	struct ceph_vino tvino, dvino;
   1283	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
   1284	int err = 0;
   1285
   1286	dout("fill_trace %p is_dentry %d is_target %d\n", req,
   1287	     rinfo->head->is_dentry, rinfo->head->is_target);
   1288
   1289	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
   1290		dout("fill_trace reply is empty!\n");
   1291		if (rinfo->head->result == 0 && req->r_parent)
   1292			ceph_invalidate_dir_request(req);
   1293		return 0;
   1294	}
   1295
   1296	if (rinfo->head->is_dentry) {
   1297		struct inode *dir = req->r_parent;
   1298
   1299		if (dir) {
   1300			err = ceph_fill_inode(dir, NULL, &rinfo->diri,
   1301					      rinfo->dirfrag, session, -1,
   1302					      &req->r_caps_reservation);
   1303			if (err < 0)
   1304				goto done;
   1305		} else {
   1306			WARN_ON_ONCE(1);
   1307		}
   1308
   1309		if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
   1310		    test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
   1311		    !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
   1312			struct qstr dname;
   1313			struct dentry *dn, *parent;
   1314
   1315			BUG_ON(!rinfo->head->is_target);
   1316			BUG_ON(req->r_dentry);
   1317
   1318			parent = d_find_any_alias(dir);
   1319			BUG_ON(!parent);
   1320
   1321			dname.name = rinfo->dname;
   1322			dname.len = rinfo->dname_len;
   1323			dname.hash = full_name_hash(parent, dname.name, dname.len);
   1324			tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
   1325			tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
   1326retry_lookup:
   1327			dn = d_lookup(parent, &dname);
   1328			dout("d_lookup on parent=%p name=%.*s got %p\n",
   1329			     parent, dname.len, dname.name, dn);
   1330
   1331			if (!dn) {
   1332				dn = d_alloc(parent, &dname);
   1333				dout("d_alloc %p '%.*s' = %p\n", parent,
   1334				     dname.len, dname.name, dn);
   1335				if (!dn) {
   1336					dput(parent);
   1337					err = -ENOMEM;
   1338					goto done;
   1339				}
   1340				err = 0;
   1341			} else if (d_really_is_positive(dn) &&
   1342				   (ceph_ino(d_inode(dn)) != tvino.ino ||
   1343				    ceph_snap(d_inode(dn)) != tvino.snap)) {
   1344				dout(" dn %p points to wrong inode %p\n",
   1345				     dn, d_inode(dn));
   1346				ceph_dir_clear_ordered(dir);
   1347				d_delete(dn);
   1348				dput(dn);
   1349				goto retry_lookup;
   1350			}
   1351
   1352			req->r_dentry = dn;
   1353			dput(parent);
   1354		}
   1355	}
   1356
   1357	if (rinfo->head->is_target) {
   1358		/* Should be filled in by handle_reply */
   1359		BUG_ON(!req->r_target_inode);
   1360
   1361		in = req->r_target_inode;
   1362		err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti,
   1363				NULL, session,
   1364				(!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
   1365				 !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) &&
   1366				 rinfo->head->result == 0) ?  req->r_fmode : -1,
   1367				&req->r_caps_reservation);
   1368		if (err < 0) {
   1369			pr_err("ceph_fill_inode badness %p %llx.%llx\n",
   1370				in, ceph_vinop(in));
   1371			req->r_target_inode = NULL;
   1372			if (in->i_state & I_NEW)
   1373				discard_new_inode(in);
   1374			else
   1375				iput(in);
   1376			goto done;
   1377		}
   1378		if (in->i_state & I_NEW)
   1379			unlock_new_inode(in);
   1380	}
   1381
   1382	/*
   1383	 * ignore null lease/binding on snapdir ENOENT, or else we
   1384	 * will have trouble splicing in the virtual snapdir later
   1385	 */
   1386	if (rinfo->head->is_dentry &&
   1387            !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
   1388	    test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
   1389	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
   1390					       fsc->mount_options->snapdir_name,
   1391					       req->r_dentry->d_name.len))) {
   1392		/*
   1393		 * lookup link rename   : null -> possibly existing inode
   1394		 * mknod symlink mkdir  : null -> new inode
   1395		 * unlink               : linked -> null
   1396		 */
   1397		struct inode *dir = req->r_parent;
   1398		struct dentry *dn = req->r_dentry;
   1399		bool have_dir_cap, have_lease;
   1400
   1401		BUG_ON(!dn);
   1402		BUG_ON(!dir);
   1403		BUG_ON(d_inode(dn->d_parent) != dir);
   1404
   1405		dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
   1406		dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
   1407
   1408		BUG_ON(ceph_ino(dir) != dvino.ino);
   1409		BUG_ON(ceph_snap(dir) != dvino.snap);
   1410
   1411		/* do we have a lease on the whole dir? */
   1412		have_dir_cap =
   1413			(le32_to_cpu(rinfo->diri.in->cap.caps) &
   1414			 CEPH_CAP_FILE_SHARED);
   1415
   1416		/* do we have a dn lease? */
   1417		have_lease = have_dir_cap ||
   1418			le32_to_cpu(rinfo->dlease->duration_ms);
   1419		if (!have_lease)
   1420			dout("fill_trace  no dentry lease or dir cap\n");
   1421
   1422		/* rename? */
   1423		if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
   1424			struct inode *olddir = req->r_old_dentry_dir;
   1425			BUG_ON(!olddir);
   1426
   1427			dout(" src %p '%pd' dst %p '%pd'\n",
   1428			     req->r_old_dentry,
   1429			     req->r_old_dentry,
   1430			     dn, dn);
   1431			dout("fill_trace doing d_move %p -> %p\n",
   1432			     req->r_old_dentry, dn);
   1433
   1434			/* d_move screws up sibling dentries' offsets */
   1435			ceph_dir_clear_ordered(dir);
   1436			ceph_dir_clear_ordered(olddir);
   1437
   1438			d_move(req->r_old_dentry, dn);
   1439			dout(" src %p '%pd' dst %p '%pd'\n",
   1440			     req->r_old_dentry,
   1441			     req->r_old_dentry,
   1442			     dn, dn);
   1443
   1444			/* ensure target dentry is invalidated, despite
   1445			   rehashing bug in vfs_rename_dir */
   1446			ceph_invalidate_dentry_lease(dn);
   1447
   1448			dout("dn %p gets new offset %lld\n", req->r_old_dentry,
   1449			     ceph_dentry(req->r_old_dentry)->offset);
   1450
    1451			/* swap r_dentry and r_old_dentry in case
    1452			 * splice_dentry() gets called later. This is safe
   1453			 * because no other place will use them */
   1454			req->r_dentry = req->r_old_dentry;
   1455			req->r_old_dentry = dn;
   1456			dn = req->r_dentry;
   1457		}
   1458
   1459		/* null dentry? */
   1460		if (!rinfo->head->is_target) {
   1461			dout("fill_trace null dentry\n");
   1462			if (d_really_is_positive(dn)) {
   1463				dout("d_delete %p\n", dn);
   1464				ceph_dir_clear_ordered(dir);
   1465				d_delete(dn);
   1466			} else if (have_lease) {
   1467				if (d_unhashed(dn))
   1468					d_add(dn, NULL);
   1469			}
   1470
   1471			if (!d_unhashed(dn) && have_lease)
   1472				update_dentry_lease(dir, dn,
   1473						    rinfo->dlease, session,
   1474						    req->r_request_started);
   1475			goto done;
   1476		}
   1477
   1478		/* attach proper inode */
   1479		if (d_really_is_negative(dn)) {
   1480			ceph_dir_clear_ordered(dir);
   1481			ihold(in);
   1482			err = splice_dentry(&req->r_dentry, in);
   1483			if (err < 0)
   1484				goto done;
   1485			dn = req->r_dentry;  /* may have spliced */
   1486		} else if (d_really_is_positive(dn) && d_inode(dn) != in) {
   1487			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
   1488			     dn, d_inode(dn), ceph_vinop(d_inode(dn)),
   1489			     ceph_vinop(in));
   1490			d_invalidate(dn);
   1491			have_lease = false;
   1492		}
   1493
   1494		if (have_lease) {
   1495			update_dentry_lease(dir, dn,
   1496					    rinfo->dlease, session,
   1497					    req->r_request_started);
   1498		}
   1499		dout(" final dn %p\n", dn);
   1500	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
   1501		    req->r_op == CEPH_MDS_OP_MKSNAP) &&
   1502	           test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
   1503		   !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
   1504		struct inode *dir = req->r_parent;
   1505
   1506		/* fill out a snapdir LOOKUPSNAP dentry */
   1507		BUG_ON(!dir);
   1508		BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
   1509		BUG_ON(!req->r_dentry);
   1510		dout(" linking snapped dir %p to dn %p\n", in, req->r_dentry);
   1511		ceph_dir_clear_ordered(dir);
   1512		ihold(in);
   1513		err = splice_dentry(&req->r_dentry, in);
   1514		if (err < 0)
   1515			goto done;
   1516	} else if (rinfo->head->is_dentry && req->r_dentry) {
    1517		/* parent inode is not locked, be careful */
   1518		struct ceph_vino *ptvino = NULL;
   1519		dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
   1520		dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
   1521		if (rinfo->head->is_target) {
   1522			tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
   1523			tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
   1524			ptvino = &tvino;
   1525		}
   1526		update_dentry_lease_careful(req->r_dentry, rinfo->dlease,
   1527					    session, req->r_request_started,
   1528					    rinfo->dname, rinfo->dname_len,
   1529					    &dvino, ptvino);
   1530	}
   1531done:
   1532	dout("fill_trace done err=%d\n", err);
   1533	return err;
   1534}
   1535
   1536/*
   1537 * Prepopulate our cache with readdir results, leases, etc.
   1538 */
   1539static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
   1540					   struct ceph_mds_session *session)
   1541{
   1542	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
   1543	int i, err = 0;
   1544
   1545	for (i = 0; i < rinfo->dir_nr; i++) {
   1546		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
   1547		struct ceph_vino vino;
   1548		struct inode *in;
   1549		int rc;
   1550
   1551		vino.ino = le64_to_cpu(rde->inode.in->ino);
   1552		vino.snap = le64_to_cpu(rde->inode.in->snapid);
   1553
   1554		in = ceph_get_inode(req->r_dentry->d_sb, vino);
   1555		if (IS_ERR(in)) {
   1556			err = PTR_ERR(in);
   1557			dout("new_inode badness got %d\n", err);
   1558			continue;
   1559		}
   1560		rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
   1561				     -1, &req->r_caps_reservation);
   1562		if (rc < 0) {
   1563			pr_err("ceph_fill_inode badness on %p got %d\n",
   1564			       in, rc);
   1565			err = rc;
   1566			if (in->i_state & I_NEW) {
   1567				ihold(in);
   1568				discard_new_inode(in);
   1569			}
   1570		} else if (in->i_state & I_NEW) {
   1571			unlock_new_inode(in);
   1572		}
   1573
   1574		iput(in);
   1575	}
   1576
   1577	return err;
   1578}
   1579
   1580void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
   1581{
   1582	if (ctl->page) {
   1583		kunmap(ctl->page);
   1584		put_page(ctl->page);
   1585		ctl->page = NULL;
   1586	}
   1587}
   1588
   1589static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
   1590			      struct ceph_readdir_cache_control *ctl,
   1591			      struct ceph_mds_request *req)
   1592{
   1593	struct ceph_inode_info *ci = ceph_inode(dir);
   1594	unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
   1595	unsigned idx = ctl->index % nsize;
   1596	pgoff_t pgoff = ctl->index / nsize;
   1597
   1598	if (!ctl->page || pgoff != page_index(ctl->page)) {
   1599		ceph_readdir_cache_release(ctl);
   1600		if (idx == 0)
   1601			ctl->page = grab_cache_page(&dir->i_data, pgoff);
   1602		else
   1603			ctl->page = find_lock_page(&dir->i_data, pgoff);
   1604		if (!ctl->page) {
   1605			ctl->index = -1;
   1606			return idx == 0 ? -ENOMEM : 0;
   1607		}
   1608		/* reading/filling the cache are serialized by
   1609		 * i_rwsem, no need to use page lock */
   1610		unlock_page(ctl->page);
   1611		ctl->dentries = kmap(ctl->page);
   1612		if (idx == 0)
   1613			memset(ctl->dentries, 0, PAGE_SIZE);
   1614	}
   1615
   1616	if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
   1617	    req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
   1618		dout("readdir cache dn %p idx %d\n", dn, ctl->index);
   1619		ctl->dentries[idx] = dn;
   1620		ctl->index++;
   1621	} else {
   1622		dout("disable readdir cache\n");
   1623		ctl->index = -1;
   1624	}
   1625	return 0;
   1626}
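
/*
 * Illustration (userspace sketch, not kernel code): how fill_readdir_cache()
 * above maps a flat readdir-cache index onto (page, slot) pairs.  Page and
 * pointer sizes are assumed to be 4096 and 8 bytes here; the kernel derives
 * them from PAGE_SIZE and sizeof(struct dentry *).
 */
#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	const unsigned long nsize = page_size / 8;	/* dentry slots per page */
	unsigned long index;

	/* idx == 0 starts a fresh page (grab_cache_page above); any other
	 * slot looks up the existing page (find_lock_page) */
	for (index = 510; index <= 514; index++)
		printf("index %4lu -> page %lu slot %lu\n",
		       index, index / nsize, index % nsize);
	return 0;
}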
   1627
   1628int ceph_readdir_prepopulate(struct ceph_mds_request *req,
   1629			     struct ceph_mds_session *session)
   1630{
   1631	struct dentry *parent = req->r_dentry;
   1632	struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
   1633	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
   1634	struct qstr dname;
   1635	struct dentry *dn;
   1636	struct inode *in;
   1637	int err = 0, skipped = 0, ret, i;
   1638	u32 frag = le32_to_cpu(req->r_args.readdir.frag);
   1639	u32 last_hash = 0;
   1640	u32 fpos_offset;
   1641	struct ceph_readdir_cache_control cache_ctl = {};
   1642
   1643	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
   1644		return readdir_prepopulate_inodes_only(req, session);
   1645
   1646	if (rinfo->hash_order) {
   1647		if (req->r_path2) {
   1648			last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
   1649						  req->r_path2,
   1650						  strlen(req->r_path2));
   1651			last_hash = ceph_frag_value(last_hash);
   1652		} else if (rinfo->offset_hash) {
   1653			/* mds understands offset_hash */
   1654			WARN_ON_ONCE(req->r_readdir_offset != 2);
   1655			last_hash = le32_to_cpu(req->r_args.readdir.offset_hash);
   1656		}
   1657	}
   1658
   1659	if (rinfo->dir_dir &&
   1660	    le32_to_cpu(rinfo->dir_dir->frag) != frag) {
   1661		dout("readdir_prepopulate got new frag %x -> %x\n",
   1662		     frag, le32_to_cpu(rinfo->dir_dir->frag));
   1663		frag = le32_to_cpu(rinfo->dir_dir->frag);
   1664		if (!rinfo->hash_order)
   1665			req->r_readdir_offset = 2;
   1666	}
   1667
   1668	if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
   1669		dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
   1670		     rinfo->dir_nr, parent);
   1671	} else {
   1672		dout("readdir_prepopulate %d items under dn %p\n",
   1673		     rinfo->dir_nr, parent);
   1674		if (rinfo->dir_dir)
   1675			ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
   1676
   1677		if (ceph_frag_is_leftmost(frag) &&
   1678		    req->r_readdir_offset == 2 &&
   1679		    !(rinfo->hash_order && last_hash)) {
   1680			/* note dir version at start of readdir so we can
   1681			 * tell if any dentries get dropped */
   1682			req->r_dir_release_cnt =
   1683				atomic64_read(&ci->i_release_count);
   1684			req->r_dir_ordered_cnt =
   1685				atomic64_read(&ci->i_ordered_count);
   1686			req->r_readdir_cache_idx = 0;
   1687		}
   1688	}
   1689
   1690	cache_ctl.index = req->r_readdir_cache_idx;
   1691	fpos_offset = req->r_readdir_offset;
   1692
   1693	/* FIXME: release caps/leases if error occurs */
   1694	for (i = 0; i < rinfo->dir_nr; i++) {
   1695		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
   1696		struct ceph_vino tvino;
   1697
   1698		dname.name = rde->name;
   1699		dname.len = rde->name_len;
   1700		dname.hash = full_name_hash(parent, dname.name, dname.len);
   1701
   1702		tvino.ino = le64_to_cpu(rde->inode.in->ino);
   1703		tvino.snap = le64_to_cpu(rde->inode.in->snapid);
   1704
   1705		if (rinfo->hash_order) {
   1706			u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
   1707						 rde->name, rde->name_len);
   1708			hash = ceph_frag_value(hash);
   1709			if (hash != last_hash)
   1710				fpos_offset = 2;
   1711			last_hash = hash;
   1712			rde->offset = ceph_make_fpos(hash, fpos_offset++, true);
   1713		} else {
   1714			rde->offset = ceph_make_fpos(frag, fpos_offset++, false);
   1715		}
   1716
   1717retry_lookup:
   1718		dn = d_lookup(parent, &dname);
   1719		dout("d_lookup on parent=%p name=%.*s got %p\n",
   1720		     parent, dname.len, dname.name, dn);
   1721
   1722		if (!dn) {
   1723			dn = d_alloc(parent, &dname);
   1724			dout("d_alloc %p '%.*s' = %p\n", parent,
   1725			     dname.len, dname.name, dn);
   1726			if (!dn) {
   1727				dout("d_alloc badness\n");
   1728				err = -ENOMEM;
   1729				goto out;
   1730			}
   1731		} else if (d_really_is_positive(dn) &&
   1732			   (ceph_ino(d_inode(dn)) != tvino.ino ||
   1733			    ceph_snap(d_inode(dn)) != tvino.snap)) {
   1734			struct ceph_dentry_info *di = ceph_dentry(dn);
   1735			dout(" dn %p points to wrong inode %p\n",
   1736			     dn, d_inode(dn));
   1737
   1738			spin_lock(&dn->d_lock);
   1739			if (di->offset > 0 &&
   1740			    di->lease_shared_gen ==
   1741			    atomic_read(&ci->i_shared_gen)) {
   1742				__ceph_dir_clear_ordered(ci);
   1743				di->offset = 0;
   1744			}
   1745			spin_unlock(&dn->d_lock);
   1746
   1747			d_delete(dn);
   1748			dput(dn);
   1749			goto retry_lookup;
   1750		}
   1751
   1752		/* inode */
   1753		if (d_really_is_positive(dn)) {
   1754			in = d_inode(dn);
   1755		} else {
   1756			in = ceph_get_inode(parent->d_sb, tvino);
   1757			if (IS_ERR(in)) {
   1758				dout("new_inode badness\n");
   1759				d_drop(dn);
   1760				dput(dn);
   1761				err = PTR_ERR(in);
   1762				goto out;
   1763			}
   1764		}
   1765
   1766		ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session,
   1767				      -1, &req->r_caps_reservation);
   1768		if (ret < 0) {
    1769			pr_err("ceph_fill_inode badness on %p got %d\n", in, ret);
   1770			if (d_really_is_negative(dn)) {
   1771				if (in->i_state & I_NEW) {
   1772					ihold(in);
   1773					discard_new_inode(in);
   1774				}
   1775				iput(in);
   1776			}
   1777			d_drop(dn);
   1778			err = ret;
   1779			goto next_item;
   1780		}
   1781		if (in->i_state & I_NEW)
   1782			unlock_new_inode(in);
   1783
   1784		if (d_really_is_negative(dn)) {
   1785			if (ceph_security_xattr_deadlock(in)) {
   1786				dout(" skip splicing dn %p to inode %p"
   1787				     " (security xattr deadlock)\n", dn, in);
   1788				iput(in);
   1789				skipped++;
   1790				goto next_item;
   1791			}
   1792
   1793			err = splice_dentry(&dn, in);
   1794			if (err < 0)
   1795				goto next_item;
   1796		}
   1797
   1798		ceph_dentry(dn)->offset = rde->offset;
   1799
   1800		update_dentry_lease(d_inode(parent), dn,
   1801				    rde->lease, req->r_session,
   1802				    req->r_request_started);
   1803
   1804		if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
   1805			ret = fill_readdir_cache(d_inode(parent), dn,
   1806						 &cache_ctl, req);
   1807			if (ret < 0)
   1808				err = ret;
   1809		}
   1810next_item:
   1811		dput(dn);
   1812	}
   1813out:
   1814	if (err == 0 && skipped == 0) {
   1815		set_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags);
   1816		req->r_readdir_cache_idx = cache_ctl.index;
   1817	}
   1818	ceph_readdir_cache_release(&cache_ctl);
   1819	dout("readdir_prepopulate done\n");
   1820	return err;
   1821}
   1822
   1823bool ceph_inode_set_size(struct inode *inode, loff_t size)
   1824{
   1825	struct ceph_inode_info *ci = ceph_inode(inode);
   1826	bool ret;
   1827
   1828	spin_lock(&ci->i_ceph_lock);
   1829	dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size);
   1830	i_size_write(inode, size);
   1831	ceph_fscache_update(inode);
   1832	inode->i_blocks = calc_inode_blocks(size);
   1833
   1834	ret = __ceph_should_report_size(ci);
   1835
   1836	spin_unlock(&ci->i_ceph_lock);
   1837
   1838	return ret;
   1839}
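
/*
 * Typical caller pattern (sketch): the write path grows i_size and, when
 * __ceph_should_report_size() says the size has outrun what was last
 * reported to the MDS, follows up with a cap check, roughly:
 *
 *	if (ceph_inode_set_size(inode, pos))
 *		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
 */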
   1840
   1841void ceph_queue_inode_work(struct inode *inode, int work_bit)
   1842{
   1843	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
   1844	struct ceph_inode_info *ci = ceph_inode(inode);
   1845	set_bit(work_bit, &ci->i_work_mask);
   1846
   1847	ihold(inode);
   1848	if (queue_work(fsc->inode_wq, &ci->i_work)) {
   1849		dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask);
   1850	} else {
   1851		dout("queue_inode_work %p already queued, mask=%lx\n",
   1852		     inode, ci->i_work_mask);
   1853		iput(inode);
   1854	}
   1855}
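
/*
 * Callers do not usually pass CEPH_I_WORK_* bits directly; super.h wraps
 * this function in per-bit helpers, roughly as below (a sketch of the
 * wrappers, not a verbatim copy):
 */
static inline void ceph_queue_vmtruncate(struct inode *inode)
{
	ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE);
}

static inline void ceph_queue_invalidate(struct inode *inode)
{
	ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES);
}

static inline void ceph_queue_writeback(struct inode *inode)
{
	ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK);
}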
   1856
   1857static void ceph_do_invalidate_pages(struct inode *inode)
   1858{
   1859	struct ceph_inode_info *ci = ceph_inode(inode);
   1860	u32 orig_gen;
   1861	int check = 0;
   1862
   1863	ceph_fscache_invalidate(inode, false);
   1864
   1865	mutex_lock(&ci->i_truncate_mutex);
   1866
   1867	if (ceph_inode_is_shutdown(inode)) {
   1868		pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n",
   1869				    __func__, ceph_vinop(inode));
   1870		mapping_set_error(inode->i_mapping, -EIO);
   1871		truncate_pagecache(inode, 0);
   1872		mutex_unlock(&ci->i_truncate_mutex);
   1873		goto out;
   1874	}
   1875
   1876	spin_lock(&ci->i_ceph_lock);
   1877	dout("invalidate_pages %p gen %d revoking %d\n", inode,
   1878	     ci->i_rdcache_gen, ci->i_rdcache_revoking);
   1879	if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
   1880		if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
   1881			check = 1;
   1882		spin_unlock(&ci->i_ceph_lock);
   1883		mutex_unlock(&ci->i_truncate_mutex);
   1884		goto out;
   1885	}
   1886	orig_gen = ci->i_rdcache_gen;
   1887	spin_unlock(&ci->i_ceph_lock);
   1888
   1889	if (invalidate_inode_pages2(inode->i_mapping) < 0) {
   1890		pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
   1891		       ceph_vinop(inode));
   1892	}
   1893
   1894	spin_lock(&ci->i_ceph_lock);
   1895	if (orig_gen == ci->i_rdcache_gen &&
   1896	    orig_gen == ci->i_rdcache_revoking) {
   1897		dout("invalidate_pages %p gen %d successful\n", inode,
   1898		     ci->i_rdcache_gen);
   1899		ci->i_rdcache_revoking--;
   1900		check = 1;
   1901	} else {
   1902		dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
   1903		     inode, orig_gen, ci->i_rdcache_gen,
   1904		     ci->i_rdcache_revoking);
   1905		if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
   1906			check = 1;
   1907	}
   1908	spin_unlock(&ci->i_ceph_lock);
   1909	mutex_unlock(&ci->i_truncate_mutex);
   1910out:
   1911	if (check)
   1912		ceph_check_caps(ci, 0, NULL);
   1913}
   1914
   1915/*
   1916 * Make sure any pending truncation is applied before doing anything
   1917 * that may depend on it.
   1918 */
   1919void __ceph_do_pending_vmtruncate(struct inode *inode)
   1920{
   1921	struct ceph_inode_info *ci = ceph_inode(inode);
   1922	u64 to;
   1923	int wrbuffer_refs, finish = 0;
   1924
   1925	mutex_lock(&ci->i_truncate_mutex);
   1926retry:
   1927	spin_lock(&ci->i_ceph_lock);
   1928	if (ci->i_truncate_pending == 0) {
   1929		dout("__do_pending_vmtruncate %p none pending\n", inode);
   1930		spin_unlock(&ci->i_ceph_lock);
   1931		mutex_unlock(&ci->i_truncate_mutex);
   1932		return;
   1933	}
   1934
   1935	/*
   1936	 * make sure any dirty snapped pages are flushed before we
   1937	 * possibly truncate them.. so write AND block!
   1938	 */
   1939	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
   1940		spin_unlock(&ci->i_ceph_lock);
   1941		dout("__do_pending_vmtruncate %p flushing snaps first\n",
   1942		     inode);
   1943		filemap_write_and_wait_range(&inode->i_data, 0,
   1944					     inode->i_sb->s_maxbytes);
   1945		goto retry;
   1946	}
   1947
   1948	/* there should be no reader or writer */
   1949	WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
   1950
   1951	to = ci->i_truncate_size;
   1952	wrbuffer_refs = ci->i_wrbuffer_ref;
   1953	dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
   1954	     ci->i_truncate_pending, to);
   1955	spin_unlock(&ci->i_ceph_lock);
   1956
   1957	ceph_fscache_resize(inode, to);
   1958	truncate_pagecache(inode, to);
   1959
   1960	spin_lock(&ci->i_ceph_lock);
   1961	if (to == ci->i_truncate_size) {
   1962		ci->i_truncate_pending = 0;
   1963		finish = 1;
   1964	}
   1965	spin_unlock(&ci->i_ceph_lock);
   1966	if (!finish)
   1967		goto retry;
   1968
   1969	mutex_unlock(&ci->i_truncate_mutex);
   1970
   1971	if (wrbuffer_refs == 0)
   1972		ceph_check_caps(ci, 0, NULL);
   1973
   1974	wake_up_all(&ci->i_cap_wq);
   1975}
   1976
   1977static void ceph_inode_work(struct work_struct *work)
   1978{
   1979	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
   1980						 i_work);
   1981	struct inode *inode = &ci->netfs.inode;
   1982
   1983	if (test_and_clear_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask)) {
   1984		dout("writeback %p\n", inode);
   1985		filemap_fdatawrite(&inode->i_data);
   1986	}
   1987	if (test_and_clear_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask))
   1988		ceph_do_invalidate_pages(inode);
   1989
   1990	if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask))
   1991		__ceph_do_pending_vmtruncate(inode);
   1992
   1993	if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
   1994		ceph_check_caps(ci, 0, NULL);
   1995
   1996	if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
   1997		ceph_flush_snaps(ci, NULL);
   1998
   1999	iput(inode);
   2000}
   2001
   2002/*
   2003 * symlinks
   2004 */
   2005static const struct inode_operations ceph_symlink_iops = {
   2006	.get_link = simple_get_link,
   2007	.setattr = ceph_setattr,
   2008	.getattr = ceph_getattr,
   2009	.listxattr = ceph_listxattr,
   2010};
   2011
   2012int __ceph_setattr(struct inode *inode, struct iattr *attr)
   2013{
   2014	struct ceph_inode_info *ci = ceph_inode(inode);
   2015	unsigned int ia_valid = attr->ia_valid;
   2016	struct ceph_mds_request *req;
   2017	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
   2018	struct ceph_cap_flush *prealloc_cf;
   2019	int issued;
   2020	int release = 0, dirtied = 0;
   2021	int mask = 0;
   2022	int err = 0;
   2023	int inode_dirty_flags = 0;
   2024	bool lock_snap_rwsem = false;
   2025
   2026	prealloc_cf = ceph_alloc_cap_flush();
   2027	if (!prealloc_cf)
   2028		return -ENOMEM;
   2029
   2030	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
   2031				       USE_AUTH_MDS);
   2032	if (IS_ERR(req)) {
   2033		ceph_free_cap_flush(prealloc_cf);
   2034		return PTR_ERR(req);
   2035	}
   2036
   2037	spin_lock(&ci->i_ceph_lock);
   2038	issued = __ceph_caps_issued(ci, NULL);
   2039
   2040	if (!ci->i_head_snapc &&
   2041	    (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
   2042		lock_snap_rwsem = true;
   2043		if (!down_read_trylock(&mdsc->snap_rwsem)) {
   2044			spin_unlock(&ci->i_ceph_lock);
   2045			down_read(&mdsc->snap_rwsem);
   2046			spin_lock(&ci->i_ceph_lock);
   2047			issued = __ceph_caps_issued(ci, NULL);
   2048		}
   2049	}
   2050
   2051	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
   2052
   2053	if (ia_valid & ATTR_UID) {
   2054		dout("setattr %p uid %d -> %d\n", inode,
   2055		     from_kuid(&init_user_ns, inode->i_uid),
   2056		     from_kuid(&init_user_ns, attr->ia_uid));
   2057		if (issued & CEPH_CAP_AUTH_EXCL) {
   2058			inode->i_uid = attr->ia_uid;
   2059			dirtied |= CEPH_CAP_AUTH_EXCL;
   2060		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
   2061			   !uid_eq(attr->ia_uid, inode->i_uid)) {
   2062			req->r_args.setattr.uid = cpu_to_le32(
   2063				from_kuid(&init_user_ns, attr->ia_uid));
   2064			mask |= CEPH_SETATTR_UID;
   2065			release |= CEPH_CAP_AUTH_SHARED;
   2066		}
   2067	}
   2068	if (ia_valid & ATTR_GID) {
   2069		dout("setattr %p gid %d -> %d\n", inode,
   2070		     from_kgid(&init_user_ns, inode->i_gid),
   2071		     from_kgid(&init_user_ns, attr->ia_gid));
   2072		if (issued & CEPH_CAP_AUTH_EXCL) {
   2073			inode->i_gid = attr->ia_gid;
   2074			dirtied |= CEPH_CAP_AUTH_EXCL;
   2075		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
   2076			   !gid_eq(attr->ia_gid, inode->i_gid)) {
   2077			req->r_args.setattr.gid = cpu_to_le32(
   2078				from_kgid(&init_user_ns, attr->ia_gid));
   2079			mask |= CEPH_SETATTR_GID;
   2080			release |= CEPH_CAP_AUTH_SHARED;
   2081		}
   2082	}
   2083	if (ia_valid & ATTR_MODE) {
   2084		dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
   2085		     attr->ia_mode);
   2086		if (issued & CEPH_CAP_AUTH_EXCL) {
   2087			inode->i_mode = attr->ia_mode;
   2088			dirtied |= CEPH_CAP_AUTH_EXCL;
   2089		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
   2090			   attr->ia_mode != inode->i_mode) {
   2091			inode->i_mode = attr->ia_mode;
   2092			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
   2093			mask |= CEPH_SETATTR_MODE;
   2094			release |= CEPH_CAP_AUTH_SHARED;
   2095		}
   2096	}
   2097
   2098	if (ia_valid & ATTR_ATIME) {
   2099		dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode,
   2100		     inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
   2101		     attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
   2102		if (issued & CEPH_CAP_FILE_EXCL) {
   2103			ci->i_time_warp_seq++;
   2104			inode->i_atime = attr->ia_atime;
   2105			dirtied |= CEPH_CAP_FILE_EXCL;
   2106		} else if ((issued & CEPH_CAP_FILE_WR) &&
   2107			   timespec64_compare(&inode->i_atime,
   2108					    &attr->ia_atime) < 0) {
   2109			inode->i_atime = attr->ia_atime;
   2110			dirtied |= CEPH_CAP_FILE_WR;
   2111		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
   2112			   !timespec64_equal(&inode->i_atime, &attr->ia_atime)) {
   2113			ceph_encode_timespec64(&req->r_args.setattr.atime,
   2114					       &attr->ia_atime);
   2115			mask |= CEPH_SETATTR_ATIME;
   2116			release |= CEPH_CAP_FILE_SHARED |
   2117				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
   2118		}
   2119	}
   2120	if (ia_valid & ATTR_SIZE) {
   2121		loff_t isize = i_size_read(inode);
   2122
   2123		dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
   2124		if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
   2125			if (attr->ia_size > isize) {
   2126				i_size_write(inode, attr->ia_size);
   2127				inode->i_blocks = calc_inode_blocks(attr->ia_size);
   2128				ci->i_reported_size = attr->ia_size;
   2129				dirtied |= CEPH_CAP_FILE_EXCL;
   2130				ia_valid |= ATTR_MTIME;
   2131			}
   2132		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
   2133			   attr->ia_size != isize) {
   2134			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
   2135			req->r_args.setattr.old_size = cpu_to_le64(isize);
   2136			mask |= CEPH_SETATTR_SIZE;
   2137			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
   2138				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
   2139		}
   2140	}
   2141	if (ia_valid & ATTR_MTIME) {
   2142		dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
   2143		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
   2144		     attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
   2145		if (issued & CEPH_CAP_FILE_EXCL) {
   2146			ci->i_time_warp_seq++;
   2147			inode->i_mtime = attr->ia_mtime;
   2148			dirtied |= CEPH_CAP_FILE_EXCL;
   2149		} else if ((issued & CEPH_CAP_FILE_WR) &&
   2150			   timespec64_compare(&inode->i_mtime,
   2151					    &attr->ia_mtime) < 0) {
   2152			inode->i_mtime = attr->ia_mtime;
   2153			dirtied |= CEPH_CAP_FILE_WR;
   2154		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
   2155			   !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) {
   2156			ceph_encode_timespec64(&req->r_args.setattr.mtime,
   2157					       &attr->ia_mtime);
   2158			mask |= CEPH_SETATTR_MTIME;
   2159			release |= CEPH_CAP_FILE_SHARED |
   2160				   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
   2161		}
   2162	}
   2163
   2164	/* these do nothing */
   2165	if (ia_valid & ATTR_CTIME) {
   2166		bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
   2167					 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
   2168		dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
   2169		     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
   2170		     attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
   2171		     only ? "ctime only" : "ignored");
   2172		if (only) {
    2173			/*
    2174			 * if the kernel wants to dirty ctime but nothing else,
    2175			 * we need to choose a cap to dirty under, or do an
    2176			 * almost-no-op setattr
    2177			 */
   2178			if (issued & CEPH_CAP_AUTH_EXCL)
   2179				dirtied |= CEPH_CAP_AUTH_EXCL;
   2180			else if (issued & CEPH_CAP_FILE_EXCL)
   2181				dirtied |= CEPH_CAP_FILE_EXCL;
   2182			else if (issued & CEPH_CAP_XATTR_EXCL)
   2183				dirtied |= CEPH_CAP_XATTR_EXCL;
   2184			else
   2185				mask |= CEPH_SETATTR_CTIME;
   2186		}
   2187	}
   2188	if (ia_valid & ATTR_FILE)
   2189		dout("setattr %p ATTR_FILE ... hrm!\n", inode);
   2190
   2191	if (dirtied) {
   2192		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
   2193							   &prealloc_cf);
   2194		inode->i_ctime = attr->ia_ctime;
   2195	}
   2196
   2197	release &= issued;
   2198	spin_unlock(&ci->i_ceph_lock);
   2199	if (lock_snap_rwsem)
   2200		up_read(&mdsc->snap_rwsem);
   2201
   2202	if (inode_dirty_flags)
   2203		__mark_inode_dirty(inode, inode_dirty_flags);
   2204
   2205	if (mask) {
   2206		req->r_inode = inode;
   2207		ihold(inode);
   2208		req->r_inode_drop = release;
   2209		req->r_args.setattr.mask = cpu_to_le32(mask);
   2210		req->r_num_caps = 1;
   2211		req->r_stamp = attr->ia_ctime;
   2212		err = ceph_mdsc_do_request(mdsc, NULL, req);
   2213	}
   2214	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
   2215	     ceph_cap_string(dirtied), mask);
   2216
   2217	ceph_mdsc_put_request(req);
   2218	ceph_free_cap_flush(prealloc_cf);
   2219
   2220	if (err >= 0 && (mask & CEPH_SETATTR_SIZE))
   2221		__ceph_do_pending_vmtruncate(inode);
   2222
   2223	return err;
   2224}
   2225
   2226/*
   2227 * setattr
   2228 */
   2229int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
   2230		 struct iattr *attr)
   2231{
   2232	struct inode *inode = d_inode(dentry);
   2233	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
   2234	int err;
   2235
   2236	if (ceph_snap(inode) != CEPH_NOSNAP)
   2237		return -EROFS;
   2238
   2239	if (ceph_inode_is_shutdown(inode))
   2240		return -ESTALE;
   2241
   2242	err = setattr_prepare(&init_user_ns, dentry, attr);
   2243	if (err != 0)
   2244		return err;
   2245
   2246	if ((attr->ia_valid & ATTR_SIZE) &&
   2247	    attr->ia_size > max(i_size_read(inode), fsc->max_file_size))
   2248		return -EFBIG;
   2249
   2250	if ((attr->ia_valid & ATTR_SIZE) &&
   2251	    ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
   2252		return -EDQUOT;
   2253
   2254	err = __ceph_setattr(inode, attr);
   2255
   2256	if (err >= 0 && (attr->ia_valid & ATTR_MODE))
   2257		err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode);
   2258
   2259	return err;
   2260}
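
/*
 * Illustration (userspace sketch): the ia_valid branches in __ceph_setattr()
 * are driven by ordinary syscalls on a CephFS mount.  The path below is
 * hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/cephfs/demo", O_CREAT | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (fchmod(fd, 0600) < 0)	/* ATTR_MODE */
		perror("fchmod");
	if (ftruncate(fd, 4096) < 0)	/* ATTR_SIZE (+ mtime/ctime) */
		perror("ftruncate");
	if (futimens(fd, NULL) < 0)	/* ATTR_ATIME|ATTR_MTIME = now */
		perror("futimens");
	close(fd);
	return 0;
}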
   2261
   2262int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
   2263{
   2264	int issued = ceph_caps_issued(ceph_inode(inode));
   2265
    2266	/*
    2267	 * If any 'x' caps are issued we can just choose the auth MDS
    2268	 * instead of a random replica MDS, because only when the
    2269	 * Locker is in the LOCK_EXEC state can the loner client hold
    2270	 * 'x' caps. If we sent a getattr request to a replica MDS
    2271	 * instead, it would have to auth-pin and try to rdlock from
    2272	 * the auth MDS, forcing the auth MDS to transition the
    2273	 * Locker state to LOCK_SYNC; afterwards the lock state
    2274	 * changes back.
    2275	 *
    2276	 * These Locker state transitions are expensive and usually
    2277	 * require revoking caps from clients.
    2278	 */
   2279	if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
   2280	    || (mask & CEPH_STAT_RSTAT))
   2281		return USE_AUTH_MDS;
   2282	else
   2283		return USE_ANY_MDS;
   2284}
   2285
   2286/*
   2287 * Verify that we have a lease on the given mask.  If not,
   2288 * do a getattr against an mds.
   2289 */
   2290int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
   2291		      int mask, bool force)
   2292{
   2293	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
   2294	struct ceph_mds_client *mdsc = fsc->mdsc;
   2295	struct ceph_mds_request *req;
   2296	int mode;
   2297	int err;
   2298
   2299	if (ceph_snap(inode) == CEPH_SNAPDIR) {
   2300		dout("do_getattr inode %p SNAPDIR\n", inode);
   2301		return 0;
   2302	}
   2303
   2304	dout("do_getattr inode %p mask %s mode 0%o\n",
   2305	     inode, ceph_cap_string(mask), inode->i_mode);
   2306	if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
    2307		return 0;
   2308
   2309	mode = ceph_try_to_choose_auth_mds(inode, mask);
   2310	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
   2311	if (IS_ERR(req))
   2312		return PTR_ERR(req);
   2313	req->r_inode = inode;
   2314	ihold(inode);
   2315	req->r_num_caps = 1;
   2316	req->r_args.getattr.mask = cpu_to_le32(mask);
   2317	req->r_locked_page = locked_page;
   2318	err = ceph_mdsc_do_request(mdsc, NULL, req);
   2319	if (locked_page && err == 0) {
   2320		u64 inline_version = req->r_reply_info.targeti.inline_version;
   2321		if (inline_version == 0) {
   2322			/* the reply is supposed to contain inline data */
   2323			err = -EINVAL;
   2324		} else if (inline_version == CEPH_INLINE_NONE) {
   2325			err = -ENODATA;
   2326		} else {
   2327			err = req->r_reply_info.targeti.inline_len;
   2328		}
   2329	}
   2330	ceph_mdsc_put_request(req);
   2331	dout("do_getattr result=%d\n", err);
   2332	return err;
   2333}
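
/*
 * Most callers go through the two-argument wrapper in super.h, which
 * passes no locked page (sketch of the wrapper):
 *
 *	static inline int ceph_do_getattr(struct inode *inode, int mask,
 *					  bool force)
 *	{
 *		return __ceph_do_getattr(inode, NULL, mask, force);
 *	}
 */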
   2334
   2335int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
   2336		      size_t size)
   2337{
   2338	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
   2339	struct ceph_mds_client *mdsc = fsc->mdsc;
   2340	struct ceph_mds_request *req;
   2341	int mode = USE_AUTH_MDS;
   2342	int err;
   2343	char *xattr_value;
   2344	size_t xattr_value_len;
   2345
   2346	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
   2347	if (IS_ERR(req)) {
   2348		err = -ENOMEM;
   2349		goto out;
   2350	}
   2351
   2352	req->r_path2 = kstrdup(name, GFP_NOFS);
   2353	if (!req->r_path2) {
   2354		err = -ENOMEM;
   2355		goto put;
   2356	}
   2357
   2358	ihold(inode);
   2359	req->r_inode = inode;
   2360	err = ceph_mdsc_do_request(mdsc, NULL, req);
   2361	if (err < 0)
   2362		goto put;
   2363
   2364	xattr_value = req->r_reply_info.xattr_info.xattr_value;
   2365	xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
   2366
   2367	dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
   2368
   2369	err = (int)xattr_value_len;
   2370	if (size == 0)
   2371		goto put;
   2372
   2373	if (xattr_value_len > size) {
   2374		err = -ERANGE;
   2375		goto put;
   2376	}
   2377
   2378	memcpy(value, xattr_value, xattr_value_len);
   2379put:
   2380	ceph_mdsc_put_request(req);
   2381out:
   2382	dout("do_getvxattr result=%d\n", err);
   2383	return err;
   2384}
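
/*
 * Illustration (userspace sketch): GETVXATTR backs CephFS virtual xattrs
 * such as "ceph.dir.rbytes".  The size == 0 probe below corresponds to the
 * size == 0 early return above.  The mount path is hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
	const char *name = "ceph.dir.rbytes";
	ssize_t len = getxattr("/mnt/cephfs", name, NULL, 0); /* size probe */
	char *buf;

	if (len < 0) {
		perror("getxattr size");
		return 1;
	}
	buf = malloc(len + 1);
	if (!buf)
		return 1;
	len = getxattr("/mnt/cephfs", name, buf, len);
	if (len < 0) {
		perror("getxattr");
		free(buf);
		return 1;
	}
	buf[len] = '\0';
	printf("%s = %s\n", name, buf);
	free(buf);
	return 0;
}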
   2385
   2386
   2387/*
   2388 * Check inode permissions.  We verify we have a valid value for
   2389 * the AUTH cap, then call the generic handler.
   2390 */
   2391int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode,
   2392		    int mask)
   2393{
   2394	int err;
   2395
   2396	if (mask & MAY_NOT_BLOCK)
   2397		return -ECHILD;
   2398
   2399	err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
   2400
   2401	if (!err)
   2402		err = generic_permission(&init_user_ns, inode, mask);
   2403	return err;
   2404}
   2405
   2406/* Craft a mask of needed caps given a set of requested statx attrs. */
   2407static int statx_to_caps(u32 want, umode_t mode)
   2408{
   2409	int mask = 0;
   2410
   2411	if (want & (STATX_MODE|STATX_UID|STATX_GID|STATX_CTIME|STATX_BTIME))
   2412		mask |= CEPH_CAP_AUTH_SHARED;
   2413
   2414	if (want & (STATX_NLINK|STATX_CTIME)) {
   2415		/*
   2416		 * The link count for directories depends on inode->i_subdirs,
   2417		 * and that is only updated when Fs caps are held.
   2418		 */
   2419		if (S_ISDIR(mode))
   2420			mask |= CEPH_CAP_FILE_SHARED;
   2421		else
   2422			mask |= CEPH_CAP_LINK_SHARED;
   2423	}
   2424
   2425	if (want & (STATX_ATIME|STATX_MTIME|STATX_CTIME|STATX_SIZE|
   2426		    STATX_BLOCKS))
   2427		mask |= CEPH_CAP_FILE_SHARED;
   2428
   2429	if (want & (STATX_CTIME))
   2430		mask |= CEPH_CAP_XATTR_SHARED;
   2431
   2432	return mask;
   2433}
   2434
   2435/*
   2436 * Get all the attributes. If we have sufficient caps for the requested attrs,
   2437 * then we can avoid talking to the MDS at all.
   2438 */
   2439int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
   2440		 struct kstat *stat, u32 request_mask, unsigned int flags)
   2441{
   2442	struct inode *inode = d_inode(path->dentry);
   2443	struct ceph_inode_info *ci = ceph_inode(inode);
   2444	u32 valid_mask = STATX_BASIC_STATS;
   2445	int err = 0;
   2446
   2447	if (ceph_inode_is_shutdown(inode))
   2448		return -ESTALE;
   2449
   2450	/* Skip the getattr altogether if we're asked not to sync */
   2451	if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
   2452		err = ceph_do_getattr(inode,
   2453				statx_to_caps(request_mask, inode->i_mode),
   2454				flags & AT_STATX_FORCE_SYNC);
   2455		if (err)
   2456			return err;
   2457	}
   2458
   2459	generic_fillattr(&init_user_ns, inode, stat);
   2460	stat->ino = ceph_present_inode(inode);
   2461
   2462	/*
   2463	 * btime on newly-allocated inodes is 0, so if this is still set to
   2464	 * that, then assume that it's not valid.
   2465	 */
   2466	if (ci->i_btime.tv_sec || ci->i_btime.tv_nsec) {
   2467		stat->btime = ci->i_btime;
   2468		valid_mask |= STATX_BTIME;
   2469	}
   2470
   2471	if (ceph_snap(inode) == CEPH_NOSNAP)
   2472		stat->dev = inode->i_sb->s_dev;
   2473	else
   2474		stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
   2475
   2476	if (S_ISDIR(inode->i_mode)) {
   2477		if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
   2478					RBYTES))
   2479			stat->size = ci->i_rbytes;
   2480		else
   2481			stat->size = ci->i_files + ci->i_subdirs;
   2482		stat->blocks = 0;
   2483		stat->blksize = 65536;
    2484		/*
    2485		 * Some applications rely on the st_nlink value of
    2486		 * directories being either 0 (if unlinked) or
    2487		 * 2 + the number of subdirectories.
    2488		 */
   2489		if (stat->nlink == 1)
   2490			/* '.' + '..' + subdirs */
   2491			stat->nlink = 1 + 1 + ci->i_subdirs;
   2492	}
   2493
   2494	stat->result_mask = request_mask & valid_mask;
   2495	return err;
   2496}
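
/*
 * Illustration (userspace sketch): driving the request_mask and sync flags
 * handled by ceph_getattr() above.  Requires glibc >= 2.28 for statx(); the
 * file path is hypothetical.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	/* AT_STATX_DONT_SYNC skips the MDS round trip entirely;
	 * STATX_BTIME maps to CEPH_CAP_AUTH_SHARED in statx_to_caps() */
	if (statx(AT_FDCWD, "/mnt/cephfs/demo", AT_STATX_DONT_SYNC,
		  STATX_BASIC_STATS | STATX_BTIME, &stx) < 0) {
		perror("statx");
		return 1;
	}
	if (stx.stx_mask & STATX_BTIME)
		printf("btime %lld\n", (long long)stx.stx_btime.tv_sec);
	printf("nlink %u size %llu\n", stx.stx_nlink,
	       (unsigned long long)stx.stx_size);
	return 0;
}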
   2497
   2498void ceph_inode_shutdown(struct inode *inode)
   2499{
   2500	struct ceph_inode_info *ci = ceph_inode(inode);
   2501	struct rb_node *p;
   2502	int iputs = 0;
   2503	bool invalidate = false;
   2504
   2505	spin_lock(&ci->i_ceph_lock);
   2506	ci->i_ceph_flags |= CEPH_I_SHUTDOWN;
   2507	p = rb_first(&ci->i_caps);
   2508	while (p) {
   2509		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
   2510
   2511		p = rb_next(p);
   2512		iputs += ceph_purge_inode_cap(inode, cap, &invalidate);
   2513	}
   2514	spin_unlock(&ci->i_ceph_lock);
   2515
   2516	if (invalidate)
   2517		ceph_queue_invalidate(inode);
   2518	while (iputs--)
   2519		iput(inode);
   2520}