cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux
Log | Files | Refs | README | LICENSE | sfeed.txt

iversion.h (12670B)


      1/* SPDX-License-Identifier: GPL-2.0 */
      2#ifndef _LINUX_IVERSION_H
      3#define _LINUX_IVERSION_H
      4
      5#include <linux/fs.h>
      6
      7/*
      8 * The inode->i_version field:
      9 * ---------------------------
     10 * The change attribute (i_version) is mandated by NFSv4 and is mostly for
     11 * knfsd, but is also used for other purposes (e.g. IMA). The i_version must
     12 * appear different to observers if there was a change to the inode's data or
     13 * metadata since it was last queried.
     14 *
     15 * Observers see the i_version as a 64-bit number that never decreases. If it
     16 * remains the same since it was last checked, then nothing has changed in the
     17 * inode. If it's different then something has changed. Observers cannot infer
     18 * anything about the nature or magnitude of the changes from the value, only
     19 * that the inode has changed in some fashion.
     20 *
     21 * Not all filesystems properly implement the i_version counter. Subsystems that
     22 * want to use i_version field on an inode should first check whether the
     23 * filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro).
     24 *
     25 * Those that set SB_I_VERSION will automatically have their i_version counter
     26 * incremented on writes to normal files. If the SB_I_VERSION is not set, then
     27 * the VFS will not touch it on writes, and the filesystem can use it how it
     28 * wishes. Note that the filesystem is always responsible for updating the
     29 * i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.).
     30 * We consider these sorts of filesystems to have a kernel-managed i_version.
     31 *
     32 * It may be impractical for filesystems to keep i_version updates atomic with
     33 * respect to the changes that cause them.  They should, however, guarantee
     34 * that i_version updates are never visible before the changes that caused
     35 * them.  Also, i_version updates should never be delayed longer than it takes
     36 * the original change to reach disk.
     37 *
     38 * This implementation uses the low bit in the i_version field as a flag to
     39 * track when the value has been queried. If it has not been queried since it
     40 * was last incremented, we can skip the increment in most cases.
     41 *
     42 * In the event that we're updating the ctime, we will usually go ahead and
     43 * bump the i_version anyway. Since that has to go to stable storage in some
     44 * fashion, we might as well increment it as well.
     45 *
     46 * With this implementation, the value should always appear to observers to
     47 * increase over time if the file has changed. It's recommended to use
     48 * inode_eq_iversion() helper to compare values.
     49 *
     50 * Note that some filesystems (e.g. NFS and AFS) just use the field to store
     51 * a server-provided value (for the most part). For that reason, those
     52 * filesystems do not set SB_I_VERSION. These filesystems are considered to
     53 * have a self-managed i_version.
     54 *
     55 * Persistently storing the i_version
     56 * ----------------------------------
     57 * Queries of the i_version field are not gated on them hitting the backing
     58 * store. It's always possible that the host could crash after allowing
     59 * a query of the value but before it has made it to disk.
     60 *
     61 * To mitigate this problem, filesystems should always use
     62 * inode_set_iversion_queried when loading an existing inode from disk. This
     63 * ensures that the next attempted inode increment will result in the value
     64 * changing.
     65 *
     66 * Storing the value to disk therefore does not count as a query, so those
     67 * filesystems should use inode_peek_iversion to grab the value to be stored.
     68 * There is no need to flag the value as having been queried in that case.
     69 */
     70
     71/*
     72 * We borrow the lowest bit in the i_version to use as a flag to tell whether
     73 * it has been queried since we last incremented it. If it has, then we must
     74 * increment it on the next change. After that, we can clear the flag and
     75 * avoid incrementing it again until it has again been queried.
     76 */
     77#define I_VERSION_QUERIED_SHIFT	(1)
     78#define I_VERSION_QUERIED	(1ULL << (I_VERSION_QUERIED_SHIFT - 1))
     79#define I_VERSION_INCREMENT	(1ULL << I_VERSION_QUERIED_SHIFT)
     80
     81/**
     82 * inode_set_iversion_raw - set i_version to the specified raw value
     83 * @inode: inode to set
     84 * @val: new i_version value to set
     85 *
     86 * Set @inode's i_version field to @val. This function is for use by
     87 * filesystems that self-manage the i_version.
     88 *
     89 * For example, the NFS client stores its NFSv4 change attribute in this way,
     90 * and the AFS client stores the data_version from the server here.
     91 */
     92static inline void
     93inode_set_iversion_raw(struct inode *inode, u64 val)
     94{
     95	atomic64_set(&inode->i_version, val);
     96}
     97
     98/**
     99 * inode_peek_iversion_raw - grab a "raw" iversion value
    100 * @inode: inode from which i_version should be read
    101 *
    102 * Grab a "raw" inode->i_version value and return it. The i_version is not
    103 * flagged or converted in any way. This is mostly used to access a self-managed
    104 * i_version.
    105 *
    106 * With those filesystems, we want to treat the i_version as an entirely
    107 * opaque value.
    108 */
    109static inline u64
    110inode_peek_iversion_raw(const struct inode *inode)
    111{
    112	return atomic64_read(&inode->i_version);
    113}
    114
    115/**
    116 * inode_set_max_iversion_raw - update i_version new value is larger
    117 * @inode: inode to set
    118 * @val: new i_version to set
    119 *
    120 * Some self-managed filesystems (e.g Ceph) will only update the i_version
    121 * value if the new value is larger than the one we already have.
    122 */
    123static inline void
    124inode_set_max_iversion_raw(struct inode *inode, u64 val)
    125{
    126	u64 cur, old;
    127
    128	cur = inode_peek_iversion_raw(inode);
    129	for (;;) {
    130		if (cur > val)
    131			break;
    132		old = atomic64_cmpxchg(&inode->i_version, cur, val);
    133		if (likely(old == cur))
    134			break;
    135		cur = old;
    136	}
    137}
    138
    139/**
    140 * inode_set_iversion - set i_version to a particular value
    141 * @inode: inode to set
    142 * @val: new i_version value to set
    143 *
    144 * Set @inode's i_version field to @val. This function is for filesystems with
    145 * a kernel-managed i_version, for initializing a newly-created inode from
    146 * scratch.
    147 *
    148 * In this case, we do not set the QUERIED flag since we know that this value
    149 * has never been queried.
    150 */
    151static inline void
    152inode_set_iversion(struct inode *inode, u64 val)
    153{
    154	inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT);
    155}
    156
    157/**
    158 * inode_set_iversion_queried - set i_version to a particular value as quereied
    159 * @inode: inode to set
    160 * @val: new i_version value to set
    161 *
    162 * Set @inode's i_version field to @val, and flag it for increment on the next
    163 * change.
    164 *
    165 * Filesystems that persistently store the i_version on disk should use this
    166 * when loading an existing inode from disk.
    167 *
    168 * When loading in an i_version value from a backing store, we can't be certain
    169 * that it wasn't previously viewed before being stored. Thus, we must assume
    170 * that it was, to ensure that we don't end up handing out the same value for
    171 * different versions of the same inode.
    172 */
    173static inline void
    174inode_set_iversion_queried(struct inode *inode, u64 val)
    175{
    176	inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) |
    177				I_VERSION_QUERIED);
    178}
    179
    180/**
    181 * inode_maybe_inc_iversion - increments i_version
    182 * @inode: inode with the i_version that should be updated
    183 * @force: increment the counter even if it's not necessary?
    184 *
    185 * Every time the inode is modified, the i_version field must be seen to have
    186 * changed by any observer.
    187 *
    188 * If "force" is set or the QUERIED flag is set, then ensure that we increment
    189 * the value, and clear the queried flag.
    190 *
    191 * In the common case where neither is set, then we can return "false" without
    192 * updating i_version.
    193 *
    194 * If this function returns false, and no other metadata has changed, then we
    195 * can avoid logging the metadata.
    196 */
    197static inline bool
    198inode_maybe_inc_iversion(struct inode *inode, bool force)
    199{
    200	u64 cur, old, new;
    201
    202	/*
    203	 * The i_version field is not strictly ordered with any other inode
    204	 * information, but the legacy inode_inc_iversion code used a spinlock
    205	 * to serialize increments.
    206	 *
    207	 * Here, we add full memory barriers to ensure that any de-facto
    208	 * ordering with other info is preserved.
    209	 *
    210	 * This barrier pairs with the barrier in inode_query_iversion()
    211	 */
    212	smp_mb();
    213	cur = inode_peek_iversion_raw(inode);
    214	for (;;) {
    215		/* If flag is clear then we needn't do anything */
    216		if (!force && !(cur & I_VERSION_QUERIED))
    217			return false;
    218
    219		/* Since lowest bit is flag, add 2 to avoid it */
    220		new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
    221
    222		old = atomic64_cmpxchg(&inode->i_version, cur, new);
    223		if (likely(old == cur))
    224			break;
    225		cur = old;
    226	}
    227	return true;
    228}
    229
    230
    231/**
    232 * inode_inc_iversion - forcibly increment i_version
    233 * @inode: inode that needs to be updated
    234 *
    235 * Forcbily increment the i_version field. This always results in a change to
    236 * the observable value.
    237 */
    238static inline void
    239inode_inc_iversion(struct inode *inode)
    240{
    241	inode_maybe_inc_iversion(inode, true);
    242}
    243
    244/**
    245 * inode_iversion_need_inc - is the i_version in need of being incremented?
    246 * @inode: inode to check
    247 *
    248 * Returns whether the inode->i_version counter needs incrementing on the next
    249 * change. Just fetch the value and check the QUERIED flag.
    250 */
    251static inline bool
    252inode_iversion_need_inc(struct inode *inode)
    253{
    254	return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED;
    255}
    256
    257/**
    258 * inode_inc_iversion_raw - forcibly increment raw i_version
    259 * @inode: inode that needs to be updated
    260 *
    261 * Forcbily increment the raw i_version field. This always results in a change
    262 * to the raw value.
    263 *
    264 * NFS will use the i_version field to store the value from the server. It
    265 * mostly treats it as opaque, but in the case where it holds a write
    266 * delegation, it must increment the value itself. This function does that.
    267 */
    268static inline void
    269inode_inc_iversion_raw(struct inode *inode)
    270{
    271	atomic64_inc(&inode->i_version);
    272}
    273
    274/**
    275 * inode_peek_iversion - read i_version without flagging it to be incremented
    276 * @inode: inode from which i_version should be read
    277 *
    278 * Read the inode i_version counter for an inode without registering it as a
    279 * query.
    280 *
    281 * This is typically used by local filesystems that need to store an i_version
    282 * on disk. In that situation, it's not necessary to flag it as having been
    283 * viewed, as the result won't be used to gauge changes from that point.
    284 */
    285static inline u64
    286inode_peek_iversion(const struct inode *inode)
    287{
    288	return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT;
    289}
    290
    291/**
    292 * inode_query_iversion - read i_version for later use
    293 * @inode: inode from which i_version should be read
    294 *
    295 * Read the inode i_version counter. This should be used by callers that wish
    296 * to store the returned i_version for later comparison. This will guarantee
    297 * that a later query of the i_version will result in a different value if
    298 * anything has changed.
    299 *
    300 * In this implementation, we fetch the current value, set the QUERIED flag and
    301 * then try to swap it into place with a cmpxchg, if it wasn't already set. If
    302 * that fails, we try again with the newly fetched value from the cmpxchg.
    303 */
    304static inline u64
    305inode_query_iversion(struct inode *inode)
    306{
    307	u64 cur, old, new;
    308
    309	cur = inode_peek_iversion_raw(inode);
    310	for (;;) {
    311		/* If flag is already set, then no need to swap */
    312		if (cur & I_VERSION_QUERIED) {
    313			/*
    314			 * This barrier (and the implicit barrier in the
    315			 * cmpxchg below) pairs with the barrier in
    316			 * inode_maybe_inc_iversion().
    317			 */
    318			smp_mb();
    319			break;
    320		}
    321
    322		new = cur | I_VERSION_QUERIED;
    323		old = atomic64_cmpxchg(&inode->i_version, cur, new);
    324		if (likely(old == cur))
    325			break;
    326		cur = old;
    327	}
    328	return cur >> I_VERSION_QUERIED_SHIFT;
    329}
    330
    331/*
    332 * For filesystems without any sort of change attribute, the best we can
    333 * do is fake one up from the ctime:
    334 */
    335static inline u64 time_to_chattr(struct timespec64 *t)
    336{
    337	u64 chattr = t->tv_sec;
    338
    339	chattr <<= 32;
    340	chattr += t->tv_nsec;
    341	return chattr;
    342}
    343
    344/**
    345 * inode_eq_iversion_raw - check whether the raw i_version counter has changed
    346 * @inode: inode to check
    347 * @old: old value to check against its i_version
    348 *
    349 * Compare the current raw i_version counter with a previous one. Returns true
    350 * if they are the same or false if they are different.
    351 */
    352static inline bool
    353inode_eq_iversion_raw(const struct inode *inode, u64 old)
    354{
    355	return inode_peek_iversion_raw(inode) == old;
    356}
    357
    358/**
    359 * inode_eq_iversion - check whether the i_version counter has changed
    360 * @inode: inode to check
    361 * @old: old value to check against its i_version
    362 *
    363 * Compare an i_version counter with a previous one. Returns true if they are
    364 * the same, and false if they are different.
    365 *
    366 * Note that we don't need to set the QUERIED flag in this case, as the value
    367 * in the inode is not being recorded for later use.
    368 */
    369static inline bool
    370inode_eq_iversion(const struct inode *inode, u64 old)
    371{
    372	return inode_peek_iversion(inode) == old;
    373}
    374#endif