cachepc-linux

Fork of AMDESE/linux with modifications for CachePC side-channel attack
git clone https://git.sinitax.com/sinitax/cachepc-linux

qspinlock.c (17100B)


// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Queued spinlock
 *
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014,2018 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 *
 * Authors: Waiman Long <longman@redhat.com>
 *          Peter Zijlstra <peterz@infradead.org>
 */

#ifndef _GEN_PV_LOCK_SLOWPATH

#include <linux/smp.h>
#include <linux/bug.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
#include <trace/events/lock.h>

/*
 * Include queued spinlock statistics code
 */
#include "qspinlock_stat.h"

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. A copy of the original MCS lock paper ("Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors" by Mellor-Crummey and
 * Scott) is available at
 *
 * https://bugzilla.kernel.org/show_bug.cgi?id=206115
 *
 * This queued spinlock implementation is based on the MCS lock; however, to
 * make it fit into the 4 bytes we assume spinlock_t to be, and to preserve
 * its existing API, we must modify it.
 *
 * In particular, where the traditional MCS lock consists of a tail pointer
 * (8 bytes) and needs the next pointer (another 8 bytes) of its own node to
 * unlock the next pending waiter (next->locked), we compress both of these:
 * {tail, next->locked} into a single u32 value.
 *
 * A spinlock disables recursion of its own context, and there is a limit to
 * the contexts that can nest: task, softirq, hardirq and nmi. With at most
 * 4 nesting levels, the level can be encoded in a 2-bit number, and the tail
 * can be encoded by combining that 2-bit nesting level with the cpu number.
 * With one byte for the lock value and 3 bytes for the tail, only a 32-bit
 * word is needed. Even though we only need 1 bit for the lock, we extend it
 * to a full byte to achieve better performance on architectures that support
 * atomic byte writes.
 *
 * We also change the first spinner to spin on the lock bit instead of its
 * node, thereby avoiding the need to carry a node from lock to unlock and
 * preserving the existing lock API. This also makes the unlock code simpler
 * and faster.
 *
 * N.B. The current implementation only supports architectures that allow
 *      atomic operations on smaller 8-bit and 16-bit data types.
 *
 */
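
/*
 * For illustration, assuming the NR_CPUS < 16K layout from qspinlock_types.h
 * (locked byte in bits 0-7, pending in bit 8, tail index in bits 16-17,
 * tail cpu + 1 in bits 18-31), a waiter on cpu 5 at nesting level 2 would
 * encode its tail as
 *
 *	tail  = (5 + 1) << 18		-> 0x180000
 *	tail |=      2  << 16		-> 0x1a0000
 *
 * and decoding 0x1a0000 recovers cpu = (0x1a0000 >> 18) - 1 = 5 and
 * idx = (0x1a0000 & 0x30000) >> 16 = 2; see encode_tail()/decode_tail()
 * below.
 */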

#include "mcs_spinlock.h"
#define MAX_NODES	4

/*
 * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
 * size and four of them will fit nicely in one 64-byte cacheline. For
 * pvqspinlock, however, we need more space for extra data. To accommodate
 * that, we insert two more long words to pad it up to 32 bytes. IOW, only
 * two of them can fit in a cacheline in this case. That is OK as it is rare
 * to have more than 2 levels of slowpath nesting in actual use. We don't
 * want to penalize pvqspinlocks to optimize for a rare case in native
 * qspinlocks.
 */
struct qnode {
	struct mcs_spinlock mcs;
#ifdef CONFIG_PARAVIRT_SPINLOCKS
	long reserved[2];
#endif
};
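
/*
 * Rough sizing sketch (assuming the usual 64-bit layout): struct mcs_spinlock
 * is a pointer plus two ints (8 + 4 + 4 = 16 bytes), so four native qnodes
 * fill one 64-byte cacheline; with CONFIG_PARAVIRT_SPINLOCKS the two reserved
 * longs add 16 bytes, giving 32-byte qnodes and two per cacheline. A
 * build-time check along these lines would express that budget:
 *
 *	BUILD_BUG_ON(sizeof(struct qnode) * MAX_NODES > 2 * SMP_CACHE_BYTES);
 */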

/*
 * The pending bit spinning loop count.
 * This heuristic is used to limit the number of lockword accesses
 * made by atomic_cond_read_relaxed when waiting for the lock to
 * transition out of the "== _Q_PENDING_VAL" state. We don't spin
 * indefinitely because there's no guarantee that we'll make forward
 * progress.
 */
#ifndef _Q_PENDING_LOOPS
#define _Q_PENDING_LOOPS	1
#endif
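
/*
 * An architecture can raise this bound from its asm/qspinlock.h before this
 * file is compiled, roughly along the lines of
 *
 *	#define _Q_PENDING_LOOPS	(1 << 9)
 *
 * so that a waiter spins a little longer through a pending->locked hand-over
 * before giving up and queueing.
 */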

/*
 * Per-CPU queue node structures; we can never have more than 4 nested
 * contexts: task, softirq, hardirq, nmi.
 *
 * Exactly fits one 64-byte cacheline on a 64-bit architecture.
 *
 * PV doubles the storage and uses the second cacheline for PV state.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);

/*
 * We must be able to distinguish between no-tail and the tail at 0:0,
 * therefore increment the cpu number by one.
 */

static inline __pure u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
{
	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return per_cpu_ptr(&qnodes[idx].mcs, cpu);
}

static inline __pure
struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
{
	return &((struct qnode *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
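
/*
 * The two implementations below are selected by _Q_PENDING_BITS from
 * qspinlock_types.h: when the tail cpu field leaves room (NR_CPUS < 16K),
 * pending occupies a whole byte, so it and the locked byte can be updated
 * with plain byte-sized stores; otherwise pending is a single bit and the
 * #else branch has to fall back to atomic read-modify-write operations on
 * the whole word.
 */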

#if _Q_PENDING_BITS == 8
/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(struct qspinlock *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/*
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail), which heads an address dependency
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	/*
	 * We can use relaxed semantics since the caller ensures that the
	 * MCS node is properly initialized before updating the tail.
	 */
	return (u32)xchg_relaxed(&lock->tail,
				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
}

#else /* _Q_PENDING_BITS == 8 */

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(struct qspinlock *lock)
{
	atomic_andnot(_Q_PENDING_VAL, &lock->val);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 */
static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
{
	atomic_add(-_Q_PENDING_VAL + _Q_LOCKED_VAL, &lock->val);
}

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
{
	u32 old, new, val = atomic_read(&lock->val);

	for (;;) {
		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		old = atomic_cmpxchg_relaxed(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}
	return old;
}
#endif /* _Q_PENDING_BITS == 8 */

/**
 * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
 * @lock : Pointer to queued spinlock structure
 * Return: The previous lock value
 *
 * *,*,* -> *,1,*
 */
#ifndef queued_fetch_set_pending_acquire
static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
{
	return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
}
#endif

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(struct qspinlock *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}


/*
 * Generate the native code for queued_spin_lock_slowpath(); provide NOPs for
 * all the PV callbacks.
 */

static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
					   struct mcs_spinlock *prev) { }
static __always_inline void __pv_kick_node(struct qspinlock *lock,
					   struct mcs_spinlock *node) { }
static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
						   struct mcs_spinlock *node)
						   { return 0; }

#define pv_enabled()		false

#define pv_init_node		__pv_init_node
#define pv_wait_node		__pv_wait_node
#define pv_kick_node		__pv_kick_node
#define pv_wait_head_or_lock	__pv_wait_head_or_lock

#ifdef CONFIG_PARAVIRT_SPINLOCKS
#define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
#endif

#endif /* _GEN_PV_LOCK_SLOWPATH */
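
/*
 * With CONFIG_PARAVIRT_SPINLOCKS, this first pass over the file therefore
 * emits native_queued_spin_lock_slowpath() with the pv_* hooks stubbed out;
 * the bottom of the file re-includes qspinlock.c with _GEN_PV_LOCK_SLOWPATH
 * defined to generate __pv_queued_spin_lock_slowpath() as a second variant.
 */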

/**
 * queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
 *   queue               :       | ^--'                          |  :
 *                       :       v                               |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	u32 old, tail;
	int idx;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (pv_enabled())
		goto pv_queue;

	if (virt_spin_lock(lock))
		return;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed(&lock->val,
					       (VAL != _Q_PENDING_VAL) || !cnt--);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = queued_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> 0,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	lockevent_inc(lock_pending);
	return;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	lockevent_inc(lock_slowpath);
pv_queue:
	node = this_cpu_ptr(&qnodes[0].mcs);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	trace_contention_begin(lock, LCB_F_SPIN);

	/*
	 * 4 nodes are allocated based on the assumption that there will
	 * not be nested NMIs taking spinlocks. That may not be true in
	 * some architectures even though the chance of needing more than
	 * 4 nodes will still be extremely unlikely. When that happens,
	 * we fall back to spinning on the lock directly without using
	 * any MCS node. This is not the most elegant solution, but is
	 * simple enough.
	 */
	if (unlikely(idx >= MAX_NODES)) {
		lockevent_inc(lock_no_node);
		while (!queued_spin_trylock(lock))
			cpu_relax();
		goto release;
	}

	node = grab_mcs_node(node, idx);

	/*
	 * Keep counts of non-zero index values:
	 */
	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;
	pv_init_node(node);

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		pv_wait_node(node, prev);
		arch_mcs_spin_lock_contended(&node->locked);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We optimistically load
		 * the next pointer & prefetch the cacheline for writing
		 * to reduce latency in the upcoming MCS unlock operation.
		 */
		next = READ_ONCE(node->next);
		if (next)
			prefetchw(next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 * The PV pv_wait_head_or_lock function, if active, will acquire
	 * the lock and return a non-zero value. So we have to skip the
	 * atomic_cond_read_acquire() call. As the next PV queue head hasn't
	 * been designated yet, there is no way for the locked value to become
	 * _Q_SLOW_VAL. So both the set_locked() and the
	 * atomic_cmpxchg_relaxed() calls will be safe.
	 *
	 * If PV isn't active, 0 will be returned instead.
	 *
	 */
	if ((val = pv_wait_head_or_lock(lock, node)))
		goto locked;

	val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));

locked:
	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 *       above wait condition, therefore any concurrent setting of
	 *       PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us, or _Q_PENDING_VAL got set;
	 * whoever set it will then detect the remaining tail, queue behind
	 * us and ensure we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed(&node->next, (VAL));

	arch_mcs_spin_unlock_contended(&next->locked);
	pv_kick_node(lock, next);

release:
	trace_contention_end(lock, 0);

	/*
	 * release the node
	 */
	__this_cpu_dec(qnodes[0].mcs.count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);
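
/*
 * For reference, this slowpath is typically entered from the inline fast
 * path, roughly as in include/asm-generic/qspinlock.h: a single
 * acquire-cmpxchg of 0 -> _Q_LOCKED_VAL, with the observed value handed to
 * the slowpath on failure. Approximately:
 *
 *	static __always_inline void queued_spin_lock(struct qspinlock *lock)
 *	{
 *		int val = 0;
 *
 *		if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
 *			return;
 *
 *		queued_spin_lock_slowpath(lock, val);
 *	}
 */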

/*
 * Generate the paravirt code for queued_spin_lock_slowpath().
 */
#if !defined(_GEN_PV_LOCK_SLOWPATH) && defined(CONFIG_PARAVIRT_SPINLOCKS)
#define _GEN_PV_LOCK_SLOWPATH

#undef  pv_enabled
#define pv_enabled()	true

#undef pv_init_node
#undef pv_wait_node
#undef pv_kick_node
#undef pv_wait_head_or_lock

#undef  queued_spin_lock_slowpath
#define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath

#include "qspinlock_paravirt.h"
#include "qspinlock.c"
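
/*
 * With _GEN_PV_LOCK_SLOWPATH now defined, the re-inclusion above skips
 * everything guarded by the #ifndef at the top of the file and compiles the
 * slowpath a second time, with pv_enabled() true and the pv_* hooks provided
 * by qspinlock_paravirt.h, producing __pv_queued_spin_lock_slowpath().
 */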

bool nopvspin __initdata;
static __init int parse_nopvspin(char *arg)
{
	nopvspin = true;
	return 0;
}
early_param("nopvspin", parse_nopvspin);
#endif
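
/*
 * Booting with "nopvspin" on the kernel command line sets the flag above;
 * hypervisor guest setup code (KVM and Xen guests, for instance) is expected
 * to check it and keep the native qspinlock instead of switching to the
 * paravirt variant.
 */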