From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Thu, 24 Apr 2025 16:41:25 +0000 (+0200)
Subject: selftests/bpf: Fix arena_spin_lock.c build dependency
X-Git-Tag: v6.16-rc1~131^2~55^2~2
X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=ddfd1f30b5badcd06c199fa519bec5f0f54892e0;p=linux-block.git

selftests/bpf: Fix arena_spin_lock.c build dependency

Changing bpf_arena_spin_lock.h does not lead to recompiling
arena_spin_lock.c. By convention, all BPF progs depend on all header
files in progs/, so move this header file there. There are no other
users besides arena_spin_lock.c.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20250424165525.154403-2-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov
---

diff --git a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h
deleted file mode 100644
index 4e29c31c4ef8..000000000000
--- a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h
+++ /dev/null
@@ -1,530 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
-#ifndef BPF_ARENA_SPIN_LOCK_H
-#define BPF_ARENA_SPIN_LOCK_H
-
-#include <vmlinux.h>
-#include <bpf/bpf_helpers.h>
-#include "bpf_atomic.h"
-
-#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
-#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)
-
-#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
-
-#define EBUSY 16
-#define EOPNOTSUPP 95
-#define ETIMEDOUT 110
-
-#ifndef __arena
-#define __arena __attribute__((address_space(1)))
-#endif
-
-extern unsigned long CONFIG_NR_CPUS __kconfig;
-
-/*
- * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
- * PowerPC overrides the definition to define lock->val as u32 instead of
- * atomic_t, leading to compilation errors. Import a local definition below so
- * that we don't depend on the vmlinux.h version.
- */ - -struct __qspinlock { - union { - atomic_t val; - struct { - u8 locked; - u8 pending; - }; - struct { - u16 locked_pending; - u16 tail; - }; - }; -}; - -#define arena_spinlock_t struct __qspinlock -/* FIXME: Using typedef causes CO-RE relocation error */ -/* typedef struct qspinlock arena_spinlock_t; */ - -struct arena_mcs_spinlock { - struct arena_mcs_spinlock __arena *next; - int locked; - int count; -}; - -struct arena_qnode { - struct arena_mcs_spinlock mcs; -}; - -#define _Q_MAX_NODES 4 -#define _Q_PENDING_LOOPS 1 - -/* - * Bitfields in the atomic value: - * - * 0- 7: locked byte - * 8: pending - * 9-15: not used - * 16-17: tail index - * 18-31: tail cpu (+1) - */ -#define _Q_MAX_CPUS 1024 - -#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ - << _Q_ ## type ## _OFFSET) -#define _Q_LOCKED_OFFSET 0 -#define _Q_LOCKED_BITS 8 -#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) - -#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) -#define _Q_PENDING_BITS 8 -#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) - -#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) -#define _Q_TAIL_IDX_BITS 2 -#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) - -#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) -#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) -#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) - -#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET -#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) - -#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) -#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) - -struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; - -static inline u32 encode_tail(int cpu, int idx) -{ - u32 tail; - - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ - - return tail; -} - -static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) -{ - u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; - u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - - return &qnodes[cpu][idx].mcs; -} - -static inline -struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) -{ - return &((struct arena_qnode __arena *)base + idx)->mcs; -} - -#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) - -/** - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail) - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = (old & _Q_LOCKED_PENDING_MASK) | tail; - /* - * We can use relaxed semantics since the caller ensures that - * the MCS node is properly initialized before updating the - * tail. - */ - /* These loops are not expected to stall, but we still need to - * prove to the verifier they will terminate eventually. - */ - cond_break_label(out); - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); - - return old; -out: - bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); - return old; -} - -/** - * clear_pending - clear the pending bit. 
- * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(arena_spinlock_t __arena *lock) -{ - WRITE_ONCE(lock->pending, 0); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - * - * Lock stealing is not allowed if this function is used. - */ -static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) -{ - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); -} - -/** - * set_locked - Set the lock bit and own the lock - * @lock: Pointer to queued spinlock structure - * - * *,*,0 -> *,0,1 - */ -static __always_inline void set_locked(arena_spinlock_t __arena *lock) -{ - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); -} - -static __always_inline -u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = old | _Q_PENDING_VAL; - /* - * These loops are not expected to stall, but we still need to - * prove to the verifier they will terminate eventually. - */ - cond_break_label(out); - } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); - - return old; -out: - bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); - return old; -} - -/** - * arena_spin_trylock - try to acquire the queued spinlock - * @lock : Pointer to queued spinlock structure - * Return: 1 if lock acquired, 0 if failed - */ -static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) -{ - int val = atomic_read(&lock->val); - - if (unlikely(val)) - return 0; - - return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); -} - -__noinline -int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) -{ - struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; - int ret = -ETIMEDOUT; - u32 old, tail; - int idx; - - /* - * Wait for in-progress pending->locked hand-overs with a bounded - * number of spins so that we guarantee forward progress. - * - * 0,1,0 -> 0,0,1 - */ - if (val == _Q_PENDING_VAL) { - int cnt = _Q_PENDING_LOOPS; - val = atomic_cond_read_relaxed_label(&lock->val, - (VAL != _Q_PENDING_VAL) || !cnt--, - release_err); - } - - /* - * If we observe any contention; queue. - */ - if (val & ~_Q_LOCKED_MASK) - goto queue; - - /* - * trylock || pending - * - * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock - */ - val = arena_fetch_set_pending_acquire(lock); - - /* - * If we observe contention, there is a concurrent locker. - * - * Undo and queue; our setting of PENDING might have made the - * n,0,0 -> 0,0,0 transition fail and it will now be waiting - * on @next to become !NULL. - */ - if (unlikely(val & ~_Q_LOCKED_MASK)) { - - /* Undo PENDING if we set it. */ - if (!(val & _Q_PENDING_MASK)) - clear_pending(lock); - - goto queue; - } - - /* - * We're pending, wait for the owner to go away. - * - * 0,1,1 -> *,1,0 - * - * this wait loop must be a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because not all - * clear_pending_set_locked() implementations imply full - * barriers. - */ - if (val & _Q_LOCKED_MASK) - smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); - - /* - * take ownership and clear the pending bit. - * - * 0,1,0 -> 0,0,1 - */ - clear_pending_set_locked(lock); - return 0; - - /* - * End of pending bit optimistic spinning and beginning of MCS - * queuing. 
- */ -queue: - node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; - idx = node0->count++; - tail = encode_tail(bpf_get_smp_processor_id(), idx); - - /* - * 4 nodes are allocated based on the assumption that there will not be - * nested NMIs taking spinlocks. That may not be true in some - * architectures even though the chance of needing more than 4 nodes - * will still be extremely unlikely. When that happens, we simply return - * an error. Original qspinlock has a trylock fallback in this case. - */ - if (unlikely(idx >= _Q_MAX_NODES)) { - ret = -EBUSY; - goto release_node_err; - } - - node = grab_mcs_node(node0, idx); - - /* - * Ensure that we increment the head node->count before initialising - * the actual node. If the compiler is kind enough to reorder these - * stores, then an IRQ could overwrite our assignments. - */ - barrier(); - - node->locked = 0; - node->next = NULL; - - /* - * We touched a (possibly) cold cacheline in the per-cpu queue node; - * attempt the trylock once more in the hope someone let go while we - * weren't watching. - */ - if (arena_spin_trylock(lock)) - goto release; - - /* - * Ensure that the initialisation of @node is complete before we - * publish the updated tail via xchg_tail() and potentially link - * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. - */ - smp_wmb(); - - /* - * Publish the updated tail. - * We have already touched the queueing cacheline; don't bother with - * pending stuff. - * - * p,*,* -> n,*,* - */ - old = xchg_tail(lock, tail); - next = NULL; - - /* - * if there was a previous node; link it and wait until reaching the - * head of the waitqueue. - */ - if (old & _Q_TAIL_MASK) { - prev = decode_tail(old); - - /* Link @node into the waitqueue. */ - WRITE_ONCE(prev->next, node); - - arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); - - /* - * While waiting for the MCS lock, the next pointer may have - * been set by another lock waiter. We cannot prefetch here - * due to lack of equivalent instruction in BPF ISA. - */ - next = READ_ONCE(node->next); - } - - /* - * we're at the head of the waitqueue, wait for the owner & pending to - * go away. - * - * *,x,y -> *,0,0 - * - * this wait loop must use a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because the set_locked() function below - * does not imply a full barrier. - */ - val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), - release_node_err); - - /* - * claim the lock: - * - * n,0,0 -> 0,0,1 : lock, uncontended - * *,*,0 -> *,*,1 : lock, contended - * - * If the queue head is the only one in the queue (lock value == tail) - * and nobody is pending, clear the tail code and grab the lock. - * Otherwise, we only need to grab the lock. - */ - - /* - * In the PV case we might already have _Q_LOCKED_VAL set, because - * of lock stealing; therefore we must also allow: - * - * n,0,1 -> 0,0,1 - * - * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the - * above wait condition, therefore any concurrent setting of - * PENDING will make the uncontended transition fail. - */ - if ((val & _Q_TAIL_MASK) == tail) { - if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) - goto release; /* No contention */ - } - - /* - * Either somebody is queued behind us or _Q_PENDING_VAL got set - * which will then detect the remaining tail and queue behind us - * ensuring we'll see a @next. 
- */ - set_locked(lock); - - /* - * contended path; wait for next if not observed yet, release. - */ - if (!next) - next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); - - arch_mcs_spin_unlock_contended(&next->locked); - -release:; - /* - * release the node - * - * Doing a normal dec vs this_cpu_dec is fine. An upper context always - * decrements count it incremented before returning, thus we're fine. - * For contexts interrupting us, they either observe our dec or not. - * Just ensure the compiler doesn't reorder this statement, as a - * this_cpu_dec implicitly implied that. - */ - barrier(); - node0->count--; - return 0; -release_node_err: - barrier(); - node0->count--; - goto release_err; -release_err: - return ret; -} - -/** - * arena_spin_lock - acquire a queued spinlock - * @lock: Pointer to queued spinlock structure - * - * On error, returned value will be negative. - * On success, zero is returned. - * - * The return value _must_ be tested against zero for success, - * instead of checking it against negative, for passing the - * BPF verifier. - * - * The user should do: - * if (arena_spin_lock(...) != 0) // failure - * or - * if (arena_spin_lock(...) == 0) // success - * or - * if (arena_spin_lock(...)) // failure - * or - * if (!arena_spin_lock(...)) // success - * instead of: - * if (arena_spin_lock(...) < 0) // failure - * - * The return value can still be inspected later. - */ -static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) -{ - int val = 0; - - if (CONFIG_NR_CPUS > 1024) - return -EOPNOTSUPP; - - bpf_preempt_disable(); - if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) - return 0; - - val = arena_spin_lock_slowpath(lock, val); - /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ - if (val) - bpf_preempt_enable(); - return val; -} - -/** - * arena_spin_unlock - release a queued spinlock - * @lock : Pointer to queued spinlock structure - */ -static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) -{ - /* - * unlock() needs release semantics: - */ - smp_store_release(&lock->locked, 0); - bpf_preempt_enable(); -} - -#define arena_spin_lock_irqsave(lock, flags) \ - ({ \ - int __ret; \ - bpf_local_irq_save(&(flags)); \ - __ret = arena_spin_lock((lock)); \ - if (__ret) \ - bpf_local_irq_restore(&(flags)); \ - (__ret); \ - }) - -#define arena_spin_unlock_irqrestore(lock, flags) \ - ({ \ - arena_spin_unlock((lock)); \ - bpf_local_irq_restore(&(flags)); \ - }) - -#endif - -#endif /* BPF_ARENA_SPIN_LOCK_H */ diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h new file mode 100644 index 000000000000..4e29c31c4ef8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#ifndef BPF_ARENA_SPIN_LOCK_H +#define BPF_ARENA_SPIN_LOCK_H + +#include +#include +#include "bpf_atomic.h" + +#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) +#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) + +#define EBUSY 16 +#define EOPNOTSUPP 95 +#define ETIMEDOUT 110 + +#ifndef __arena +#define __arena __attribute__((address_space(1))) +#endif + +extern unsigned long CONFIG_NR_CPUS __kconfig; + +/* + * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but + * PowerPC overrides the definition to define lock->val as u32 instead of + * atomic_t, leading to compilation errors. Import a local definition below so + * that we don't depend on the vmlinux.h version. + */ + +struct __qspinlock { + union { + atomic_t val; + struct { + u8 locked; + u8 pending; + }; + struct { + u16 locked_pending; + u16 tail; + }; + }; +}; + +#define arena_spinlock_t struct __qspinlock +/* FIXME: Using typedef causes CO-RE relocation error */ +/* typedef struct qspinlock arena_spinlock_t; */ + +struct arena_mcs_spinlock { + struct arena_mcs_spinlock __arena *next; + int locked; + int count; +}; + +struct arena_qnode { + struct arena_mcs_spinlock mcs; +}; + +#define _Q_MAX_NODES 4 +#define _Q_PENDING_LOOPS 1 + +/* + * Bitfields in the atomic value: + * + * 0- 7: locked byte + * 8: pending + * 9-15: not used + * 16-17: tail index + * 18-31: tail cpu (+1) + */ +#define _Q_MAX_CPUS 1024 + +#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ + << _Q_ ## type ## _OFFSET) +#define _Q_LOCKED_OFFSET 0 +#define _Q_LOCKED_BITS 8 +#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) + +#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) +#define _Q_PENDING_BITS 8 +#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) + +#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) +#define _Q_TAIL_IDX_BITS 2 +#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) + +#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) +#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) +#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) + +#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) + +#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) +#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) + +struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; + +static inline u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) +{ + u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return &qnodes[cpu][idx].mcs; +} + +static inline +struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) +{ + return &((struct arena_qnode __arena *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) +{ + u32 old, new; + + old = 
atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. + */ + /* These loops are not expected to stall, but we still need to + * prove to the verifier they will terminate eventually. + */ + cond_break_label(out); + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +out: + bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); + return old; +} + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +static __always_inline +u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = old | _Q_PENDING_VAL; + /* + * These loops are not expected to stall, but we still need to + * prove to the verifier they will terminate eventually. + */ + cond_break_label(out); + } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); + + return old; +out: + bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); + return old; +} + +/** + * arena_spin_trylock - try to acquire the queued spinlock + * @lock : Pointer to queued spinlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) +{ + int val = atomic_read(&lock->val); + + if (unlikely(val)) + return 0; + + return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); +} + +__noinline +int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) +{ + struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; + int ret = -ETIMEDOUT; + u32 old, tail; + int idx; + + /* + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed_label(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--, + release_err); + } + + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* + * trylock || pending + * + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock + */ + val = arena_fetch_set_pending_acquire(lock); + + /* + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. + */ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + + goto queue; + } + + /* + * We're pending, wait for the owner to go away. 
+ * + * 0,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. + */ + if (val & _Q_LOCKED_MASK) + smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + + /* + * take ownership and clear the pending bit. + * + * 0,1,0 -> 0,0,1 + */ + clear_pending_set_locked(lock); + return 0; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. + */ +queue: + node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; + idx = node0->count++; + tail = encode_tail(bpf_get_smp_processor_id(), idx); + + /* + * 4 nodes are allocated based on the assumption that there will not be + * nested NMIs taking spinlocks. That may not be true in some + * architectures even though the chance of needing more than 4 nodes + * will still be extremely unlikely. When that happens, we simply return + * an error. Original qspinlock has a trylock fallback in this case. + */ + if (unlikely(idx >= _Q_MAX_NODES)) { + ret = -EBUSY; + goto release_node_err; + } + + node = grab_mcs_node(node0, idx); + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + + node->locked = 0; + node->next = NULL; + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (arena_spin_trylock(lock)) + goto release; + + /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. + * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + next = NULL; + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + prev = decode_tail(old); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We cannot prefetch here + * due to lack of equivalent instruction in BPF ISA. + */ + next = READ_ONCE(node->next); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + */ + val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), + release_node_err); + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,*,0 -> *,*,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. 
+ */ + + /* + * In the PV case we might already have _Q_LOCKED_VAL set, because + * of lock stealing; therefore we must also allow: + * + * n,0,1 -> 0,0,1 + * + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. + */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + } + + /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ + set_locked(lock); + + /* + * contended path; wait for next if not observed yet, release. + */ + if (!next) + next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); + + arch_mcs_spin_unlock_contended(&next->locked); + +release:; + /* + * release the node + * + * Doing a normal dec vs this_cpu_dec is fine. An upper context always + * decrements count it incremented before returning, thus we're fine. + * For contexts interrupting us, they either observe our dec or not. + * Just ensure the compiler doesn't reorder this statement, as a + * this_cpu_dec implicitly implied that. + */ + barrier(); + node0->count--; + return 0; +release_node_err: + barrier(); + node0->count--; + goto release_err; +release_err: + return ret; +} + +/** + * arena_spin_lock - acquire a queued spinlock + * @lock: Pointer to queued spinlock structure + * + * On error, returned value will be negative. + * On success, zero is returned. + * + * The return value _must_ be tested against zero for success, + * instead of checking it against negative, for passing the + * BPF verifier. + * + * The user should do: + * if (arena_spin_lock(...) != 0) // failure + * or + * if (arena_spin_lock(...) == 0) // success + * or + * if (arena_spin_lock(...)) // failure + * or + * if (!arena_spin_lock(...)) // success + * instead of: + * if (arena_spin_lock(...) < 0) // failure + * + * The return value can still be inspected later. + */ +static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) +{ + int val = 0; + + if (CONFIG_NR_CPUS > 1024) + return -EOPNOTSUPP; + + bpf_preempt_disable(); + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) + return 0; + + val = arena_spin_lock_slowpath(lock, val); + /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ + if (val) + bpf_preempt_enable(); + return val; +} + +/** + * arena_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + */ +static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) +{ + /* + * unlock() needs release semantics: + */ + smp_store_release(&lock->locked, 0); + bpf_preempt_enable(); +} + +#define arena_spin_lock_irqsave(lock, flags) \ + ({ \ + int __ret; \ + bpf_local_irq_save(&(flags)); \ + __ret = arena_spin_lock((lock)); \ + if (__ret) \ + bpf_local_irq_restore(&(flags)); \ + (__ret); \ + }) + +#define arena_spin_unlock_irqrestore(lock, flags) \ + ({ \ + arena_spin_unlock((lock)); \ + bpf_local_irq_restore(&(flags)); \ + }) + +#endif + +#endif /* BPF_ARENA_SPIN_LOCK_H */
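
Not part of the patch itself: purely as illustration, below is a minimal sketch of how a BPF program under progs/ might use this header once it lives there. The arena map definition, the "tc" section name, the extra selftest includes (bpf_misc.h, bpf_arena_common.h) and all symbol names are assumptions made for this sketch, not content of the real arena_spin_lock.c test; only the arena_spin_lock_irqsave()/arena_spin_unlock_irqrestore() API comes from the header above.

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only: names, includes and map layout are assumptions. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
#include "bpf_arena_common.h"
#include "bpf_arena_spin_lock.h"

/* A single arena map so that __arena globals have somewhere to live;
 * sizing and placement details are elided here.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 1); /* number of arena pages */
} arena SEC(".maps");

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
arena_spinlock_t __arena demo_lock;
int counter;

SEC("tc")
int arena_spin_lock_demo(void *ctx)
{
	unsigned long flags;

	/* Per the arena_spin_lock() comment, test the result against zero:
	 * a non-zero return means the lock was not taken and must not be
	 * released.
	 */
	if (arena_spin_lock_irqsave(&demo_lock, flags))
		return 1;
	counter++;
	arena_spin_unlock_irqrestore(&demo_lock, flags);
	return 0;
}
#endif

char _license[] SEC("license") = "GPL";

With the header in progs/, editing bpf_arena_spin_lock.h now triggers a rebuild of any such program, which is the point of this patch.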