From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Thu, 24 Apr 2025 16:41:25 +0000 (+0200)
Subject: selftests/bpf: Fix arena_spin_lock.c build dependency
X-Git-Tag: v6.16-rc1~131^2~55^2~2
X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=ddfd1f30b5badcd06c199fa519bec5f0f54892e0;p=linux-block.git

selftests/bpf: Fix arena_spin_lock.c build dependency

Changing bpf_arena_spin_lock.h does not lead to recompiling
arena_spin_lock.c. By convention, all BPF progs depend on all header
files in progs/, so move this header file there. There are no other
users besides arena_spin_lock.c.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Link: https://lore.kernel.org/r/20250424165525.154403-2-iii@linux.ibm.com
Signed-off-by: Alexei Starovoitov
---

diff --git a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h
deleted file mode 100644
index 4e29c31c4ef8..000000000000
--- a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h
+++ /dev/null
@@ -1,530 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
-#ifndef BPF_ARENA_SPIN_LOCK_H
-#define BPF_ARENA_SPIN_LOCK_H
-
-#include <vmlinux.h>
-#include <bpf/bpf_helpers.h>
-#include "bpf_atomic.h"
-
-#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
-#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)
-
-#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
-
-#define EBUSY 16
-#define EOPNOTSUPP 95
-#define ETIMEDOUT 110
-
-#ifndef __arena
-#define __arena __attribute__((address_space(1)))
-#endif
-
-extern unsigned long CONFIG_NR_CPUS __kconfig;
-
-/*
- * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
- * PowerPC overrides the definition to define lock->val as u32 instead of
- * atomic_t, leading to compilation errors. Import a local definition below so
- * that we don't depend on the vmlinux.h version.
- */ - -struct __qspinlock { - union { - atomic_t val; - struct { - u8 locked; - u8 pending; - }; - struct { - u16 locked_pending; - u16 tail; - }; - }; -}; - -#define arena_spinlock_t struct __qspinlock -/* FIXME: Using typedef causes CO-RE relocation error */ -/* typedef struct qspinlock arena_spinlock_t; */ - -struct arena_mcs_spinlock { - struct arena_mcs_spinlock __arena *next; - int locked; - int count; -}; - -struct arena_qnode { - struct arena_mcs_spinlock mcs; -}; - -#define _Q_MAX_NODES 4 -#define _Q_PENDING_LOOPS 1 - -/* - * Bitfields in the atomic value: - * - * 0- 7: locked byte - * 8: pending - * 9-15: not used - * 16-17: tail index - * 18-31: tail cpu (+1) - */ -#define _Q_MAX_CPUS 1024 - -#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ - << _Q_ ## type ## _OFFSET) -#define _Q_LOCKED_OFFSET 0 -#define _Q_LOCKED_BITS 8 -#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) - -#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) -#define _Q_PENDING_BITS 8 -#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) - -#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) -#define _Q_TAIL_IDX_BITS 2 -#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) - -#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) -#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) -#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) - -#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET -#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) - -#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) -#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) - -struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; - -static inline u32 encode_tail(int cpu, int idx) -{ - u32 tail; - - tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; - tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ - - return tail; -} - -static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) -{ - u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; - u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; - - return &qnodes[cpu][idx].mcs; -} - -static inline -struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) -{ - return &((struct arena_qnode __arena *)base + idx)->mcs; -} - -#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) - -/** - * xchg_tail - Put in the new queue tail code word & retrieve previous one - * @lock : Pointer to queued spinlock structure - * @tail : The new queue tail code word - * Return: The previous queue tail code word - * - * xchg(lock, tail) - * - * p,*,* -> n,*,* ; prev = xchg(lock, node) - */ -static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = (old & _Q_LOCKED_PENDING_MASK) | tail; - /* - * We can use relaxed semantics since the caller ensures that - * the MCS node is properly initialized before updating the - * tail. - */ - /* These loops are not expected to stall, but we still need to - * prove to the verifier they will terminate eventually. - */ - cond_break_label(out); - } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); - - return old; -out: - bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); - return old; -} - -/** - * clear_pending - clear the pending bit. 
- * @lock: Pointer to queued spinlock structure - * - * *,1,* -> *,0,* - */ -static __always_inline void clear_pending(arena_spinlock_t __arena *lock) -{ - WRITE_ONCE(lock->pending, 0); -} - -/** - * clear_pending_set_locked - take ownership and clear the pending bit. - * @lock: Pointer to queued spinlock structure - * - * *,1,0 -> *,0,1 - * - * Lock stealing is not allowed if this function is used. - */ -static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) -{ - WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); -} - -/** - * set_locked - Set the lock bit and own the lock - * @lock: Pointer to queued spinlock structure - * - * *,*,0 -> *,0,1 - */ -static __always_inline void set_locked(arena_spinlock_t __arena *lock) -{ - WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); -} - -static __always_inline -u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) -{ - u32 old, new; - - old = atomic_read(&lock->val); - do { - new = old | _Q_PENDING_VAL; - /* - * These loops are not expected to stall, but we still need to - * prove to the verifier they will terminate eventually. - */ - cond_break_label(out); - } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); - - return old; -out: - bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); - return old; -} - -/** - * arena_spin_trylock - try to acquire the queued spinlock - * @lock : Pointer to queued spinlock structure - * Return: 1 if lock acquired, 0 if failed - */ -static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) -{ - int val = atomic_read(&lock->val); - - if (unlikely(val)) - return 0; - - return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); -} - -__noinline -int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) -{ - struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; - int ret = -ETIMEDOUT; - u32 old, tail; - int idx; - - /* - * Wait for in-progress pending->locked hand-overs with a bounded - * number of spins so that we guarantee forward progress. - * - * 0,1,0 -> 0,0,1 - */ - if (val == _Q_PENDING_VAL) { - int cnt = _Q_PENDING_LOOPS; - val = atomic_cond_read_relaxed_label(&lock->val, - (VAL != _Q_PENDING_VAL) || !cnt--, - release_err); - } - - /* - * If we observe any contention; queue. - */ - if (val & ~_Q_LOCKED_MASK) - goto queue; - - /* - * trylock || pending - * - * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock - */ - val = arena_fetch_set_pending_acquire(lock); - - /* - * If we observe contention, there is a concurrent locker. - * - * Undo and queue; our setting of PENDING might have made the - * n,0,0 -> 0,0,0 transition fail and it will now be waiting - * on @next to become !NULL. - */ - if (unlikely(val & ~_Q_LOCKED_MASK)) { - - /* Undo PENDING if we set it. */ - if (!(val & _Q_PENDING_MASK)) - clear_pending(lock); - - goto queue; - } - - /* - * We're pending, wait for the owner to go away. - * - * 0,1,1 -> *,1,0 - * - * this wait loop must be a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because not all - * clear_pending_set_locked() implementations imply full - * barriers. - */ - if (val & _Q_LOCKED_MASK) - smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); - - /* - * take ownership and clear the pending bit. - * - * 0,1,0 -> 0,0,1 - */ - clear_pending_set_locked(lock); - return 0; - - /* - * End of pending bit optimistic spinning and beginning of MCS - * queuing. 
- */ -queue: - node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; - idx = node0->count++; - tail = encode_tail(bpf_get_smp_processor_id(), idx); - - /* - * 4 nodes are allocated based on the assumption that there will not be - * nested NMIs taking spinlocks. That may not be true in some - * architectures even though the chance of needing more than 4 nodes - * will still be extremely unlikely. When that happens, we simply return - * an error. Original qspinlock has a trylock fallback in this case. - */ - if (unlikely(idx >= _Q_MAX_NODES)) { - ret = -EBUSY; - goto release_node_err; - } - - node = grab_mcs_node(node0, idx); - - /* - * Ensure that we increment the head node->count before initialising - * the actual node. If the compiler is kind enough to reorder these - * stores, then an IRQ could overwrite our assignments. - */ - barrier(); - - node->locked = 0; - node->next = NULL; - - /* - * We touched a (possibly) cold cacheline in the per-cpu queue node; - * attempt the trylock once more in the hope someone let go while we - * weren't watching. - */ - if (arena_spin_trylock(lock)) - goto release; - - /* - * Ensure that the initialisation of @node is complete before we - * publish the updated tail via xchg_tail() and potentially link - * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. - */ - smp_wmb(); - - /* - * Publish the updated tail. - * We have already touched the queueing cacheline; don't bother with - * pending stuff. - * - * p,*,* -> n,*,* - */ - old = xchg_tail(lock, tail); - next = NULL; - - /* - * if there was a previous node; link it and wait until reaching the - * head of the waitqueue. - */ - if (old & _Q_TAIL_MASK) { - prev = decode_tail(old); - - /* Link @node into the waitqueue. */ - WRITE_ONCE(prev->next, node); - - arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); - - /* - * While waiting for the MCS lock, the next pointer may have - * been set by another lock waiter. We cannot prefetch here - * due to lack of equivalent instruction in BPF ISA. - */ - next = READ_ONCE(node->next); - } - - /* - * we're at the head of the waitqueue, wait for the owner & pending to - * go away. - * - * *,x,y -> *,0,0 - * - * this wait loop must use a load-acquire such that we match the - * store-release that clears the locked bit and create lock - * sequentiality; this is because the set_locked() function below - * does not imply a full barrier. - */ - val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), - release_node_err); - - /* - * claim the lock: - * - * n,0,0 -> 0,0,1 : lock, uncontended - * *,*,0 -> *,*,1 : lock, contended - * - * If the queue head is the only one in the queue (lock value == tail) - * and nobody is pending, clear the tail code and grab the lock. - * Otherwise, we only need to grab the lock. - */ - - /* - * In the PV case we might already have _Q_LOCKED_VAL set, because - * of lock stealing; therefore we must also allow: - * - * n,0,1 -> 0,0,1 - * - * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the - * above wait condition, therefore any concurrent setting of - * PENDING will make the uncontended transition fail. - */ - if ((val & _Q_TAIL_MASK) == tail) { - if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) - goto release; /* No contention */ - } - - /* - * Either somebody is queued behind us or _Q_PENDING_VAL got set - * which will then detect the remaining tail and queue behind us - * ensuring we'll see a @next. 
- */ - set_locked(lock); - - /* - * contended path; wait for next if not observed yet, release. - */ - if (!next) - next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); - - arch_mcs_spin_unlock_contended(&next->locked); - -release:; - /* - * release the node - * - * Doing a normal dec vs this_cpu_dec is fine. An upper context always - * decrements count it incremented before returning, thus we're fine. - * For contexts interrupting us, they either observe our dec or not. - * Just ensure the compiler doesn't reorder this statement, as a - * this_cpu_dec implicitly implied that. - */ - barrier(); - node0->count--; - return 0; -release_node_err: - barrier(); - node0->count--; - goto release_err; -release_err: - return ret; -} - -/** - * arena_spin_lock - acquire a queued spinlock - * @lock: Pointer to queued spinlock structure - * - * On error, returned value will be negative. - * On success, zero is returned. - * - * The return value _must_ be tested against zero for success, - * instead of checking it against negative, for passing the - * BPF verifier. - * - * The user should do: - * if (arena_spin_lock(...) != 0) // failure - * or - * if (arena_spin_lock(...) == 0) // success - * or - * if (arena_spin_lock(...)) // failure - * or - * if (!arena_spin_lock(...)) // success - * instead of: - * if (arena_spin_lock(...) < 0) // failure - * - * The return value can still be inspected later. - */ -static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) -{ - int val = 0; - - if (CONFIG_NR_CPUS > 1024) - return -EOPNOTSUPP; - - bpf_preempt_disable(); - if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) - return 0; - - val = arena_spin_lock_slowpath(lock, val); - /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ - if (val) - bpf_preempt_enable(); - return val; -} - -/** - * arena_spin_unlock - release a queued spinlock - * @lock : Pointer to queued spinlock structure - */ -static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) -{ - /* - * unlock() needs release semantics: - */ - smp_store_release(&lock->locked, 0); - bpf_preempt_enable(); -} - -#define arena_spin_lock_irqsave(lock, flags) \ - ({ \ - int __ret; \ - bpf_local_irq_save(&(flags)); \ - __ret = arena_spin_lock((lock)); \ - if (__ret) \ - bpf_local_irq_restore(&(flags)); \ - (__ret); \ - }) - -#define arena_spin_unlock_irqrestore(lock, flags) \ - ({ \ - arena_spin_unlock((lock)); \ - bpf_local_irq_restore(&(flags)); \ - }) - -#endif - -#endif /* BPF_ARENA_SPIN_LOCK_H */ diff --git a/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h new file mode 100644 index 000000000000..4e29c31c4ef8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_arena_spin_lock.h @@ -0,0 +1,530 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. 
*/ +#ifndef BPF_ARENA_SPIN_LOCK_H +#define BPF_ARENA_SPIN_LOCK_H + +#include +#include +#include "bpf_atomic.h" + +#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) +#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) + +#define EBUSY 16 +#define EOPNOTSUPP 95 +#define ETIMEDOUT 110 + +#ifndef __arena +#define __arena __attribute__((address_space(1))) +#endif + +extern unsigned long CONFIG_NR_CPUS __kconfig; + +/* + * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but + * PowerPC overrides the definition to define lock->val as u32 instead of + * atomic_t, leading to compilation errors. Import a local definition below so + * that we don't depend on the vmlinux.h version. + */ + +struct __qspinlock { + union { + atomic_t val; + struct { + u8 locked; + u8 pending; + }; + struct { + u16 locked_pending; + u16 tail; + }; + }; +}; + +#define arena_spinlock_t struct __qspinlock +/* FIXME: Using typedef causes CO-RE relocation error */ +/* typedef struct qspinlock arena_spinlock_t; */ + +struct arena_mcs_spinlock { + struct arena_mcs_spinlock __arena *next; + int locked; + int count; +}; + +struct arena_qnode { + struct arena_mcs_spinlock mcs; +}; + +#define _Q_MAX_NODES 4 +#define _Q_PENDING_LOOPS 1 + +/* + * Bitfields in the atomic value: + * + * 0- 7: locked byte + * 8: pending + * 9-15: not used + * 16-17: tail index + * 18-31: tail cpu (+1) + */ +#define _Q_MAX_CPUS 1024 + +#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ + << _Q_ ## type ## _OFFSET) +#define _Q_LOCKED_OFFSET 0 +#define _Q_LOCKED_BITS 8 +#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) + +#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) +#define _Q_PENDING_BITS 8 +#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) + +#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) +#define _Q_TAIL_IDX_BITS 2 +#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) + +#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) +#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) +#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) + +#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) + +#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) +#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) + +struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; + +static inline u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) +{ + u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return &qnodes[cpu][idx].mcs; +} + +static inline +struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) +{ + return &((struct arena_qnode __arena *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) +{ + u32 old, new; + + old = 
atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. + */ + /* These loops are not expected to stall, but we still need to + * prove to the verifier they will terminate eventually. + */ + cond_break_label(out); + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +out: + bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); + return old; +} + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +static __always_inline +u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = old | _Q_PENDING_VAL; + /* + * These loops are not expected to stall, but we still need to + * prove to the verifier they will terminate eventually. + */ + cond_break_label(out); + } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); + + return old; +out: + bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); + return old; +} + +/** + * arena_spin_trylock - try to acquire the queued spinlock + * @lock : Pointer to queued spinlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) +{ + int val = atomic_read(&lock->val); + + if (unlikely(val)) + return 0; + + return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); +} + +__noinline +int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) +{ + struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; + int ret = -ETIMEDOUT; + u32 old, tail; + int idx; + + /* + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed_label(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--, + release_err); + } + + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* + * trylock || pending + * + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock + */ + val = arena_fetch_set_pending_acquire(lock); + + /* + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. + */ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + + goto queue; + } + + /* + * We're pending, wait for the owner to go away. 
+ * + * 0,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. + */ + if (val & _Q_LOCKED_MASK) + smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + + /* + * take ownership and clear the pending bit. + * + * 0,1,0 -> 0,0,1 + */ + clear_pending_set_locked(lock); + return 0; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. + */ +queue: + node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; + idx = node0->count++; + tail = encode_tail(bpf_get_smp_processor_id(), idx); + + /* + * 4 nodes are allocated based on the assumption that there will not be + * nested NMIs taking spinlocks. That may not be true in some + * architectures even though the chance of needing more than 4 nodes + * will still be extremely unlikely. When that happens, we simply return + * an error. Original qspinlock has a trylock fallback in this case. + */ + if (unlikely(idx >= _Q_MAX_NODES)) { + ret = -EBUSY; + goto release_node_err; + } + + node = grab_mcs_node(node0, idx); + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + + node->locked = 0; + node->next = NULL; + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (arena_spin_trylock(lock)) + goto release; + + /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. + * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + next = NULL; + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + prev = decode_tail(old); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We cannot prefetch here + * due to lack of equivalent instruction in BPF ISA. + */ + next = READ_ONCE(node->next); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + */ + val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), + release_node_err); + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,*,0 -> *,*,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. 
+ */ + + /* + * In the PV case we might already have _Q_LOCKED_VAL set, because + * of lock stealing; therefore we must also allow: + * + * n,0,1 -> 0,0,1 + * + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. + */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + } + + /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ + set_locked(lock); + + /* + * contended path; wait for next if not observed yet, release. + */ + if (!next) + next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); + + arch_mcs_spin_unlock_contended(&next->locked); + +release:; + /* + * release the node + * + * Doing a normal dec vs this_cpu_dec is fine. An upper context always + * decrements count it incremented before returning, thus we're fine. + * For contexts interrupting us, they either observe our dec or not. + * Just ensure the compiler doesn't reorder this statement, as a + * this_cpu_dec implicitly implied that. + */ + barrier(); + node0->count--; + return 0; +release_node_err: + barrier(); + node0->count--; + goto release_err; +release_err: + return ret; +} + +/** + * arena_spin_lock - acquire a queued spinlock + * @lock: Pointer to queued spinlock structure + * + * On error, returned value will be negative. + * On success, zero is returned. + * + * The return value _must_ be tested against zero for success, + * instead of checking it against negative, for passing the + * BPF verifier. + * + * The user should do: + * if (arena_spin_lock(...) != 0) // failure + * or + * if (arena_spin_lock(...) == 0) // success + * or + * if (arena_spin_lock(...)) // failure + * or + * if (!arena_spin_lock(...)) // success + * instead of: + * if (arena_spin_lock(...) < 0) // failure + * + * The return value can still be inspected later. + */ +static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) +{ + int val = 0; + + if (CONFIG_NR_CPUS > 1024) + return -EOPNOTSUPP; + + bpf_preempt_disable(); + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) + return 0; + + val = arena_spin_lock_slowpath(lock, val); + /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ + if (val) + bpf_preempt_enable(); + return val; +} + +/** + * arena_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + */ +static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) +{ + /* + * unlock() needs release semantics: + */ + smp_store_release(&lock->locked, 0); + bpf_preempt_enable(); +} + +#define arena_spin_lock_irqsave(lock, flags) \ + ({ \ + int __ret; \ + bpf_local_irq_save(&(flags)); \ + __ret = arena_spin_lock((lock)); \ + if (__ret) \ + bpf_local_irq_restore(&(flags)); \ + (__ret); \ + }) + +#define arena_spin_unlock_irqrestore(lock, flags) \ + ({ \ + arena_spin_unlock((lock)); \ + bpf_local_irq_restore(&(flags)); \ + }) + +#endif + +#endif /* BPF_ARENA_SPIN_LOCK_H */
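
Not part of the patch itself: purely as illustration, below is a minimal sketch of how a BPF program under progs/ might use this header once it lives there. The arena map definition, the "tc" section name, the extra selftest includes (bpf_misc.h, bpf_arena_common.h) and all symbol names are assumptions made for this sketch, not content of the real arena_spin_lock.c test; only the arena_spin_lock_irqsave()/arena_spin_unlock_irqrestore() API comes from the header above.

// SPDX-License-Identifier: GPL-2.0
/* Illustrative sketch only: names, includes and map layout are assumptions. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_misc.h"
#include "bpf_arena_common.h"
#include "bpf_arena_spin_lock.h"

/* A single arena map so that __arena globals have somewhere to live;
 * sizing and placement details are elided here.
 */
struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 1); /* number of arena pages */
} arena SEC(".maps");

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
arena_spinlock_t __arena demo_lock;
int counter;

SEC("tc")
int arena_spin_lock_demo(void *ctx)
{
	unsigned long flags;

	/* Per the arena_spin_lock() comment, test the result against zero:
	 * a non-zero return means the lock was not taken and must not be
	 * released.
	 */
	if (arena_spin_lock_irqsave(&demo_lock, flags))
		return 1;
	counter++;
	arena_spin_unlock_irqrestore(&demo_lock, flags);
	return 0;
}
#endif

char _license[] SEC("license") = "GPL";

With the header in progs/, editing bpf_arena_spin_lock.h now triggers a rebuild of any such program, which is the point of this patch.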