rseq/selftests: Provide parametrized tests
author:    Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
           Sat, 2 Jun 2018 12:44:07 +0000 (08:44 -0400)
committer: Thomas Gleixner <tglx@linutronix.de>
           Wed, 6 Jun 2018 09:58:35 +0000 (11:58 +0200)
"param_test" is a parametrizable restartable sequences test. See
the "--help" output for usage.

"param_test_benchmark" is the same as "param_test", but it removes
testing book-keeping code to allow accurate benchmarks.

"param_test_compare_twice" is the same as "param_test", but it performs
each comparison within the rseq critical section twice, thus validating
invariants. If any of the second comparisons fails, an error message
is printed and the test aborts.
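
For example (based on the option parsing below), the following runs
the per-cpu list test with 16 threads and 10000 repetitions per
thread, with verbose output:

  ./param_test -T l -t 16 -r 10000 -v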

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Watson <davejwatson@fb.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: linux-kselftest@vger.kernel.org
Cc: "H . Peter Anvin" <hpa@zytor.com>
Cc: Chris Lameter <cl@linux.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Andrew Hunter <ahh@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: "Paul E . McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ben Maurer <bmaurer@fb.com>
Cc: linux-api@vger.kernel.org
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lkml.kernel.org/r/20180602124408.8430-16-mathieu.desnoyers@efficios.com
tools/testing/selftests/rseq/param_test.c [new file with mode: 0644]

diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
new file mode 100644 (file)
index 0000000..6a9f602
--- /dev/null
@@ -0,0 +1,1260 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <errno.h>
+#include <stddef.h>
+
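+/*
+ * glibc did not yet provide a gettid() wrapper when this test was
+ * written, so invoke the syscall directly.
+ */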
+static inline pid_t gettid(void)
+{
+       return syscall(__NR_gettid);
+}
+
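+/*
+ * Delay-injection loop counts, set from the -1 .. -9 command line
+ * options. Injection points 1-6 are consumed by the inline assembly
+ * snippets (RSEQ_INJECT_ASM), points 7-9 by C code (RSEQ_INJECT_C).
+ */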
+#define NR_INJECT      9
+static int loop_cnt[NR_INJECT + 1];
+
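+/*
+ * The x86 injection asm references these copies by symbol name
+ * (asm_loop_cnt_<n>) rather than through asm input operands, hence
+ * the asm labels and the "used" attribute.
+ */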
+static int loop_cnt_1 asm("asm_loop_cnt_1") __attribute__((used));
+static int loop_cnt_2 asm("asm_loop_cnt_2") __attribute__((used));
+static int loop_cnt_3 asm("asm_loop_cnt_3") __attribute__((used));
+static int loop_cnt_4 asm("asm_loop_cnt_4") __attribute__((used));
+static int loop_cnt_5 asm("asm_loop_cnt_5") __attribute__((used));
+static int loop_cnt_6 asm("asm_loop_cnt_6") __attribute__((used));
+
+static int opt_modulo, verbose;
+
+static int opt_yield, opt_signal, opt_sleep,
+               opt_disable_rseq, opt_threads = 200,
+               opt_disable_mod = 0, opt_test = 's', opt_mb = 0;
+
+#ifndef RSEQ_SKIP_FASTPATH
+static long long opt_reps = 5000;
+#else
+static long long opt_reps = 100;
+#endif
+
+static __thread __attribute__((tls_model("initial-exec")))
+unsigned int signals_delivered;
+
+#ifndef BENCHMARK
+
+static __thread __attribute__((tls_model("initial-exec"), unused))
+unsigned int yield_mod_cnt, nr_abort;
+
+#define printf_verbose(fmt, ...)                       \
+       do {                                            \
+               if (verbose)                            \
+                       printf(fmt, ## __VA_ARGS__);    \
+       } while (0)
+
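+/*
+ * Arch-specific delay injection: busy-loop for the configured count
+ * in a scratch register. These snippets are expanded inside the rseq
+ * critical sections defined by the rseq-<arch>.h headers.
+ */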
+#if defined(__x86_64__) || defined(__i386__)
+
+#define INJECT_ASM_REG "eax"
+
+#define RSEQ_INJECT_CLOBBER \
+       , INJECT_ASM_REG
+
+#ifdef __i386__
+
+#define RSEQ_INJECT_ASM(n) \
+       "mov asm_loop_cnt_" #n ", %%" INJECT_ASM_REG "\n\t" \
+       "test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+       "jz 333f\n\t" \
+       "222:\n\t" \
+       "dec %%" INJECT_ASM_REG "\n\t" \
+       "jnz 222b\n\t" \
+       "333:\n\t"
+
+#elif defined(__x86_64__)
+
+#define RSEQ_INJECT_ASM(n) \
+       "lea asm_loop_cnt_" #n "(%%rip), %%" INJECT_ASM_REG "\n\t" \
+       "mov (%%" INJECT_ASM_REG "), %%" INJECT_ASM_REG "\n\t" \
+       "test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+       "jz 333f\n\t" \
+       "222:\n\t" \
+       "dec %%" INJECT_ASM_REG "\n\t" \
+       "jnz 222b\n\t" \
+       "333:\n\t"
+
+#else
+#error "Unsupported architecture"
+#endif
+
+#elif defined(__ARMEL__)
+
+#define RSEQ_INJECT_INPUT \
+       , [loop_cnt_1]"m"(loop_cnt[1]) \
+       , [loop_cnt_2]"m"(loop_cnt[2]) \
+       , [loop_cnt_3]"m"(loop_cnt[3]) \
+       , [loop_cnt_4]"m"(loop_cnt[4]) \
+       , [loop_cnt_5]"m"(loop_cnt[5]) \
+       , [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG "r4"
+
+#define RSEQ_INJECT_CLOBBER \
+       , INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+       "ldr " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+       "cmp " INJECT_ASM_REG ", #0\n\t" \
+       "beq 333f\n\t" \
+       "222:\n\t" \
+       "subs " INJECT_ASM_REG ", #1\n\t" \
+       "bne 222b\n\t" \
+       "333:\n\t"
+
+#elif defined(__PPC__)
+
+#define RSEQ_INJECT_INPUT \
+       , [loop_cnt_1]"m"(loop_cnt[1]) \
+       , [loop_cnt_2]"m"(loop_cnt[2]) \
+       , [loop_cnt_3]"m"(loop_cnt[3]) \
+       , [loop_cnt_4]"m"(loop_cnt[4]) \
+       , [loop_cnt_5]"m"(loop_cnt[5]) \
+       , [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG "r18"
+
+#define RSEQ_INJECT_CLOBBER \
+       , INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+       "lwz %%" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+       "cmpwi %%" INJECT_ASM_REG ", 0\n\t" \
+       "beq 333f\n\t" \
+       "222:\n\t" \
+       "subic. %%" INJECT_ASM_REG ", %%" INJECT_ASM_REG ", 1\n\t" \
+       "bne 222b\n\t" \
+       "333:\n\t"
+#else
+#error unsupported target
+#endif
+
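+/* Invoked from the rseq abort handler: count aborts for reporting. */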
+#define RSEQ_INJECT_FAILED \
+       nr_abort++;
+
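+/*
+ * C-level delay injection: spin for loop_cnt[n] iterations of
+ * rseq_barrier(). A count of -1 combined with "-m N" instead yields,
+ * sleeps, or raises SIGUSR1 (depending on -y, -s, -k) every N
+ * iterations.
+ */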
+#define RSEQ_INJECT_C(n) \
+{ \
+       int loc_i, loc_nr_loops = loop_cnt[n]; \
+       \
+       for (loc_i = 0; loc_i < loc_nr_loops; loc_i++) { \
+               rseq_barrier(); \
+       } \
+       if (loc_nr_loops == -1 && opt_modulo) { \
+               if (yield_mod_cnt == opt_modulo - 1) { \
+                       if (opt_sleep > 0) \
+                               poll(NULL, 0, opt_sleep); \
+                       if (opt_yield) \
+                               sched_yield(); \
+                       if (opt_signal) \
+                               raise(SIGUSR1); \
+                       yield_mod_cnt = 0; \
+               } else { \
+                       yield_mod_cnt++; \
+               } \
+       } \
+}
+
+#else
+
+#define printf_verbose(fmt, ...)
+
+#endif /* BENCHMARK */
+
+#include "rseq.h"
+
+struct percpu_lock_entry {
+       intptr_t v;
+} __attribute__((aligned(128)));
+
+struct percpu_lock {
+       struct percpu_lock_entry c[CPU_SETSIZE];
+};
+
+struct test_data_entry {
+       intptr_t count;
+} __attribute__((aligned(128)));
+
+struct spinlock_test_data {
+       struct percpu_lock lock;
+       struct test_data_entry c[CPU_SETSIZE];
+};
+
+struct spinlock_thread_test_data {
+       struct spinlock_test_data *data;
+       long long reps;
+       int reg;
+};
+
+struct inc_test_data {
+       struct test_data_entry c[CPU_SETSIZE];
+};
+
+struct inc_thread_test_data {
+       struct inc_test_data *data;
+       long long reps;
+       int reg;
+};
+
+struct percpu_list_node {
+       intptr_t data;
+       struct percpu_list_node *next;
+};
+
+struct percpu_list_entry {
+       struct percpu_list_node *head;
+} __attribute__((aligned(128)));
+
+struct percpu_list {
+       struct percpu_list_entry c[CPU_SETSIZE];
+};
+
+#define BUFFER_ITEM_PER_CPU    100
+
+struct percpu_buffer_node {
+       intptr_t data;
+};
+
+struct percpu_buffer_entry {
+       intptr_t offset;
+       intptr_t buflen;
+       struct percpu_buffer_node **array;
+} __attribute__((aligned(128)));
+
+struct percpu_buffer {
+       struct percpu_buffer_entry c[CPU_SETSIZE];
+};
+
+#define MEMCPY_BUFFER_ITEM_PER_CPU     100
+
+struct percpu_memcpy_buffer_node {
+       intptr_t data1;
+       uint64_t data2;
+};
+
+struct percpu_memcpy_buffer_entry {
+       intptr_t offset;
+       intptr_t buflen;
+       struct percpu_memcpy_buffer_node *array;
+} __attribute__((aligned(128)));
+
+struct percpu_memcpy_buffer {
+       struct percpu_memcpy_buffer_entry c[CPU_SETSIZE];
+};
+
+/* A simple percpu spinlock. Grabs the lock on the current cpu. */
+static int rseq_this_cpu_lock(struct percpu_lock *lock)
+{
+       int cpu;
+
+       for (;;) {
+               int ret;
+
+               cpu = rseq_cpu_start();
+               ret = rseq_cmpeqv_storev(&lock->c[cpu].v,
+                                        0, 1, cpu);
+               if (rseq_likely(!ret))
+                       break;
+               /* Retry if comparison fails or rseq aborts. */
+       }
+       /*
+        * Acquire semantic when taking lock after control dependency.
+        * Matches rseq_smp_store_release().
+        */
+       rseq_smp_acquire__after_ctrl_dep();
+       return cpu;
+}
+
+static void rseq_percpu_unlock(struct percpu_lock *lock, int cpu)
+{
+       assert(lock->c[cpu].v == 1);
+       /*
+        * Release lock, with release semantic. Matches
+        * rseq_smp_acquire__after_ctrl_dep().
+        */
+       rseq_smp_store_release(&lock->c[cpu].v, 0);
+}
+
+void *test_percpu_spinlock_thread(void *arg)
+{
+       struct spinlock_thread_test_data *thread_data = arg;
+       struct spinlock_test_data *data = thread_data->data;
+       long long i, reps;
+
+       if (!opt_disable_rseq && thread_data->reg &&
+           rseq_register_current_thread())
+               abort();
+       reps = thread_data->reps;
+       for (i = 0; i < reps; i++) {
+               int cpu = rseq_this_cpu_lock(&data->lock);
+
+               data->c[cpu].count++;
+               rseq_percpu_unlock(&data->lock, cpu);
+#ifndef BENCHMARK
+               if (i != 0 && reps >= 10 && !(i % (reps / 10)))
+                       printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
+#endif
+       }
+       printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+                      (int) gettid(), nr_abort, signals_delivered);
+       if (!opt_disable_rseq && thread_data->reg &&
+           rseq_unregister_current_thread())
+               abort();
+       return NULL;
+}
+
+/*
+ * A simple test which implements a sharded counter using a per-cpu
+ * lock.  Obviously real applications might prefer to simply use a
+ * per-cpu increment; however, this is reasonable for a test and the
+ * lock can be extended to synchronize more complicated operations.
+ */
+void test_percpu_spinlock(void)
+{
+       const int num_threads = opt_threads;
+       int i, ret;
+       uint64_t sum;
+       pthread_t test_threads[num_threads];
+       struct spinlock_test_data data;
+       struct spinlock_thread_test_data thread_data[num_threads];
+
+       memset(&data, 0, sizeof(data));
+       for (i = 0; i < num_threads; i++) {
+               thread_data[i].reps = opt_reps;
+               if (opt_disable_mod <= 0 || (i % opt_disable_mod))
+                       thread_data[i].reg = 1;
+               else
+                       thread_data[i].reg = 0;
+               thread_data[i].data = &data;
+               ret = pthread_create(&test_threads[i], NULL,
+                                    test_percpu_spinlock_thread,
+                                    &thread_data[i]);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_create");
+                       abort();
+               }
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_join(test_threads[i], NULL);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_join");
+                       abort();
+               }
+       }
+
+       sum = 0;
+       for (i = 0; i < CPU_SETSIZE; i++)
+               sum += data.c[i].count;
+
+       assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+void *test_percpu_inc_thread(void *arg)
+{
+       struct inc_thread_test_data *thread_data = arg;
+       struct inc_test_data *data = thread_data->data;
+       long long i, reps;
+
+       if (!opt_disable_rseq && thread_data->reg &&
+           rseq_register_current_thread())
+               abort();
+       reps = thread_data->reps;
+       for (i = 0; i < reps; i++) {
+               int ret;
+
+               do {
+                       int cpu;
+
+                       cpu = rseq_cpu_start();
+                       ret = rseq_addv(&data->c[cpu].count, 1, cpu);
+               } while (rseq_unlikely(ret));
+#ifndef BENCHMARK
+               if (i != 0 && reps >= 10 && !(i % (reps / 10)))
+                       printf_verbose("tid %d: count %lld\n", (int) gettid(), i);
+#endif
+       }
+       printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+                      (int) gettid(), nr_abort, signals_delivered);
+       if (!opt_disable_rseq && thread_data->reg &&
+           rseq_unregister_current_thread())
+               abort();
+       return NULL;
+}
+
+void test_percpu_inc(void)
+{
+       const int num_threads = opt_threads;
+       int i, ret;
+       uint64_t sum;
+       pthread_t test_threads[num_threads];
+       struct inc_test_data data;
+       struct inc_thread_test_data thread_data[num_threads];
+
+       memset(&data, 0, sizeof(data));
+       for (i = 0; i < num_threads; i++) {
+               thread_data[i].reps = opt_reps;
+               if (opt_disable_mod <= 0 || (i % opt_disable_mod))
+                       thread_data[i].reg = 1;
+               else
+                       thread_data[i].reg = 0;
+               thread_data[i].data = &data;
+               ret = pthread_create(&test_threads[i], NULL,
+                                    test_percpu_inc_thread,
+                                    &thread_data[i]);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_create");
+                       abort();
+               }
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_join(test_threads[i], NULL);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_join");
+                       abort();
+               }
+       }
+
+       sum = 0;
+       for (i = 0; i < CPU_SETSIZE; i++)
+               sum += data.c[i].count;
+
+       assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
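+/*
+ * Push: link the new node in front of the current cpu's list head,
+ * then publish it with rseq_cmpeqv_storev(), which only stores if the
+ * head still matches the value read and the thread has not migrated.
+ */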
+void this_cpu_list_push(struct percpu_list *list,
+                       struct percpu_list_node *node,
+                       int *_cpu)
+{
+       int cpu;
+
+       for (;;) {
+               intptr_t *targetptr, newval, expect;
+               int ret;
+
+               cpu = rseq_cpu_start();
+               /* Load list->c[cpu].head with single-copy atomicity. */
+               expect = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
+               newval = (intptr_t)node;
+               targetptr = (intptr_t *)&list->c[cpu].head;
+               node->next = (struct percpu_list_node *)expect;
+               ret = rseq_cmpeqv_storev(targetptr, expect, newval, cpu);
+               if (rseq_likely(!ret))
+                       break;
+               /* Retry if comparison fails or rseq aborts. */
+       }
+       if (_cpu)
+               *_cpu = cpu;
+}
+
+/*
+ * Unlike a traditional lock-less linked list, the availability of an
+ * rseq primitive allows us to implement pop without concern over
+ * ABA-type races.
+ */
+struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
+                                          int *_cpu)
+{
+       struct percpu_list_node *node = NULL;
+       int cpu;
+
+       for (;;) {
+               struct percpu_list_node *head;
+               intptr_t *targetptr, expectnot, *load;
+               off_t offset;
+               int ret;
+
+               cpu = rseq_cpu_start();
+               targetptr = (intptr_t *)&list->c[cpu].head;
+               expectnot = (intptr_t)NULL;
+               offset = offsetof(struct percpu_list_node, next);
+               load = (intptr_t *)&head;
+               ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
+                                                  offset, load, cpu);
+               if (rseq_likely(!ret)) {
+                       node = head;
+                       break;
+               }
+               if (ret > 0)
+                       break;
+               /* Retry if rseq aborts. */
+       }
+       if (_cpu)
+               *_cpu = cpu;
+       return node;
+}
+
+/*
+ * __percpu_list_pop is not safe against concurrent accesses. Should
+ * only be used on lists that are not concurrently modified.
+ */
+struct percpu_list_node *__percpu_list_pop(struct percpu_list *list, int cpu)
+{
+       struct percpu_list_node *node;
+
+       node = list->c[cpu].head;
+       if (!node)
+               return NULL;
+       list->c[cpu].head = node->next;
+       return node;
+}
+
+void *test_percpu_list_thread(void *arg)
+{
+       long long i, reps;
+       struct percpu_list *list = (struct percpu_list *)arg;
+
+       if (!opt_disable_rseq && rseq_register_current_thread())
+               abort();
+
+       reps = opt_reps;
+       for (i = 0; i < reps; i++) {
+               struct percpu_list_node *node;
+
+               node = this_cpu_list_pop(list, NULL);
+               if (opt_yield)
+                       sched_yield();  /* encourage shuffling */
+               if (node)
+                       this_cpu_list_push(list, node, NULL);
+       }
+
+       printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+                      (int) gettid(), nr_abort, signals_delivered);
+       if (!opt_disable_rseq && rseq_unregister_current_thread())
+               abort();
+
+       return NULL;
+}
+
+/* Simultaneous modification to a per-cpu linked list from many threads.  */
+void test_percpu_list(void)
+{
+       const int num_threads = opt_threads;
+       int i, j, ret;
+       uint64_t sum = 0, expected_sum = 0;
+       struct percpu_list list;
+       pthread_t test_threads[num_threads];
+       cpu_set_t allowed_cpus;
+
+       memset(&list, 0, sizeof(list));
+
+       /* Generate list entries for every usable cpu. */
+       sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+       for (i = 0; i < CPU_SETSIZE; i++) {
+               if (!CPU_ISSET(i, &allowed_cpus))
+                       continue;
+               for (j = 1; j <= 100; j++) {
+                       struct percpu_list_node *node;
+
+                       expected_sum += j;
+
+                       node = malloc(sizeof(*node));
+                       assert(node);
+                       node->data = j;
+                       node->next = list.c[i].head;
+                       list.c[i].head = node;
+               }
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_create(&test_threads[i], NULL,
+                                    test_percpu_list_thread, &list);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_create");
+                       abort();
+               }
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_join(test_threads[i], NULL);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_join");
+                       abort();
+               }
+       }
+
+       for (i = 0; i < CPU_SETSIZE; i++) {
+               struct percpu_list_node *node;
+
+               if (!CPU_ISSET(i, &allowed_cpus))
+                       continue;
+
+               while ((node = __percpu_list_pop(&list, i))) {
+                       sum += node->data;
+                       free(node);
+               }
+       }
+
+       /*
+        * All entries should now be accounted for (unless some external
+        * actor is interfering with our allowed affinity while this
+        * test is running).
+        */
+       assert(sum == expected_sum);
+}
+
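+/*
+ * Push: speculatively store the node pointer into the next free array
+ * slot, then commit by updating offset. The final offset store is the
+ * commit point: if the sequence aborts first, the speculative store
+ * is never published.
+ */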
+bool this_cpu_buffer_push(struct percpu_buffer *buffer,
+                         struct percpu_buffer_node *node,
+                         int *_cpu)
+{
+       bool result = false;
+       int cpu;
+
+       for (;;) {
+               intptr_t *targetptr_spec, newval_spec;
+               intptr_t *targetptr_final, newval_final;
+               intptr_t offset;
+               int ret;
+
+               cpu = rseq_cpu_start();
+               offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+               if (offset == buffer->c[cpu].buflen)
+                       break;
+               newval_spec = (intptr_t)node;
+               targetptr_spec = (intptr_t *)&buffer->c[cpu].array[offset];
+               newval_final = offset + 1;
+               targetptr_final = &buffer->c[cpu].offset;
+               if (opt_mb)
+                       ret = rseq_cmpeqv_trystorev_storev_release(
+                               targetptr_final, offset, targetptr_spec,
+                               newval_spec, newval_final, cpu);
+               else
+                       ret = rseq_cmpeqv_trystorev_storev(targetptr_final,
+                               offset, targetptr_spec, newval_spec,
+                               newval_final, cpu);
+               if (rseq_likely(!ret)) {
+                       result = true;
+                       break;
+               }
+               /* Retry if comparison fails or rseq aborts. */
+       }
+       if (_cpu)
+               *_cpu = cpu;
+       return result;
+}
+
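+/*
+ * Pop: rseq_cmpeqv_cmpeqv_storev() re-checks both the offset and the
+ * array slot content before committing the decremented offset, so a
+ * concurrent change to either causes a retry.
+ */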
+struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
+                                              int *_cpu)
+{
+       struct percpu_buffer_node *head;
+       int cpu;
+
+       for (;;) {
+               intptr_t *targetptr, newval;
+               intptr_t offset;
+               int ret;
+
+               cpu = rseq_cpu_start();
+               /* Load offset with single-copy atomicity. */
+               offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+               if (offset == 0) {
+                       head = NULL;
+                       break;
+               }
+               head = RSEQ_READ_ONCE(buffer->c[cpu].array[offset - 1]);
+               newval = offset - 1;
+               targetptr = (intptr_t *)&buffer->c[cpu].offset;
+               ret = rseq_cmpeqv_cmpeqv_storev(targetptr, offset,
+                       (intptr_t *)&buffer->c[cpu].array[offset - 1],
+                       (intptr_t)head, newval, cpu);
+               if (rseq_likely(!ret))
+                       break;
+               /* Retry if comparison fails or rseq aborts. */
+       }
+       if (_cpu)
+               *_cpu = cpu;
+       return head;
+}
+
+/*
+ * __percpu_buffer_pop is not safe against concurrent accesses. Should
+ * only be used on buffers that are not concurrently modified.
+ */
+struct percpu_buffer_node *__percpu_buffer_pop(struct percpu_buffer *buffer,
+                                              int cpu)
+{
+       struct percpu_buffer_node *head;
+       intptr_t offset;
+
+       offset = buffer->c[cpu].offset;
+       if (offset == 0)
+               return NULL;
+       head = buffer->c[cpu].array[offset - 1];
+       buffer->c[cpu].offset = offset - 1;
+       return head;
+}
+
+void *test_percpu_buffer_thread(void *arg)
+{
+       long long i, reps;
+       struct percpu_buffer *buffer = (struct percpu_buffer *)arg;
+
+       if (!opt_disable_rseq && rseq_register_current_thread())
+               abort();
+
+       reps = opt_reps;
+       for (i = 0; i < reps; i++) {
+               struct percpu_buffer_node *node;
+
+               node = this_cpu_buffer_pop(buffer, NULL);
+               if (opt_yield)
+                       sched_yield();  /* encourage shuffling */
+               if (node) {
+                       if (!this_cpu_buffer_push(buffer, node, NULL)) {
+                               /* Should increase buffer size. */
+                               abort();
+                       }
+               }
+       }
+
+       printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+                      (int) gettid(), nr_abort, signals_delivered);
+       if (!opt_disable_rseq && rseq_unregister_current_thread())
+               abort();
+
+       return NULL;
+}
+
+/* Simultaneous modification to a per-cpu buffer from many threads.  */
+void test_percpu_buffer(void)
+{
+       const int num_threads = opt_threads;
+       int i, j, ret;
+       uint64_t sum = 0, expected_sum = 0;
+       struct percpu_buffer buffer;
+       pthread_t test_threads[num_threads];
+       cpu_set_t allowed_cpus;
+
+       memset(&buffer, 0, sizeof(buffer));
+
+       /* Generate list entries for every usable cpu. */
+       sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+       for (i = 0; i < CPU_SETSIZE; i++) {
+               if (!CPU_ISSET(i, &allowed_cpus))
+                       continue;
+               /* Worst case is every item in the same CPU. */
+               buffer.c[i].array =
+                       malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+                              BUFFER_ITEM_PER_CPU);
+               assert(buffer.c[i].array);
+               buffer.c[i].buflen = CPU_SETSIZE * BUFFER_ITEM_PER_CPU;
+               for (j = 1; j <= BUFFER_ITEM_PER_CPU; j++) {
+                       struct percpu_buffer_node *node;
+
+                       expected_sum += j;
+
+                       /*
+                        * We could theoretically put the word-sized
+                        * "data" directly in the buffer. However, we
+                        * want to model objects that would not fit
+                        * within a single word, so allocate an object
+                        * for each node.
+                        */
+                       node = malloc(sizeof(*node));
+                       assert(node);
+                       node->data = j;
+                       buffer.c[i].array[j - 1] = node;
+                       buffer.c[i].offset++;
+               }
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_create(&test_threads[i], NULL,
+                                    test_percpu_buffer_thread, &buffer);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_create");
+                       abort();
+               }
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_join(test_threads[i], NULL);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_join");
+                       abort();
+               }
+       }
+
+       for (i = 0; i < CPU_SETSIZE; i++) {
+               struct percpu_buffer_node *node;
+
+               if (!CPU_ISSET(i, &allowed_cpus))
+                       continue;
+
+               while ((node = __percpu_buffer_pop(&buffer, i))) {
+                       sum += node->data;
+                       free(node);
+               }
+               free(buffer.c[i].array);
+       }
+
+       /*
+        * All entries should now be accounted for (unless some external
+        * actor is interfering with our allowed affinity while this
+        * test is running).
+        */
+       assert(sum == expected_sum);
+}
+
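+/*
+ * Push: copy the whole multi-word item into the ring buffer with
+ * rseq_cmpeqv_trymemcpy_storev(), committing by storing the updated
+ * offset; the copy only becomes visible once the offset store
+ * succeeds.
+ */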
+bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
+                                struct percpu_memcpy_buffer_node item,
+                                int *_cpu)
+{
+       bool result = false;
+       int cpu;
+
+       for (;;) {
+               intptr_t *targetptr_final, newval_final, offset;
+               char *destptr, *srcptr;
+               size_t copylen;
+               int ret;
+
+               cpu = rseq_cpu_start();
+               /* Load offset with single-copy atomicity. */
+               offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+               if (offset == buffer->c[cpu].buflen)
+                       break;
+               destptr = (char *)&buffer->c[cpu].array[offset];
+               srcptr = (char *)&item;
+               /* copylen must be <= 4kB. */
+               copylen = sizeof(item);
+               newval_final = offset + 1;
+               targetptr_final = &buffer->c[cpu].offset;
+               if (opt_mb)
+                       ret = rseq_cmpeqv_trymemcpy_storev_release(
+                               targetptr_final, offset,
+                               destptr, srcptr, copylen,
+                               newval_final, cpu);
+               else
+                       ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+                               offset, destptr, srcptr, copylen,
+                               newval_final, cpu);
+               if (rseq_likely(!ret)) {
+                       result = true;
+                       break;
+               }
+               /* Retry if comparison fails or rseq aborts. */
+       }
+       if (_cpu)
+               *_cpu = cpu;
+       return result;
+}
+
+bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+                               struct percpu_memcpy_buffer_node *item,
+                               int *_cpu)
+{
+       bool result = false;
+       int cpu;
+
+       for (;;) {
+               intptr_t *targetptr_final, newval_final, offset;
+               char *destptr, *srcptr;
+               size_t copylen;
+               int ret;
+
+               cpu = rseq_cpu_start();
+               /* Load offset with single-copy atomicity. */
+               offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+               if (offset == 0)
+                       break;
+               destptr = (char *)item;
+               srcptr = (char *)&buffer->c[cpu].array[offset - 1];
+               /* copylen must be <= 4kB. */
+               copylen = sizeof(*item);
+               newval_final = offset - 1;
+               targetptr_final = &buffer->c[cpu].offset;
+               ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+                       offset, destptr, srcptr, copylen,
+                       newval_final, cpu);
+               if (rseq_likely(!ret)) {
+                       result = true;
+                       break;
+               }
+               /* Retry if comparison fails or rseq aborts. */
+       }
+       if (_cpu)
+               *_cpu = cpu;
+       return result;
+}
+
+/*
+ * __percpu_memcpy_buffer_pop is not safe against concurrent accesses. Should
+ * only be used on buffers that are not concurrently modified.
+ */
+bool __percpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+                               struct percpu_memcpy_buffer_node *item,
+                               int cpu)
+{
+       intptr_t offset;
+
+       offset = buffer->c[cpu].offset;
+       if (offset == 0)
+               return false;
+       memcpy(item, &buffer->c[cpu].array[offset - 1], sizeof(*item));
+       buffer->c[cpu].offset = offset - 1;
+       return true;
+}
+
+void *test_percpu_memcpy_buffer_thread(void *arg)
+{
+       long long i, reps;
+       struct percpu_memcpy_buffer *buffer = (struct percpu_memcpy_buffer *)arg;
+
+       if (!opt_disable_rseq && rseq_register_current_thread())
+               abort();
+
+       reps = opt_reps;
+       for (i = 0; i < reps; i++) {
+               struct percpu_memcpy_buffer_node item;
+               bool result;
+
+               result = this_cpu_memcpy_buffer_pop(buffer, &item, NULL);
+               if (opt_yield)
+                       sched_yield();  /* encourage shuffling */
+               if (result) {
+                       if (!this_cpu_memcpy_buffer_push(buffer, item, NULL)) {
+                               /* Should increase buffer size. */
+                               abort();
+                       }
+               }
+       }
+
+       printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+                      (int) gettid(), nr_abort, signals_delivered);
+       if (!opt_disable_rseq && rseq_unregister_current_thread())
+               abort();
+
+       return NULL;
+}
+
+/* Simultaneous modification to a per-cpu buffer from many threads.  */
+void test_percpu_memcpy_buffer(void)
+{
+       const int num_threads = opt_threads;
+       int i, j, ret;
+       uint64_t sum = 0, expected_sum = 0;
+       struct percpu_memcpy_buffer buffer;
+       pthread_t test_threads[num_threads];
+       cpu_set_t allowed_cpus;
+
+       memset(&buffer, 0, sizeof(buffer));
+
+       /* Generate list entries for every usable cpu. */
+       sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus);
+       for (i = 0; i < CPU_SETSIZE; i++) {
+               if (!CPU_ISSET(i, &allowed_cpus))
+                       continue;
+               /* Worst case is every item in the same CPU. */
+               buffer.c[i].array =
+                       malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+                              MEMCPY_BUFFER_ITEM_PER_CPU);
+               assert(buffer.c[i].array);
+               buffer.c[i].buflen = CPU_SETSIZE * MEMCPY_BUFFER_ITEM_PER_CPU;
+               for (j = 1; j <= MEMCPY_BUFFER_ITEM_PER_CPU; j++) {
+                       expected_sum += 2 * j + 1;
+
+                       /*
+                        * Items live in the array by value and are
+                        * moved with memcpy, modeling objects larger
+                        * than a single word that cannot be published
+                        * with a single store.
+                        */
+                       buffer.c[i].array[j - 1].data1 = j;
+                       buffer.c[i].array[j - 1].data2 = j + 1;
+                       buffer.c[i].offset++;
+               }
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_create(&test_threads[i], NULL,
+                                    test_percpu_memcpy_buffer_thread,
+                                    &buffer);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_create");
+                       abort();
+               }
+       }
+
+       for (i = 0; i < num_threads; i++) {
+               ret = pthread_join(test_threads[i], NULL);
+               if (ret) {
+                       errno = ret;
+                       perror("pthread_join");
+                       abort();
+               }
+       }
+
+       for (i = 0; i < CPU_SETSIZE; i++) {
+               struct percpu_memcpy_buffer_node item;
+
+               if (!CPU_ISSET(i, &allowed_cpus))
+                       continue;
+
+               while (__percpu_memcpy_buffer_pop(&buffer, &item, i)) {
+                       sum += item.data1;
+                       sum += item.data2;
+               }
+               free(buffer.c[i].array);
+       }
+
+       /*
+        * All entries should now be accounted for (unless some external
+        * actor is interfering with our allowed affinity while this
+        * test is running).
+        */
+       assert(sum == expected_sum);
+}
+
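+/*
+ * SIGUSR1 handler used by -k: each delivery interrupts the thread and
+ * aborts any rseq critical section it was executing.
+ */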
+static void test_signal_interrupt_handler(int signo)
+{
+       signals_delivered++;
+}
+
+static int set_signal_handler(void)
+{
+       int ret = 0;
+       struct sigaction sa;
+       sigset_t sigset;
+
+       ret = sigemptyset(&sigset);
+       if (ret < 0) {
+               perror("sigemptyset");
+               return ret;
+       }
+
+       sa.sa_handler = test_signal_interrupt_handler;
+       sa.sa_mask = sigset;
+       sa.sa_flags = 0;
+       ret = sigaction(SIGUSR1, &sa, NULL);
+       if (ret < 0) {
+               perror("sigaction");
+               return ret;
+       }
+
+       printf_verbose("Signal handler set for SIGUSR1\n");
+
+       return ret;
+}
+
+static void show_usage(int argc, char **argv)
+{
+       printf("Usage : %s <OPTIONS>\n",
+               argv[0]);
+       printf("OPTIONS:\n");
+       printf("        [-1 loops] Number of loops for delay injection 1\n");
+       printf("        [-2 loops] Number of loops for delay injection 2\n");
+       printf("        [-3 loops] Number of loops for delay injection 3\n");
+       printf("        [-4 loops] Number of loops for delay injection 4\n");
+       printf("        [-5 loops] Number of loops for delay injection 5\n");
+       printf("        [-6 loops] Number of loops for delay injection 6\n");
+       printf("        [-7 loops] Number of loops for delay injection 7 (-1 to enable -m)\n");
+       printf("        [-8 loops] Number of loops for delay injection 8 (-1 to enable -m)\n");
+       printf("        [-9 loops] Number of loops for delay injection 9 (-1 to enable -m)\n");
+       printf("        [-m N] Yield/sleep/kill every modulo N (default 0: disabled) (>= 0)\n");
+       printf("        [-y] Yield\n");
+       printf("        [-k] Kill thread with signal\n");
+       printf("        [-s S] S: =0: disabled (default), >0: sleep time (ms)\n");
+       printf("        [-t N] Number of threads (default 200)\n");
+       printf("        [-r N] Number of repetitions per thread (default 5000)\n");
+       printf("        [-d] Disable rseq system call (no initialization)\n");
+       printf("        [-D M] Disable rseq for each M threads\n");
+       printf("        [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement\n");
+       printf("        [-M] Push into buffer and memcpy buffer with memory barriers.\n");
+       printf("        [-v] Verbose output.\n");
+       printf("        [-h] Show this help.\n");
+       printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+       int i;
+
+       for (i = 1; i < argc; i++) {
+               if (argv[i][0] != '-')
+                       continue;
+               switch (argv[i][1]) {
+               case '1':
+               case '2':
+               case '3':
+               case '4':
+               case '5':
+               case '6':
+               case '7':
+               case '8':
+               case '9':
+                       if (argc < i + 2) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       loop_cnt[argv[i][1] - '0'] = atol(argv[i + 1]);
+                       i++;
+                       break;
+               case 'm':
+                       if (argc < i + 2) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       opt_modulo = atol(argv[i + 1]);
+                       if (opt_modulo < 0) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       i++;
+                       break;
+               case 's':
+                       if (argc < i + 2) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       opt_sleep = atol(argv[i + 1]);
+                       if (opt_sleep < 0) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       i++;
+                       break;
+               case 'y':
+                       opt_yield = 1;
+                       break;
+               case 'k':
+                       opt_signal = 1;
+                       break;
+               case 'd':
+                       opt_disable_rseq = 1;
+                       break;
+               case 'D':
+                       if (argc < i + 2) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       opt_disable_mod = atol(argv[i + 1]);
+                       if (opt_disable_mod < 0) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       i++;
+                       break;
+               case 't':
+                       if (argc < i + 2) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       opt_threads = atol(argv[i + 1]);
+                       if (opt_threads < 0) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       i++;
+                       break;
+               case 'r':
+                       if (argc < i + 2) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       opt_reps = atoll(argv[i + 1]);
+                       if (opt_reps < 0) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       i++;
+                       break;
+               case 'h':
+                       show_usage(argc, argv);
+                       goto end;
+               case 'T':
+                       if (argc < i + 2) {
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       opt_test = *argv[i + 1];
+                       switch (opt_test) {
+                       case 's':
+                       case 'l':
+                       case 'i':
+                       case 'b':
+                       case 'm':
+                               break;
+                       default:
+                               show_usage(argc, argv);
+                               goto error;
+                       }
+                       i++;
+                       break;
+               case 'v':
+                       verbose = 1;
+                       break;
+               case 'M':
+                       opt_mb = 1;
+                       break;
+               default:
+                       show_usage(argc, argv);
+                       goto error;
+               }
+       }
+
+       loop_cnt_1 = loop_cnt[1];
+       loop_cnt_2 = loop_cnt[2];
+       loop_cnt_3 = loop_cnt[3];
+       loop_cnt_4 = loop_cnt[4];
+       loop_cnt_5 = loop_cnt[5];
+       loop_cnt_6 = loop_cnt[6];
+
+       if (set_signal_handler())
+               goto error;
+
+       if (!opt_disable_rseq && rseq_register_current_thread())
+               goto error;
+       switch (opt_test) {
+       case 's':
+               printf_verbose("spinlock\n");
+               test_percpu_spinlock();
+               break;
+       case 'l':
+               printf_verbose("linked list\n");
+               test_percpu_list();
+               break;
+       case 'b':
+               printf_verbose("buffer\n");
+               test_percpu_buffer();
+               break;
+       case 'm':
+               printf_verbose("memcpy buffer\n");
+               test_percpu_memcpy_buffer();
+               break;
+       case 'i':
+               printf_verbose("counter increment\n");
+               test_percpu_inc();
+               break;
+       }
+       if (!opt_disable_rseq && rseq_unregister_current_thread())
+               abort();
+end:
+       return 0;
+
+error:
+       return -1;
+}