RISC-V: Report vector unaligned access speed hwprobe
author Jesse Taube <jesse@rivosinc.com>
Thu, 17 Oct 2024 19:00:22 +0000 (12:00 -0700)
committer Palmer Dabbelt <palmer@rivosinc.com>
Fri, 18 Oct 2024 19:38:34 +0000 (12:38 -0700)
Detect if vector misaligned accesses are faster or slower than
equivalent vector byte accesses. This is useful for usermode to know
whether vector byte accesses or vector misaligned accesses have better
bandwidth for operations like memcpy.
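
For context, a minimal (hypothetical) userspace sketch of how this value
can be consumed. It assumes the RISCV_HWPROBE_KEY_MISALIGNED_VECTOR_PERF
key and the RISCV_HWPROBE_MISALIGNED_VECTOR_* values introduced earlier
in this series, plus the __riscv_hwprobe() wrapper added in glibc 2.40
(the raw hwprobe syscall can be used instead):

  #include <stdio.h>
  #include <asm/hwprobe.h>
  #include <sys/hwprobe.h>

  int main(void)
  {
  	struct riscv_hwprobe pair = {
  		.key = RISCV_HWPROBE_KEY_MISALIGNED_VECTOR_PERF,
  	};

  	/* cpusetsize == 0 and cpus == NULL query all online CPUs. */
  	if (__riscv_hwprobe(&pair, 1, 0, NULL, 0))
  		return 1;

  	switch (pair.value) {
  	case RISCV_HWPROBE_MISALIGNED_VECTOR_FAST:
  		puts("vector misaligned accesses: fast");
  		break;
  	case RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW:
  		puts("vector misaligned accesses: slow");
  		break;
  	case RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED:
  		puts("vector misaligned accesses: unsupported");
  		break;
  	default:
  		puts("vector misaligned accesses: unknown");
  	}

  	return 0;
  }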

Signed-off-by: Jesse Taube <jesse@rivosinc.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20241017-jesse_unaligned_vector-v10-5-5b33500160f8@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
arch/riscv/Kconfig
arch/riscv/kernel/Makefile
arch/riscv/kernel/copy-unaligned.h
arch/riscv/kernel/sys_hwprobe.c
arch/riscv/kernel/unaligned_access_speed.c
arch/riscv/kernel/vec-copy-unaligned.S [new file with mode: 0644]

index 05f698a8897770ce50641e57a7687919048bff5b..d46394873afce06dd642b798e584410ab5469b18 100644
@@ -875,6 +875,24 @@ config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
          will dynamically determine the speed of vector unaligned accesses on
          the underlying system if they are supported.
 
+config RISCV_SLOW_VECTOR_UNALIGNED_ACCESS
+       bool "Assume the system supports slow vector unaligned memory accesses"
+       depends on NONPORTABLE
+       help
+         Assume that the system supports slow vector unaligned memory accesses. The
+         kernel and userspace programs may not be able to run at all on systems
+         that do not support unaligned memory accesses.
+
+config RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS
+       bool "Assume the system supports fast vector unaligned memory accesses"
+       depends on NONPORTABLE
+       help
+         Assume that the system supports fast vector unaligned memory accesses. When
+         enabled, this option improves the performance of the kernel on such
+         systems. However, the kernel and userspace programs will run much more
+         slowly, or will not be able to run at all, on systems that do not
+         support efficient unaligned memory accesses.
+
 endchoice
 
 source "arch/riscv/Kconfig.vendor"
index 7f88cc4931f5c44105e827d46832a94fe81483f4..30db92672ada54048c5f62296ffc7168c78d32db 100644
@@ -70,7 +70,8 @@ obj-$(CONFIG_MMU) += vdso.o vdso/
 
 obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
 obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
-obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)     += copy-unaligned.o
+obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)             += copy-unaligned.o
+obj-$(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)      += vec-copy-unaligned.o
 
 obj-$(CONFIG_FPU)              += fpu.o
 obj-$(CONFIG_FPU)              += kernel_mode_fpu.o
index e3d70d35b70819203245a35324360657386a32dc..85d4d11450cb61110de4c2a2bc5efb30e6b9489b 100644
@@ -10,4 +10,9 @@
 void __riscv_copy_words_unaligned(void *dst, const void *src, size_t size);
 void __riscv_copy_bytes_unaligned(void *dst, const void *src, size_t size);
 
+#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
+void __riscv_copy_vec_words_unaligned(void *dst, const void *src, size_t size);
+void __riscv_copy_vec_bytes_unaligned(void *dst, const void *src, size_t size);
+#endif
+
 #endif /* __RISCV_KERNEL_COPY_UNALIGNED_H */
index 6441baada36bc29424383056efd9fc145464e1ed..6673278e84d588d97c5973837a5d9626a8758603 100644
@@ -228,6 +228,12 @@ static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
 #else
 static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
 {
+       if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_VECTOR_UNALIGNED_ACCESS))
+               return RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
+
+       if (IS_ENABLED(CONFIG_RISCV_SLOW_VECTOR_UNALIGNED_ACCESS))
+               return RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
+
        return RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;
 }
 #endif
index 0b8b5e17453a8eaef7e8834d7d8d317d0bbb8c77..91f189cf16113c9ab1f3a1c4789cf0b288325cb1 100644
@@ -6,11 +6,13 @@
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/jump_label.h>
+#include <linux/kthread.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/types.h>
 #include <asm/cpufeature.h>
 #include <asm/hwprobe.h>
+#include <asm/vector.h>
 
 #include "copy-unaligned.h"
 
@@ -268,12 +270,147 @@ static int check_unaligned_access_speed_all_cpus(void)
 }
 #endif
 
+#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
+static void check_vector_unaligned_access(struct work_struct *work __always_unused)
+{
+       int cpu = smp_processor_id();
+       u64 start_cycles, end_cycles;
+       u64 word_cycles;
+       u64 byte_cycles;
+       int ratio;
+       unsigned long start_jiffies, now;
+       struct page *page;
+       void *dst;
+       void *src;
+       long speed = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
+
+       if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN)
+               return;
+
+       page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+       if (!page) {
+               pr_warn("Allocation failure, not measuring vector misaligned performance\n");
+               return;
+       }
+
+       /* Make an unaligned destination buffer. */
+       dst = (void *)((unsigned long)page_address(page) | 0x1);
+       /* Unalign src as well, but differently (off by 1 + 2 = 3). */
+       src = dst + (MISALIGNED_BUFFER_SIZE / 2);
+       src += 2;
+       word_cycles = -1ULL;
+
+       /* Do a warmup. */
+       kernel_vector_begin();
+       __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+
+       start_jiffies = jiffies;
+       while ((now = jiffies) == start_jiffies)
+               cpu_relax();
+
+       /*
+        * For a fixed amount of time, repeatedly try the function, and take
+        * the best time in cycles as the measurement.
+        */
+       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+               start_cycles = get_cycles64();
+               /* Ensure the CSR read can't reorder WRT to the copy. */
+               mb();
+               __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+               /* Ensure the copy ends before the end time is snapped. */
+               mb();
+               end_cycles = get_cycles64();
+               if ((end_cycles - start_cycles) < word_cycles)
+                       word_cycles = end_cycles - start_cycles;
+       }
+
+       byte_cycles = -1ULL;
+       __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+       start_jiffies = jiffies;
+       while ((now = jiffies) == start_jiffies)
+               cpu_relax();
+
+       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+               start_cycles = get_cycles64();
+               /* Ensure the CSR read can't reorder WRT to the copy. */
+               mb();
+               __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+               /* Ensure the copy ends before the end time is snapped. */
+               mb();
+               end_cycles = get_cycles64();
+               if ((end_cycles - start_cycles) < byte_cycles)
+                       byte_cycles = end_cycles - start_cycles;
+       }
+
+       kernel_vector_end();
+
+       /* Don't divide by zero. */
+       if (!word_cycles || !byte_cycles) {
+               pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
+                       cpu);
+
+               return;
+       }
+
+       if (word_cycles < byte_cycles)
+               speed = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
+
+       ratio = div_u64((byte_cycles * 100), word_cycles);
+       pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
+               cpu,
+               ratio / 100,
+               ratio % 100,
+               (speed ==  RISCV_HWPROBE_MISALIGNED_VECTOR_FAST) ? "fast" : "slow");
+
+       per_cpu(vector_misaligned_access, cpu) = speed;
+}
+
+static int riscv_online_cpu_vec(unsigned int cpu)
+{
+       if (!has_vector())
+               return 0;
+
+       if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED)
+               return 0;
+
+       check_vector_unaligned_access_emulated(NULL);
+       check_vector_unaligned_access(NULL);
+       return 0;
+}
+
+/* Measure unaligned access speed on all CPUs present at boot in parallel. */
+static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
+{
+       schedule_on_each_cpu(check_vector_unaligned_access);
+
+       /*
+        * Setup hotplug callbacks for any new CPUs that come online or go
+        * offline.
+        */
+       cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
+                                 riscv_online_cpu_vec, NULL);
+
+       return 0;
+}
+#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
+static int vec_check_unaligned_access_speed_all_cpus(void *unused __always_unused)
+{
+       return 0;
+}
+#endif
+
 static int check_unaligned_access_all_cpus(void)
 {
-       bool all_cpus_emulated;
+       bool all_cpus_emulated, all_cpus_vec_unsupported;
 
        all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
-       check_vector_unaligned_access_emulated_all_cpus();
+       all_cpus_vec_unsupported = check_vector_unaligned_access_emulated_all_cpus();
+
+       if (!all_cpus_vec_unsupported &&
+           IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
+               kthread_run(vec_check_unaligned_access_speed_all_cpus,
+                           NULL, "vec_check_unaligned_access_speed_all_cpus");
+       }
 
        if (!all_cpus_emulated)
                return check_unaligned_access_speed_all_cpus();
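
(As a worked illustration of the reporting above, with made-up numbers:
if the best vector word-copy pass took 1200 cycles and the best vector
byte-copy pass took 3000 cycles, then word_cycles < byte_cycles, so
speed becomes RISCV_HWPROBE_MISALIGNED_VECTOR_FAST and the printed
ratio is (3000 * 100) / 1200 = 250, shown as "2.50".)
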
diff --git a/arch/riscv/kernel/vec-copy-unaligned.S b/arch/riscv/kernel/vec-copy-unaligned.S
new file mode 100644
index 0000000..d16f19f
--- /dev/null
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2024 Rivos Inc. */
+
+#include <linux/args.h>
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+       .text
+
+#define WORD_EEW 32
+
+#define WORD_SEW CONCATENATE(e, WORD_EEW)
+#define VEC_L CONCATENATE(vle, WORD_EEW).v
+#define VEC_S CONCATENATE(vse, WORD_EEW).v
+
+/* void __riscv_copy_vec_words_unaligned(void *, const void *, size_t) */
+/* Performs a memcpy without aligning buffers, using word loads and stores. */
+/* Note: The size is truncated to a multiple of WORD_EEW */
+SYM_FUNC_START(__riscv_copy_vec_words_unaligned)
+       andi  a4, a2, ~(WORD_EEW-1)
+       beqz  a4, 2f
+       add   a3, a1, a4
+       .option push
+       .option arch, +zve32x
+1:
+       vsetivli t0, 8, WORD_SEW, m8, ta, ma
+       VEC_L v0, (a1)
+       VEC_S v0, (a0)
+       addi  a0, a0, WORD_EEW
+       addi  a1, a1, WORD_EEW
+       bltu  a1, a3, 1b
+
+2:
+       .option pop
+       ret
+SYM_FUNC_END(__riscv_copy_vec_words_unaligned)
+
+/* void __riscv_copy_vec_bytes_unaligned(void *, const void *, size_t) */
+/* Performs a memcpy without aligning buffers, using only byte accesses. */
+/* Note: The size is truncated to a multiple of 8 */
+SYM_FUNC_START(__riscv_copy_vec_bytes_unaligned)
+       andi a4, a2, ~(8-1)
+       beqz a4, 2f
+       add  a3, a1, a4
+       .option push
+       .option arch, +zve32x
+1:
+       vsetivli t0, 8, e8, m8, ta, ma
+       vle8.v v0, (a1)
+       vse8.v v0, (a0)
+       addi a0, a0, 8
+       addi a1, a1, 8
+       bltu a1, a3, 1b
+
+2:
+       .option pop
+       ret
+SYM_FUNC_END(__riscv_copy_vec_bytes_unaligned)