s390/fpu: provide and use vlm and vstm inline assemblies
authorHeiko Carstens <hca@linux.ibm.com>
Sat, 3 Feb 2024 10:45:07 +0000 (11:45 +0100)
committerHeiko Carstens <hca@linux.ibm.com>
Fri, 16 Feb 2024 13:30:15 +0000 (14:30 +0100)
Instead of open-coding vlm and vstm inline assemblies at several locations,
provide an fpu_* function for each instruction, and use them in the new
save_vx_regs() and load_vx_regs() helper functions.

Note that "O" and "R" inline assembly operand modifiers are used in order
to pass the displacement and base register of the memory operands to the
existing VLM and VSTM macros. The two operand modifiers are not available
for clang. Therefore provide two variants of each inline assembly.

The clang variant always uses and clobbers general purpose register 1, like
in the previous inline assemblies, so it can be used as base register with
a zero displacement. This generates slightly less efficient code, but the
clang-specific variants can be removed as soon as clang has support for the
used operand modifiers.

Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
arch/s390/include/asm/fpu-insn.h
arch/s390/include/asm/fpu.h
arch/s390/kernel/fpu.c

index df2cad95b598c60f265ea1a9611f332f84ae6a88..538201864a47fde2b728d3683d15c0d22802c321 100644 (file)
@@ -108,5 +108,75 @@ static __always_inline void fpu_stfpc(unsigned int *fpc)
                     : "memory");
 }
 
+#ifdef CONFIG_CC_IS_CLANG
+
+#define fpu_vlm(_v1, _v3, _vxrs) do {                                  \
+       unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128);  \
+       struct {                                                        \
+               __vector128 _v[(_v3) - (_v1) + 1];                      \
+       } *_v = (void *)(_vxrs);                                        \
+                                                                       \
+       instrument_read(_v, size);                                      \
+       asm volatile("\n"                                               \
+               "       la      1,%[vxrs]\n"                            \
+               "       VLM     %[v1],%[v3],0,1\n"                      \
+               :                                                       \
+               : [vxrs] "R" (*_v),                                     \
+                 [v1] "I" (_v1), [v3] "I" (_v3)                        \
+               : "memory", "1");                                       \
+} while (0)
+
+#else /* CONFIG_CC_IS_CLANG */
+
+#define fpu_vlm(_v1, _v3, _vxrs) do {                                  \
+       unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128);  \
+       struct {                                                        \
+               __vector128 _v[(_v3) - (_v1) + 1];                      \
+       } *_v = (void *)(_vxrs);                                        \
+                                                                       \
+       instrument_read(_v, size);                                      \
+       asm volatile("VLM       %[v1],%[v3],%O[vxrs],%R[vxrs]\n"        \
+                    :                                                  \
+                    : [vxrs] "Q" (*_v),                                \
+                      [v1] "I" (_v1), [v3] "I" (_v3)                   \
+                    : "memory");                                       \
+} while (0)
+
+#endif /* CONFIG_CC_IS_CLANG */
+
+#ifdef CONFIG_CC_IS_CLANG
+
+#define fpu_vstm(_v1, _v3, _vxrs) do {                                 \
+       unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128);  \
+       struct {                                                        \
+               __vector128 _v[(_v3) - (_v1) + 1];                      \
+       } *_v = (void *)(_vxrs);                                        \
+                                                                       \
+       instrument_write(_v, size);                                     \
+       asm volatile("\n"                                               \
+               "       la      1,%[vxrs]\n"                            \
+               "       VSTM    %[v1],%[v3],0,1\n"                      \
+               : [vxrs] "=R" (*_v)                                     \
+               : [v1] "I" (_v1), [v3] "I" (_v3)                        \
+               : "memory", "1");                                       \
+} while (0)
+
+#else /* CONFIG_CC_IS_CLANG */
+
+#define fpu_vstm(_v1, _v3, _vxrs) do {                                 \
+       unsigned int size = ((_v3) - (_v1) + 1) * sizeof(__vector128);  \
+       struct {                                                        \
+               __vector128 _v[(_v3) - (_v1) + 1];                      \
+       } *_v = (void *)(_vxrs);                                        \
+                                                                       \
+       instrument_write(_v, size);                                     \
+       asm volatile("VSTM      %[v1],%[v3],%O[vxrs],%R[vxrs]\n"        \
+                    : [vxrs] "=Q" (*_v)                                \
+                    : [v1] "I" (_v1), [v3] "I" (_v3)                   \
+                    : "memory");                                       \
+} while (0)
+
+#endif /* CONFIG_CC_IS_CLANG */
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ASM_S390_FPU_INSN_H */
index 626695de6085a23a16f2016e42433b5f7ac1dbd9..6a0a23a28ce8616ea61a5a119aa932097205ed0f 100644 (file)
@@ -84,6 +84,18 @@ void __load_fpu_regs(void);
 void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags);
 void __kernel_fpu_end(struct kernel_fpu *state, u32 flags);
 
+static __always_inline void save_vx_regs(__vector128 *vxrs)
+{
+       fpu_vstm(0, 15, &vxrs[0]);
+       fpu_vstm(16, 31, &vxrs[16]);
+}
+
+static __always_inline void load_vx_regs(__vector128 *vxrs)
+{
+       fpu_vlm(0, 15, &vxrs[0]);
+       fpu_vlm(16, 31, &vxrs[16]);
+}
+
 static __always_inline void save_fp_regs(freg_t *fprs)
 {
        fpu_std(0, &fprs[0]);
@@ -148,15 +160,6 @@ static inline void kernel_fpu_end(struct kernel_fpu *state, u32 flags)
        preempt_enable();
 }
 
-static inline void save_vx_regs(__vector128 *vxrs)
-{
-       asm volatile("\n"
-               "       la      1,%0\n"
-               "       .word   0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
-               "       .word   0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
-               : "=Q" (*(struct vx_array *)vxrs) : : "1");
-}
-
 static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs)
 {
        int i;
index 6bfd4d0f33e1a49abb457f582159f91df62d80eb..092a4bdf88edc29fa6fa262e70c0cd77840bad73 100644 (file)
@@ -137,16 +137,10 @@ void __load_fpu_regs(void)
        void *regs = current->thread.fpu.regs;
 
        fpu_lfpc_safe(&state->fpc);
-       if (likely(cpu_has_vx())) {
-               asm volatile("lgr       1,%0\n"
-                            "VLM       0,15,0,1\n"
-                            "VLM       16,31,256,1\n"
-                            :
-                            : "d" (regs)
-                            : "1", "cc", "memory");
-       } else {
+       if (likely(cpu_has_vx()))
+               load_vx_regs(regs);
+       else
                load_fp_regs(regs);
-       }
        clear_cpu_flag(CIF_FPU);
 }
 
@@ -173,16 +167,10 @@ void save_fpu_regs(void)
        regs = current->thread.fpu.regs;
 
        fpu_stfpc(&state->fpc);
-       if (likely(cpu_has_vx())) {
-               asm volatile("lgr       1,%0\n"
-                            "VSTM      0,15,0,1\n"
-                            "VSTM      16,31,256,1\n"
-                            :
-                            : "d" (regs)
-                            : "1", "cc", "memory");
-       } else {
+       if (likely(cpu_has_vx()))
+               save_vx_regs(regs);
+       else
                save_fp_regs(regs);
-       }
        set_cpu_flag(CIF_FPU);
 out:
        local_irq_restore(flags);