s390/fpu: decrease stack usage for some cases
author Heiko Carstens <hca@linux.ibm.com>
Sat, 3 Feb 2024 10:45:17 +0000 (11:45 +0100)
committer Heiko Carstens <hca@linux.ibm.com>
Fri, 16 Feb 2024 13:30:16 +0000 (14:30 +0100)
The kernel_fpu structure has a quite large size of 520 bytes. In order to
reduce stack footprint introduce several kernel fpu structures with
different and also smaller sizes. This way every kernel fpu user must use
the correct variant. A compile time check verifies that the correct variant
is used.

There are several users which use only 16 instead of all 32 vector
registers. For those users the new kernel_fpu_16 structure with a size of
only 264 bytes can be used.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
arch/s390/crypto/chacha-glue.c
arch/s390/crypto/crc32-vx.c
arch/s390/include/asm/fpu-types.h
arch/s390/include/asm/fpu.h
arch/s390/kernel/fpu.c
arch/s390/kernel/sysinfo.c
arch/s390/kvm/kvm-s390.c
lib/raid6/s390vx.uc

index 97098add2079136267e61f7a3eebf6f488de4240..f8b0c52e77a4fab95de18f8cda80e098b4fe5ee1 100644 (file)
@@ -22,7 +22,7 @@ static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
                                unsigned int nbytes, const u32 *key,
                                u32 *counter)
 {
-       DECLARE_KERNEL_FPU_ONSTACK(vxstate);
+       DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
 
        kernel_fpu_begin(&vxstate, KERNEL_VXR);
        chacha20_vx(dst, src, nbytes, key, counter);
index dc2997f18e30593245577aa510220bfc0c4024f0..d9f1fdb66691bb4b4c09d406d64f786d08cf47e8 100644 (file)
@@ -50,7 +50,7 @@ u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
                                unsigned char const *data, size_t datalen)  \
        {                                                                   \
                unsigned long prealign, aligned, remaining;                 \
-               DECLARE_KERNEL_FPU_ONSTACK(vxstate);                        \
+               DECLARE_KERNEL_FPU_ONSTACK16(vxstate);                      \
                                                                            \
                if (datalen < VX_MIN_LEN + VX_ALIGN_MASK)                   \
                        return ___crc32_sw(crc, data, datalen);             \
index 8e6927c23bdc4f0be4820b2d195d21fe79c095b3..04c32b9fc849862137c51763ebe198879da6710e 100644 (file)
@@ -16,14 +16,32 @@ struct fpu {
        __vector128 vxrs[__NUM_VXRS] __aligned(8);
 };
 
-/* In-kernel FPU state structure */
+struct kernel_fpu_hdr {
+       int     mask;
+       u32     fpc;
+};
+
 struct kernel_fpu {
-       int         mask;
-       u32         fpc;
-       __vector128 vxrs[__NUM_VXRS] __aligned(8);
+       struct kernel_fpu_hdr hdr;
+       __vector128 vxrs[] __aligned(8);
 };
 
-#define DECLARE_KERNEL_FPU_ONSTACK(name)       \
-       struct kernel_fpu name __uninitialized
+#define KERNEL_FPU_STRUCT(vxr_size)                            \
+struct kernel_fpu_##vxr_size {                                 \
+       struct kernel_fpu_hdr hdr;                              \
+       __vector128 vxrs[vxr_size] __aligned(8);                \
+}
+
+KERNEL_FPU_STRUCT(16);
+KERNEL_FPU_STRUCT(32);
+
+#define DECLARE_KERNEL_FPU_ONSTACK(vxr_size, name)             \
+       struct kernel_fpu_##vxr_size name __uninitialized
+
+#define DECLARE_KERNEL_FPU_ONSTACK16(name)                     \
+       DECLARE_KERNEL_FPU_ONSTACK(16, name)
+
+#define DECLARE_KERNEL_FPU_ONSTACK32(name)                     \
+       DECLARE_KERNEL_FPU_ONSTACK(32, name)
 
 #endif /* _ASM_S390_FPU_TYPES_H */
index e706af26c5d07fa84a333b36239d609f0df66f89..c1b3920092a1c96127b06ce3a4448f2709436c5f 100644 (file)
@@ -162,28 +162,64 @@ static __always_inline void load_fp_regs_vx(__vector128 *vxrs)
        __load_fp_regs(fprs, sizeof(__vector128) / sizeof(freg_t));
 }
 
-static inline void kernel_fpu_begin(struct kernel_fpu *state, int flags)
+static inline void _kernel_fpu_begin(struct kernel_fpu *state, int flags)
 {
-       state->mask = READ_ONCE(current->thread.kfpu_flags);
+       state->hdr.mask = READ_ONCE(current->thread.kfpu_flags);
        if (!test_thread_flag(TIF_FPU)) {
                /* Save user space FPU state and register contents */
                save_user_fpu_regs();
-       } else if (state->mask & flags) {
+       } else if (state->hdr.mask & flags) {
                /* Save FPU/vector register in-use by the kernel */
                __kernel_fpu_begin(state, flags);
        }
        __atomic_or(flags, &current->thread.kfpu_flags);
 }
 
-static inline void kernel_fpu_end(struct kernel_fpu *state, int flags)
+static inline void _kernel_fpu_end(struct kernel_fpu *state, int flags)
 {
-       WRITE_ONCE(current->thread.kfpu_flags, state->mask);
-       if (state->mask & flags) {
+       WRITE_ONCE(current->thread.kfpu_flags, state->hdr.mask);
+       if (state->hdr.mask & flags) {
                /* Restore FPU/vector register in-use by the kernel */
                __kernel_fpu_end(state, flags);
        }
 }
 
+void __kernel_fpu_invalid_size(void);
+
+static __always_inline void kernel_fpu_check_size(int flags, unsigned int size)
+{
+       unsigned int cnt = 0;
+
+       if (flags & KERNEL_VXR_V0V7)
+               cnt += 8;
+       if (flags & KERNEL_VXR_V8V15)
+               cnt += 8;
+       if (flags & KERNEL_VXR_V16V23)
+               cnt += 8;
+       if (flags & KERNEL_VXR_V24V31)
+               cnt += 8;
+       if (cnt != size)
+               __kernel_fpu_invalid_size();
+}
+
+#define kernel_fpu_begin(state, flags)                                 \
+{                                                                      \
+       typeof(state) s = (state);                                      \
+       int _flags = (flags);                                           \
+                                                                       \
+       kernel_fpu_check_size(_flags, ARRAY_SIZE(s->vxrs));             \
+       _kernel_fpu_begin((struct kernel_fpu *)s, _flags);              \
+}
+
+#define kernel_fpu_end(state, flags)                                   \
+{                                                                      \
+       typeof(state) s = (state);                                      \
+       int _flags = (flags);                                           \
+                                                                       \
+       kernel_fpu_check_size(_flags, ARRAY_SIZE(s->vxrs));             \
+       _kernel_fpu_end((struct kernel_fpu *)s, _flags);                \
+}
+
 static inline void save_kernel_fpu_regs(struct thread_struct *thread)
 {
        struct fpu *state = &thread->kfpu;
index 733e188951b78ed529becee21f47e80d38e3aae5..62e9befe7890a2806d0515ad3660eddde6d7946c 100644 (file)
@@ -19,41 +19,41 @@ void __kernel_fpu_begin(struct kernel_fpu *state, int flags)
         * Limit the save to the FPU/vector registers already
         * in use by the previous context.
         */
-       flags &= state->mask;
+       flags &= state->hdr.mask;
        if (flags & KERNEL_FPC)
-               fpu_stfpc(&state->fpc);
+               fpu_stfpc(&state->hdr.fpc);
        if (!cpu_has_vx()) {
                if (flags & KERNEL_VXR_LOW)
-                       save_fp_regs_vx(state->vxrs);
+                       save_fp_regs_vx(vxrs);
                return;
        }
        mask = flags & KERNEL_VXR;
        if (mask == KERNEL_VXR) {
-               fpu_vstm(0, 15, &vxrs[0]);
-               fpu_vstm(16, 31, &vxrs[16]);
+               vxrs += fpu_vstm(0, 15, vxrs);
+               vxrs += fpu_vstm(16, 31, vxrs);
                return;
        }
        if (mask == KERNEL_VXR_MID) {
-               fpu_vstm(8, 23, &vxrs[8]);
+               vxrs += fpu_vstm(8, 23, vxrs);
                return;
        }
        mask = flags & KERNEL_VXR_LOW;
        if (mask) {
                if (mask == KERNEL_VXR_LOW)
-                       fpu_vstm(0, 15, &vxrs[0]);
+                       vxrs += fpu_vstm(0, 15, vxrs);
                else if (mask == KERNEL_VXR_V0V7)
-                       fpu_vstm(0, 7, &vxrs[0]);
+                       vxrs += fpu_vstm(0, 7, vxrs);
                else
-                       fpu_vstm(8, 15, &vxrs[8]);
+                       vxrs += fpu_vstm(8, 15, vxrs);
        }
        mask = flags & KERNEL_VXR_HIGH;
        if (mask) {
                if (mask == KERNEL_VXR_HIGH)
-                       fpu_vstm(16, 31, &vxrs[16]);
+                       vxrs += fpu_vstm(16, 31, vxrs);
                else if (mask == KERNEL_VXR_V16V23)
-                       fpu_vstm(16, 23, &vxrs[16]);
+                       vxrs += fpu_vstm(16, 23, vxrs);
                else
-                       fpu_vstm(24, 31, &vxrs[24]);
+                       vxrs += fpu_vstm(24, 31, vxrs);
        }
 }
 EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -68,41 +68,41 @@ void __kernel_fpu_end(struct kernel_fpu *state, int flags)
         * previous context that have been overwritten by the
         * current context.
         */
-       flags &= state->mask;
+       flags &= state->hdr.mask;
        if (flags & KERNEL_FPC)
-               fpu_lfpc(&state->fpc);
+               fpu_lfpc(&state->hdr.fpc);
        if (!cpu_has_vx()) {
                if (flags & KERNEL_VXR_LOW)
-                       load_fp_regs_vx(state->vxrs);
+                       load_fp_regs_vx(vxrs);
                return;
        }
        mask = flags & KERNEL_VXR;
        if (mask == KERNEL_VXR) {
-               fpu_vlm(0, 15, &vxrs[0]);
-               fpu_vlm(16, 31, &vxrs[16]);
+               vxrs += fpu_vlm(0, 15, vxrs);
+               vxrs += fpu_vlm(16, 31, vxrs);
                return;
        }
        if (mask == KERNEL_VXR_MID) {
-               fpu_vlm(8, 23, &vxrs[8]);
+               vxrs += fpu_vlm(8, 23, vxrs);
                return;
        }
        mask = flags & KERNEL_VXR_LOW;
        if (mask) {
                if (mask == KERNEL_VXR_LOW)
-                       fpu_vlm(0, 15, &vxrs[0]);
+                       vxrs += fpu_vlm(0, 15, vxrs);
                else if (mask == KERNEL_VXR_V0V7)
-                       fpu_vlm(0, 7, &vxrs[0]);
+                       vxrs += fpu_vlm(0, 7, vxrs);
                else
-                       fpu_vlm(8, 15, &vxrs[8]);
+                       vxrs += fpu_vlm(8, 15, vxrs);
        }
        mask = flags & KERNEL_VXR_HIGH;
        if (mask) {
                if (mask == KERNEL_VXR_HIGH)
-                       fpu_vlm(16, 31, &vxrs[16]);
+                       vxrs += fpu_vlm(16, 31, vxrs);
                else if (mask == KERNEL_VXR_V16V23)
-                       fpu_vlm(16, 23, &vxrs[16]);
+                       vxrs += fpu_vlm(16, 23, vxrs);
                else
-                       fpu_vlm(24, 31, &vxrs[24]);
+                       vxrs += fpu_vlm(24, 31, vxrs);
        }
 }
 EXPORT_SYMBOL(__kernel_fpu_end);
index 061d45cf02618d5a8434f121c7a543c9bdf34f3e..4cd6428bfab2a497d20c528ac176fc3a69fce1e7 100644 (file)
@@ -426,7 +426,7 @@ subsys_initcall(create_proc_service_level);
  */
 void s390_adjust_jiffies(void)
 {
-       DECLARE_KERNEL_FPU_ONSTACK(fpu);
+       DECLARE_KERNEL_FPU_ONSTACK16(fpu);
        struct sysinfo_1_2_2 *info;
        unsigned long capability;
 
index 8467945344b52c48d2cc5f0cf4775ec99c0b8923..8c222b0dfbf2e7e68a953fd95fbee3a92cf84ae1 100644 (file)
@@ -5026,7 +5026,7 @@ static void store_regs(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 {
        struct kvm_run *kvm_run = vcpu->run;
-       DECLARE_KERNEL_FPU_ONSTACK(fpu);
+       DECLARE_KERNEL_FPU_ONSTACK32(fpu);
        int rc;
 
        /*
index bc2f4fbe5a8280107068a82e1f1741dc22173b1f..92c05b7596bcb4e4289635cd0da5f6b55622e9d8 100644 (file)
@@ -80,7 +80,7 @@ static inline void COPY_VEC(int x, int y)
 
 static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
 {
-       DECLARE_KERNEL_FPU_ONSTACK(vxstate);
+       DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
        u8 **dptr, *p, *q;
        int d, z, z0;
 
@@ -113,7 +113,7 @@ static void raid6_s390vx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
 static void raid6_s390vx$#_xor_syndrome(int disks, int start, int stop,
                                        size_t bytes, void **ptrs)
 {
-       DECLARE_KERNEL_FPU_ONSTACK(vxstate);
+       DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
        u8 **dptr, *p, *q;
        int d, z, z0;