lib/crypto: s390: Move arch/s390/lib/crypto/ into lib/crypto/
author    Eric Biggers <ebiggers@kernel.org>
          Thu, 19 Jun 2025 19:19:05 +0000 (12:19 -0700)
committer Eric Biggers <ebiggers@kernel.org>
          Mon, 30 Jun 2025 16:26:20 +0000 (09:26 -0700)
Move the contents of arch/s390/lib/crypto/ into lib/crypto/s390/.

The new code organization makes a lot more sense for how this code
actually works and is developed.  In particular, it makes it possible to
build each algorithm as a single module, with better inlining and dead
code elimination.  For a more detailed explanation, see the patchset
which did this for the CRC library code:
https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/.
Also see the patchset which did this for SHA-512:
https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/
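
To illustrate what "build each algorithm as a single module" can look like in
kbuild terms, here is a hedged sketch (illustrative only, not what this
preparatory commit does), in which the arch objects link into the library
module itself instead of forming a separate chacha_s390 module:

    obj-$(CONFIG_CRYPTO_LIB_CHACHA) += libchacha.o
    libchacha-y                     := chacha.o
    libchacha-$(CONFIG_S390)        += s390/chacha-glue.o s390/chacha-s390.o

With a layout like this the glue code can be inlined into the library and
unused code paths dropped at link time.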

This is just a preparatory commit, which does the move to get the files
into their new location but keeps them building the same way as before.
Later commits will make the actual improvements to the way the
arch-optimized code is integrated for each algorithm.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Link: https://lore.kernel.org/r/20250619191908.134235-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
15 files changed:
arch/s390/lib/Makefile
arch/s390/lib/crypto/Kconfig [deleted file]
arch/s390/lib/crypto/Makefile [deleted file]
arch/s390/lib/crypto/chacha-glue.c [deleted file]
arch/s390/lib/crypto/chacha-s390.S [deleted file]
arch/s390/lib/crypto/chacha-s390.h [deleted file]
arch/s390/lib/crypto/sha256.c [deleted file]
lib/crypto/Kconfig
lib/crypto/Makefile
lib/crypto/s390/Kconfig [new file with mode: 0644]
lib/crypto/s390/Makefile [new file with mode: 0644]
lib/crypto/s390/chacha-glue.c [new file with mode: 0644]
lib/crypto/s390/chacha-s390.S [new file with mode: 0644]
lib/crypto/s390/chacha-s390.h [new file with mode: 0644]
lib/crypto/s390/sha256.c [new file with mode: 0644]

diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index cd35cdbfa87134ab67ea0e0a9d490d4c95fe9b0e..271a1c407121c526bfda678c196833a1c88ff09e 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -3,7 +3,6 @@
 # Makefile for s390-specific library files..
 #
 
-obj-y += crypto/
 lib-y += delay.o string.o uaccess.o find.o spinlock.o tishift.o
 lib-y += csum-partial.o
 obj-y += mem.o xor.o
diff --git a/arch/s390/lib/crypto/Kconfig b/arch/s390/lib/crypto/Kconfig
deleted file mode 100644
index e3f855e..0000000
--- a/arch/s390/lib/crypto/Kconfig
+++ /dev/null
@@ -1,13 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA_S390
-       tristate
-       default CRYPTO_LIB_CHACHA
-       select CRYPTO_LIB_CHACHA_GENERIC
-       select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_SHA256_S390
-       tristate
-       default CRYPTO_LIB_SHA256
-       select CRYPTO_ARCH_HAVE_LIB_SHA256
-       select CRYPTO_LIB_SHA256_GENERIC
diff --git a/arch/s390/lib/crypto/Makefile b/arch/s390/lib/crypto/Makefile
deleted file mode 100644
index 5df30f1..0000000
--- a/arch/s390/lib/crypto/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
-chacha_s390-y := chacha-glue.o chacha-s390.o
-
-obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256-s390.o
-sha256-s390-y := sha256.o
diff --git a/arch/s390/lib/crypto/chacha-glue.c b/arch/s390/lib/crypto/chacha-glue.c
deleted file mode 100644
index f95ba34..0000000
--- a/arch/s390/lib/crypto/chacha-glue.c
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ChaCha stream cipher (s390 optimized)
- *
- * Copyright IBM Corp. 2021
- */
-
-#define KMSG_COMPONENT "chacha_s390"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
-#include <crypto/chacha.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/sizes.h>
-#include <asm/fpu.h>
-#include "chacha-s390.h"
-
-void hchacha_block_arch(const struct chacha_state *state,
-                       u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
-       /* TODO: implement hchacha_block_arch() in assembly */
-       hchacha_block_generic(state, out, nrounds);
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
-                      unsigned int bytes, int nrounds)
-{
-       /* The s390 ChaCha20 implementation has 20 rounds hard-coded;
-        * it cannot handle one block of data or less, but otherwise
-        * it can handle data of arbitrary size.
-        */
-       if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) {
-               chacha_crypt_generic(state, dst, src, bytes, nrounds);
-       } else {
-               DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
-
-               kernel_fpu_begin(&vxstate, KERNEL_VXR);
-               chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]);
-               kernel_fpu_end(&vxstate, KERNEL_VXR);
-
-               state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) /
-                               CHACHA_BLOCK_SIZE;
-       }
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
-       return cpu_has_vx();
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/s390/lib/crypto/chacha-s390.S b/arch/s390/lib/crypto/chacha-s390.S
deleted file mode 100644
index 63f3102..0000000
--- a/arch/s390/lib/crypto/chacha-s390.S
+++ /dev/null
@@ -1,908 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Original implementation written by Andy Polyakov, @dot-asm.
- * This is an adaptation of the original code for kernel use.
- *
- * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-#include <asm/nospec-insn.h>
-#include <asm/fpu-insn.h>
-
-#define SP     %r15
-#define FRAME  (16 * 8 + 4 * 8)
-
-       .data
-       .balign 32
-
-SYM_DATA_START_LOCAL(sigma)
-       .long   0x61707865,0x3320646e,0x79622d32,0x6b206574     # endian-neutral
-       .long   1,0,0,0
-       .long   2,0,0,0
-       .long   3,0,0,0
-       .long   0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c     # byte swap
-
-       .long   0,1,2,3
-       .long   0x61707865,0x61707865,0x61707865,0x61707865     # smashed sigma
-       .long   0x3320646e,0x3320646e,0x3320646e,0x3320646e
-       .long   0x79622d32,0x79622d32,0x79622d32,0x79622d32
-       .long   0x6b206574,0x6b206574,0x6b206574,0x6b206574
-SYM_DATA_END(sigma)
-
-       .previous
-
-       GEN_BR_THUNK %r14
-
-       .text
-
-#############################################################################
-# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
-#                    const u32 *key, const u32 *counter)
-
-#define        OUT             %r2
-#define        INP             %r3
-#define        LEN             %r4
-#define        KEY             %r5
-#define        COUNTER         %r6
-
-#define BEPERM         %v31
-#define CTR            %v26
-
-#define K0             %v16
-#define K1             %v17
-#define K2             %v18
-#define K3             %v19
-
-#define XA0            %v0
-#define XA1            %v1
-#define XA2            %v2
-#define XA3            %v3
-
-#define XB0            %v4
-#define XB1            %v5
-#define XB2            %v6
-#define XB3            %v7
-
-#define XC0            %v8
-#define XC1            %v9
-#define XC2            %v10
-#define XC3            %v11
-
-#define XD0            %v12
-#define XD1            %v13
-#define XD2            %v14
-#define XD3            %v15
-
-#define XT0            %v27
-#define XT1            %v28
-#define XT2            %v29
-#define XT3            %v30
-
-SYM_FUNC_START(chacha20_vx_4x)
-       stmg    %r6,%r7,6*8(SP)
-
-       larl    %r7,sigma
-       lhi     %r0,10
-       lhi     %r1,0
-
-       VL      K0,0,,%r7               # load sigma
-       VL      K1,0,,KEY               # load key
-       VL      K2,16,,KEY
-       VL      K3,0,,COUNTER           # load counter
-
-       VL      BEPERM,0x40,,%r7
-       VL      CTR,0x50,,%r7
-
-       VLM     XA0,XA3,0x60,%r7,4      # load [smashed] sigma
-
-       VREPF   XB0,K1,0                # smash the key
-       VREPF   XB1,K1,1
-       VREPF   XB2,K1,2
-       VREPF   XB3,K1,3
-
-       VREPF   XD0,K3,0
-       VREPF   XD1,K3,1
-       VREPF   XD2,K3,2
-       VREPF   XD3,K3,3
-       VAF     XD0,XD0,CTR
-
-       VREPF   XC0,K2,0
-       VREPF   XC1,K2,1
-       VREPF   XC2,K2,2
-       VREPF   XC3,K2,3
-
-.Loop_4x:
-       VAF     XA0,XA0,XB0
-       VX      XD0,XD0,XA0
-       VERLLF  XD0,XD0,16
-
-       VAF     XA1,XA1,XB1
-       VX      XD1,XD1,XA1
-       VERLLF  XD1,XD1,16
-
-       VAF     XA2,XA2,XB2
-       VX      XD2,XD2,XA2
-       VERLLF  XD2,XD2,16
-
-       VAF     XA3,XA3,XB3
-       VX      XD3,XD3,XA3
-       VERLLF  XD3,XD3,16
-
-       VAF     XC0,XC0,XD0
-       VX      XB0,XB0,XC0
-       VERLLF  XB0,XB0,12
-
-       VAF     XC1,XC1,XD1
-       VX      XB1,XB1,XC1
-       VERLLF  XB1,XB1,12
-
-       VAF     XC2,XC2,XD2
-       VX      XB2,XB2,XC2
-       VERLLF  XB2,XB2,12
-
-       VAF     XC3,XC3,XD3
-       VX      XB3,XB3,XC3
-       VERLLF  XB3,XB3,12
-
-       VAF     XA0,XA0,XB0
-       VX      XD0,XD0,XA0
-       VERLLF  XD0,XD0,8
-
-       VAF     XA1,XA1,XB1
-       VX      XD1,XD1,XA1
-       VERLLF  XD1,XD1,8
-
-       VAF     XA2,XA2,XB2
-       VX      XD2,XD2,XA2
-       VERLLF  XD2,XD2,8
-
-       VAF     XA3,XA3,XB3
-       VX      XD3,XD3,XA3
-       VERLLF  XD3,XD3,8
-
-       VAF     XC0,XC0,XD0
-       VX      XB0,XB0,XC0
-       VERLLF  XB0,XB0,7
-
-       VAF     XC1,XC1,XD1
-       VX      XB1,XB1,XC1
-       VERLLF  XB1,XB1,7
-
-       VAF     XC2,XC2,XD2
-       VX      XB2,XB2,XC2
-       VERLLF  XB2,XB2,7
-
-       VAF     XC3,XC3,XD3
-       VX      XB3,XB3,XC3
-       VERLLF  XB3,XB3,7
-
-       VAF     XA0,XA0,XB1
-       VX      XD3,XD3,XA0
-       VERLLF  XD3,XD3,16
-
-       VAF     XA1,XA1,XB2
-       VX      XD0,XD0,XA1
-       VERLLF  XD0,XD0,16
-
-       VAF     XA2,XA2,XB3
-       VX      XD1,XD1,XA2
-       VERLLF  XD1,XD1,16
-
-       VAF     XA3,XA3,XB0
-       VX      XD2,XD2,XA3
-       VERLLF  XD2,XD2,16
-
-       VAF     XC2,XC2,XD3
-       VX      XB1,XB1,XC2
-       VERLLF  XB1,XB1,12
-
-       VAF     XC3,XC3,XD0
-       VX      XB2,XB2,XC3
-       VERLLF  XB2,XB2,12
-
-       VAF     XC0,XC0,XD1
-       VX      XB3,XB3,XC0
-       VERLLF  XB3,XB3,12
-
-       VAF     XC1,XC1,XD2
-       VX      XB0,XB0,XC1
-       VERLLF  XB0,XB0,12
-
-       VAF     XA0,XA0,XB1
-       VX      XD3,XD3,XA0
-       VERLLF  XD3,XD3,8
-
-       VAF     XA1,XA1,XB2
-       VX      XD0,XD0,XA1
-       VERLLF  XD0,XD0,8
-
-       VAF     XA2,XA2,XB3
-       VX      XD1,XD1,XA2
-       VERLLF  XD1,XD1,8
-
-       VAF     XA3,XA3,XB0
-       VX      XD2,XD2,XA3
-       VERLLF  XD2,XD2,8
-
-       VAF     XC2,XC2,XD3
-       VX      XB1,XB1,XC2
-       VERLLF  XB1,XB1,7
-
-       VAF     XC3,XC3,XD0
-       VX      XB2,XB2,XC3
-       VERLLF  XB2,XB2,7
-
-       VAF     XC0,XC0,XD1
-       VX      XB3,XB3,XC0
-       VERLLF  XB3,XB3,7
-
-       VAF     XC1,XC1,XD2
-       VX      XB0,XB0,XC1
-       VERLLF  XB0,XB0,7
-       brct    %r0,.Loop_4x
-
-       VAF     XD0,XD0,CTR
-
-       VMRHF   XT0,XA0,XA1             # transpose data
-       VMRHF   XT1,XA2,XA3
-       VMRLF   XT2,XA0,XA1
-       VMRLF   XT3,XA2,XA3
-       VPDI    XA0,XT0,XT1,0b0000
-       VPDI    XA1,XT0,XT1,0b0101
-       VPDI    XA2,XT2,XT3,0b0000
-       VPDI    XA3,XT2,XT3,0b0101
-
-       VMRHF   XT0,XB0,XB1
-       VMRHF   XT1,XB2,XB3
-       VMRLF   XT2,XB0,XB1
-       VMRLF   XT3,XB2,XB3
-       VPDI    XB0,XT0,XT1,0b0000
-       VPDI    XB1,XT0,XT1,0b0101
-       VPDI    XB2,XT2,XT3,0b0000
-       VPDI    XB3,XT2,XT3,0b0101
-
-       VMRHF   XT0,XC0,XC1
-       VMRHF   XT1,XC2,XC3
-       VMRLF   XT2,XC0,XC1
-       VMRLF   XT3,XC2,XC3
-       VPDI    XC0,XT0,XT1,0b0000
-       VPDI    XC1,XT0,XT1,0b0101
-       VPDI    XC2,XT2,XT3,0b0000
-       VPDI    XC3,XT2,XT3,0b0101
-
-       VMRHF   XT0,XD0,XD1
-       VMRHF   XT1,XD2,XD3
-       VMRLF   XT2,XD0,XD1
-       VMRLF   XT3,XD2,XD3
-       VPDI    XD0,XT0,XT1,0b0000
-       VPDI    XD1,XT0,XT1,0b0101
-       VPDI    XD2,XT2,XT3,0b0000
-       VPDI    XD3,XT2,XT3,0b0101
-
-       VAF     XA0,XA0,K0
-       VAF     XB0,XB0,K1
-       VAF     XC0,XC0,K2
-       VAF     XD0,XD0,K3
-
-       VPERM   XA0,XA0,XA0,BEPERM
-       VPERM   XB0,XB0,XB0,BEPERM
-       VPERM   XC0,XC0,XC0,BEPERM
-       VPERM   XD0,XD0,XD0,BEPERM
-
-       VLM     XT0,XT3,0,INP,0
-
-       VX      XT0,XT0,XA0
-       VX      XT1,XT1,XB0
-       VX      XT2,XT2,XC0
-       VX      XT3,XT3,XD0
-
-       VSTM    XT0,XT3,0,OUT,0
-
-       la      INP,0x40(INP)
-       la      OUT,0x40(OUT)
-       aghi    LEN,-0x40
-
-       VAF     XA0,XA1,K0
-       VAF     XB0,XB1,K1
-       VAF     XC0,XC1,K2
-       VAF     XD0,XD1,K3
-
-       VPERM   XA0,XA0,XA0,BEPERM
-       VPERM   XB0,XB0,XB0,BEPERM
-       VPERM   XC0,XC0,XC0,BEPERM
-       VPERM   XD0,XD0,XD0,BEPERM
-
-       clgfi   LEN,0x40
-       jl      .Ltail_4x
-
-       VLM     XT0,XT3,0,INP,0
-
-       VX      XT0,XT0,XA0
-       VX      XT1,XT1,XB0
-       VX      XT2,XT2,XC0
-       VX      XT3,XT3,XD0
-
-       VSTM    XT0,XT3,0,OUT,0
-
-       la      INP,0x40(INP)
-       la      OUT,0x40(OUT)
-       aghi    LEN,-0x40
-       je      .Ldone_4x
-
-       VAF     XA0,XA2,K0
-       VAF     XB0,XB2,K1
-       VAF     XC0,XC2,K2
-       VAF     XD0,XD2,K3
-
-       VPERM   XA0,XA0,XA0,BEPERM
-       VPERM   XB0,XB0,XB0,BEPERM
-       VPERM   XC0,XC0,XC0,BEPERM
-       VPERM   XD0,XD0,XD0,BEPERM
-
-       clgfi   LEN,0x40
-       jl      .Ltail_4x
-
-       VLM     XT0,XT3,0,INP,0
-
-       VX      XT0,XT0,XA0
-       VX      XT1,XT1,XB0
-       VX      XT2,XT2,XC0
-       VX      XT3,XT3,XD0
-
-       VSTM    XT0,XT3,0,OUT,0
-
-       la      INP,0x40(INP)
-       la      OUT,0x40(OUT)
-       aghi    LEN,-0x40
-       je      .Ldone_4x
-
-       VAF     XA0,XA3,K0
-       VAF     XB0,XB3,K1
-       VAF     XC0,XC3,K2
-       VAF     XD0,XD3,K3
-
-       VPERM   XA0,XA0,XA0,BEPERM
-       VPERM   XB0,XB0,XB0,BEPERM
-       VPERM   XC0,XC0,XC0,BEPERM
-       VPERM   XD0,XD0,XD0,BEPERM
-
-       clgfi   LEN,0x40
-       jl      .Ltail_4x
-
-       VLM     XT0,XT3,0,INP,0
-
-       VX      XT0,XT0,XA0
-       VX      XT1,XT1,XB0
-       VX      XT2,XT2,XC0
-       VX      XT3,XT3,XD0
-
-       VSTM    XT0,XT3,0,OUT,0
-
-.Ldone_4x:
-       lmg     %r6,%r7,6*8(SP)
-       BR_EX   %r14
-
-.Ltail_4x:
-       VLR     XT0,XC0
-       VLR     XT1,XD0
-
-       VST     XA0,8*8+0x00,,SP
-       VST     XB0,8*8+0x10,,SP
-       VST     XT0,8*8+0x20,,SP
-       VST     XT1,8*8+0x30,,SP
-
-       lghi    %r1,0
-
-.Loop_tail_4x:
-       llgc    %r5,0(%r1,INP)
-       llgc    %r6,8*8(%r1,SP)
-       xr      %r6,%r5
-       stc     %r6,0(%r1,OUT)
-       la      %r1,1(%r1)
-       brct    LEN,.Loop_tail_4x
-
-       lmg     %r6,%r7,6*8(SP)
-       BR_EX   %r14
-SYM_FUNC_END(chacha20_vx_4x)
-
-#undef OUT
-#undef INP
-#undef LEN
-#undef KEY
-#undef COUNTER
-
-#undef BEPERM
-
-#undef K0
-#undef K1
-#undef K2
-#undef K3
-
-
-#############################################################################
-# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
-#                 const u32 *key, const u32 *counter)
-
-#define        OUT             %r2
-#define        INP             %r3
-#define        LEN             %r4
-#define        KEY             %r5
-#define        COUNTER         %r6
-
-#define BEPERM         %v31
-
-#define K0             %v27
-#define K1             %v24
-#define K2             %v25
-#define K3             %v26
-
-#define A0             %v0
-#define B0             %v1
-#define C0             %v2
-#define D0             %v3
-
-#define A1             %v4
-#define B1             %v5
-#define C1             %v6
-#define D1             %v7
-
-#define A2             %v8
-#define B2             %v9
-#define C2             %v10
-#define D2             %v11
-
-#define A3             %v12
-#define B3             %v13
-#define C3             %v14
-#define D3             %v15
-
-#define A4             %v16
-#define B4             %v17
-#define C4             %v18
-#define D4             %v19
-
-#define A5             %v20
-#define B5             %v21
-#define C5             %v22
-#define D5             %v23
-
-#define T0             %v27
-#define T1             %v28
-#define T2             %v29
-#define T3             %v30
-
-SYM_FUNC_START(chacha20_vx)
-       clgfi   LEN,256
-       jle     chacha20_vx_4x
-       stmg    %r6,%r7,6*8(SP)
-
-       lghi    %r1,-FRAME
-       lgr     %r0,SP
-       la      SP,0(%r1,SP)
-       stg     %r0,0(SP)               # back-chain
-
-       larl    %r7,sigma
-       lhi     %r0,10
-
-       VLM     K1,K2,0,KEY,0           # load key
-       VL      K3,0,,COUNTER           # load counter
-
-       VLM     K0,BEPERM,0,%r7,4       # load sigma, increments, ...
-
-.Loop_outer_vx:
-       VLR     A0,K0
-       VLR     B0,K1
-       VLR     A1,K0
-       VLR     B1,K1
-       VLR     A2,K0
-       VLR     B2,K1
-       VLR     A3,K0
-       VLR     B3,K1
-       VLR     A4,K0
-       VLR     B4,K1
-       VLR     A5,K0
-       VLR     B5,K1
-
-       VLR     D0,K3
-       VAF     D1,K3,T1                # K[3]+1
-       VAF     D2,K3,T2                # K[3]+2
-       VAF     D3,K3,T3                # K[3]+3
-       VAF     D4,D2,T2                # K[3]+4
-       VAF     D5,D2,T3                # K[3]+5
-
-       VLR     C0,K2
-       VLR     C1,K2
-       VLR     C2,K2
-       VLR     C3,K2
-       VLR     C4,K2
-       VLR     C5,K2
-
-       VLR     T1,D1
-       VLR     T2,D2
-       VLR     T3,D3
-
-.Loop_vx:
-       VAF     A0,A0,B0
-       VAF     A1,A1,B1
-       VAF     A2,A2,B2
-       VAF     A3,A3,B3
-       VAF     A4,A4,B4
-       VAF     A5,A5,B5
-       VX      D0,D0,A0
-       VX      D1,D1,A1
-       VX      D2,D2,A2
-       VX      D3,D3,A3
-       VX      D4,D4,A4
-       VX      D5,D5,A5
-       VERLLF  D0,D0,16
-       VERLLF  D1,D1,16
-       VERLLF  D2,D2,16
-       VERLLF  D3,D3,16
-       VERLLF  D4,D4,16
-       VERLLF  D5,D5,16
-
-       VAF     C0,C0,D0
-       VAF     C1,C1,D1
-       VAF     C2,C2,D2
-       VAF     C3,C3,D3
-       VAF     C4,C4,D4
-       VAF     C5,C5,D5
-       VX      B0,B0,C0
-       VX      B1,B1,C1
-       VX      B2,B2,C2
-       VX      B3,B3,C3
-       VX      B4,B4,C4
-       VX      B5,B5,C5
-       VERLLF  B0,B0,12
-       VERLLF  B1,B1,12
-       VERLLF  B2,B2,12
-       VERLLF  B3,B3,12
-       VERLLF  B4,B4,12
-       VERLLF  B5,B5,12
-
-       VAF     A0,A0,B0
-       VAF     A1,A1,B1
-       VAF     A2,A2,B2
-       VAF     A3,A3,B3
-       VAF     A4,A4,B4
-       VAF     A5,A5,B5
-       VX      D0,D0,A0
-       VX      D1,D1,A1
-       VX      D2,D2,A2
-       VX      D3,D3,A3
-       VX      D4,D4,A4
-       VX      D5,D5,A5
-       VERLLF  D0,D0,8
-       VERLLF  D1,D1,8
-       VERLLF  D2,D2,8
-       VERLLF  D3,D3,8
-       VERLLF  D4,D4,8
-       VERLLF  D5,D5,8
-
-       VAF     C0,C0,D0
-       VAF     C1,C1,D1
-       VAF     C2,C2,D2
-       VAF     C3,C3,D3
-       VAF     C4,C4,D4
-       VAF     C5,C5,D5
-       VX      B0,B0,C0
-       VX      B1,B1,C1
-       VX      B2,B2,C2
-       VX      B3,B3,C3
-       VX      B4,B4,C4
-       VX      B5,B5,C5
-       VERLLF  B0,B0,7
-       VERLLF  B1,B1,7
-       VERLLF  B2,B2,7
-       VERLLF  B3,B3,7
-       VERLLF  B4,B4,7
-       VERLLF  B5,B5,7
-
-       VSLDB   C0,C0,C0,8
-       VSLDB   C1,C1,C1,8
-       VSLDB   C2,C2,C2,8
-       VSLDB   C3,C3,C3,8
-       VSLDB   C4,C4,C4,8
-       VSLDB   C5,C5,C5,8
-       VSLDB   B0,B0,B0,4
-       VSLDB   B1,B1,B1,4
-       VSLDB   B2,B2,B2,4
-       VSLDB   B3,B3,B3,4
-       VSLDB   B4,B4,B4,4
-       VSLDB   B5,B5,B5,4
-       VSLDB   D0,D0,D0,12
-       VSLDB   D1,D1,D1,12
-       VSLDB   D2,D2,D2,12
-       VSLDB   D3,D3,D3,12
-       VSLDB   D4,D4,D4,12
-       VSLDB   D5,D5,D5,12
-
-       VAF     A0,A0,B0
-       VAF     A1,A1,B1
-       VAF     A2,A2,B2
-       VAF     A3,A3,B3
-       VAF     A4,A4,B4
-       VAF     A5,A5,B5
-       VX      D0,D0,A0
-       VX      D1,D1,A1
-       VX      D2,D2,A2
-       VX      D3,D3,A3
-       VX      D4,D4,A4
-       VX      D5,D5,A5
-       VERLLF  D0,D0,16
-       VERLLF  D1,D1,16
-       VERLLF  D2,D2,16
-       VERLLF  D3,D3,16
-       VERLLF  D4,D4,16
-       VERLLF  D5,D5,16
-
-       VAF     C0,C0,D0
-       VAF     C1,C1,D1
-       VAF     C2,C2,D2
-       VAF     C3,C3,D3
-       VAF     C4,C4,D4
-       VAF     C5,C5,D5
-       VX      B0,B0,C0
-       VX      B1,B1,C1
-       VX      B2,B2,C2
-       VX      B3,B3,C3
-       VX      B4,B4,C4
-       VX      B5,B5,C5
-       VERLLF  B0,B0,12
-       VERLLF  B1,B1,12
-       VERLLF  B2,B2,12
-       VERLLF  B3,B3,12
-       VERLLF  B4,B4,12
-       VERLLF  B5,B5,12
-
-       VAF     A0,A0,B0
-       VAF     A1,A1,B1
-       VAF     A2,A2,B2
-       VAF     A3,A3,B3
-       VAF     A4,A4,B4
-       VAF     A5,A5,B5
-       VX      D0,D0,A0
-       VX      D1,D1,A1
-       VX      D2,D2,A2
-       VX      D3,D3,A3
-       VX      D4,D4,A4
-       VX      D5,D5,A5
-       VERLLF  D0,D0,8
-       VERLLF  D1,D1,8
-       VERLLF  D2,D2,8
-       VERLLF  D3,D3,8
-       VERLLF  D4,D4,8
-       VERLLF  D5,D5,8
-
-       VAF     C0,C0,D0
-       VAF     C1,C1,D1
-       VAF     C2,C2,D2
-       VAF     C3,C3,D3
-       VAF     C4,C4,D4
-       VAF     C5,C5,D5
-       VX      B0,B0,C0
-       VX      B1,B1,C1
-       VX      B2,B2,C2
-       VX      B3,B3,C3
-       VX      B4,B4,C4
-       VX      B5,B5,C5
-       VERLLF  B0,B0,7
-       VERLLF  B1,B1,7
-       VERLLF  B2,B2,7
-       VERLLF  B3,B3,7
-       VERLLF  B4,B4,7
-       VERLLF  B5,B5,7
-
-       VSLDB   C0,C0,C0,8
-       VSLDB   C1,C1,C1,8
-       VSLDB   C2,C2,C2,8
-       VSLDB   C3,C3,C3,8
-       VSLDB   C4,C4,C4,8
-       VSLDB   C5,C5,C5,8
-       VSLDB   B0,B0,B0,12
-       VSLDB   B1,B1,B1,12
-       VSLDB   B2,B2,B2,12
-       VSLDB   B3,B3,B3,12
-       VSLDB   B4,B4,B4,12
-       VSLDB   B5,B5,B5,12
-       VSLDB   D0,D0,D0,4
-       VSLDB   D1,D1,D1,4
-       VSLDB   D2,D2,D2,4
-       VSLDB   D3,D3,D3,4
-       VSLDB   D4,D4,D4,4
-       VSLDB   D5,D5,D5,4
-       brct    %r0,.Loop_vx
-
-       VAF     A0,A0,K0
-       VAF     B0,B0,K1
-       VAF     C0,C0,K2
-       VAF     D0,D0,K3
-       VAF     A1,A1,K0
-       VAF     D1,D1,T1                # +K[3]+1
-
-       VPERM   A0,A0,A0,BEPERM
-       VPERM   B0,B0,B0,BEPERM
-       VPERM   C0,C0,C0,BEPERM
-       VPERM   D0,D0,D0,BEPERM
-
-       clgfi   LEN,0x40
-       jl      .Ltail_vx
-
-       VAF     D2,D2,T2                # +K[3]+2
-       VAF     D3,D3,T3                # +K[3]+3
-       VLM     T0,T3,0,INP,0
-
-       VX      A0,A0,T0
-       VX      B0,B0,T1
-       VX      C0,C0,T2
-       VX      D0,D0,T3
-
-       VLM     K0,T3,0,%r7,4           # re-load sigma and increments
-
-       VSTM    A0,D0,0,OUT,0
-
-       la      INP,0x40(INP)
-       la      OUT,0x40(OUT)
-       aghi    LEN,-0x40
-       je      .Ldone_vx
-
-       VAF     B1,B1,K1
-       VAF     C1,C1,K2
-
-       VPERM   A0,A1,A1,BEPERM
-       VPERM   B0,B1,B1,BEPERM
-       VPERM   C0,C1,C1,BEPERM
-       VPERM   D0,D1,D1,BEPERM
-
-       clgfi   LEN,0x40
-       jl      .Ltail_vx
-
-       VLM     A1,D1,0,INP,0
-
-       VX      A0,A0,A1
-       VX      B0,B0,B1
-       VX      C0,C0,C1
-       VX      D0,D0,D1
-
-       VSTM    A0,D0,0,OUT,0
-
-       la      INP,0x40(INP)
-       la      OUT,0x40(OUT)
-       aghi    LEN,-0x40
-       je      .Ldone_vx
-
-       VAF     A2,A2,K0
-       VAF     B2,B2,K1
-       VAF     C2,C2,K2
-
-       VPERM   A0,A2,A2,BEPERM
-       VPERM   B0,B2,B2,BEPERM
-       VPERM   C0,C2,C2,BEPERM
-       VPERM   D0,D2,D2,BEPERM
-
-       clgfi   LEN,0x40
-       jl      .Ltail_vx
-
-       VLM     A1,D1,0,INP,0
-
-       VX      A0,A0,A1
-       VX      B0,B0,B1
-       VX      C0,C0,C1
-       VX      D0,D0,D1
-
-       VSTM    A0,D0,0,OUT,0
-
-       la      INP,0x40(INP)
-       la      OUT,0x40(OUT)
-       aghi    LEN,-0x40
-       je      .Ldone_vx
-
-       VAF     A3,A3,K0
-       VAF     B3,B3,K1
-       VAF     C3,C3,K2
-       VAF     D2,K3,T3                # K[3]+3
-
-       VPERM   A0,A3,A3,BEPERM
-       VPERM   B0,B3,B3,BEPERM
-       VPERM   C0,C3,C3,BEPERM
-       VPERM   D0,D3,D3,BEPERM
-
-       clgfi   LEN,0x40
-       jl      .Ltail_vx
-
-       VAF     D3,D2,T1                # K[3]+4
-       VLM     A1,D1,0,INP,0
-
-       VX      A0,A0,A1
-       VX      B0,B0,B1
-       VX      C0,C0,C1
-       VX      D0,D0,D1
-
-       VSTM    A0,D0,0,OUT,0
-
-       la      INP,0x40(INP)
-       la      OUT,0x40(OUT)
-       aghi    LEN,-0x40
-       je      .Ldone_vx
-
-       VAF     A4,A4,K0
-       VAF     B4,B4,K1
-       VAF     C4,C4,K2
-       VAF     D4,D4,D3                # +K[3]+4
-       VAF     D3,D3,T1                # K[3]+5
-       VAF     K3,D2,T3                # K[3]+=6
-
-       VPERM   A0,A4,A4,BEPERM
-       VPERM   B0,B4,B4,BEPERM
-       VPERM   C0,C4,C4,BEPERM
-       VPERM   D0,D4,D4,BEPERM
-
-       clgfi   LEN,0x40
-       jl      .Ltail_vx
-
-       VLM     A1,D1,0,INP,0
-
-       VX      A0,A0,A1
-       VX      B0,B0,B1
-       VX      C0,C0,C1
-       VX      D0,D0,D1
-
-       VSTM    A0,D0,0,OUT,0
-
-       la      INP,0x40(INP)
-       la      OUT,0x40(OUT)
-       aghi    LEN,-0x40
-       je      .Ldone_vx
-
-       VAF     A5,A5,K0
-       VAF     B5,B5,K1
-       VAF     C5,C5,K2
-       VAF     D5,D5,D3                # +K[3]+5
-
-       VPERM   A0,A5,A5,BEPERM
-       VPERM   B0,B5,B5,BEPERM
-       VPERM   C0,C5,C5,BEPERM
-       VPERM   D0,D5,D5,BEPERM
-
-       clgfi   LEN,0x40
-       jl      .Ltail_vx
-
-       VLM     A1,D1,0,INP,0
-
-       VX      A0,A0,A1
-       VX      B0,B0,B1
-       VX      C0,C0,C1
-       VX      D0,D0,D1
-
-       VSTM    A0,D0,0,OUT,0
-
-       la      INP,0x40(INP)
-       la      OUT,0x40(OUT)
-       lhi     %r0,10
-       aghi    LEN,-0x40
-       jne     .Loop_outer_vx
-
-.Ldone_vx:
-       lmg     %r6,%r7,FRAME+6*8(SP)
-       la      SP,FRAME(SP)
-       BR_EX   %r14
-
-.Ltail_vx:
-       VSTM    A0,D0,8*8,SP,3
-       lghi    %r1,0
-
-.Loop_tail_vx:
-       llgc    %r5,0(%r1,INP)
-       llgc    %r6,8*8(%r1,SP)
-       xr      %r6,%r5
-       stc     %r6,0(%r1,OUT)
-       la      %r1,1(%r1)
-       brct    LEN,.Loop_tail_vx
-
-       lmg     %r6,%r7,FRAME+6*8(SP)
-       la      SP,FRAME(SP)
-       BR_EX   %r14
-SYM_FUNC_END(chacha20_vx)
-
-.previous
diff --git a/arch/s390/lib/crypto/chacha-s390.h b/arch/s390/lib/crypto/chacha-s390.h
deleted file mode 100644
index 733744c..0000000
--- a/arch/s390/lib/crypto/chacha-s390.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * s390 ChaCha stream cipher.
- *
- * Copyright IBM Corp. 2021
- */
-
-#ifndef _CHACHA_S390_H
-#define _CHACHA_S390_H
-
-void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key,
-                const u32 *counter);
-
-#endif /* _CHACHA_S390_H */
diff --git a/arch/s390/lib/crypto/sha256.c b/arch/s390/lib/crypto/sha256.c
deleted file mode 100644
index 7dfe120..0000000
--- a/arch/s390/lib/crypto/sha256.c
+++ /dev/null
@@ -1,47 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF)
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/cpacf.h>
-#include <crypto/internal/sha2.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256);
-
-void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
-                       const u8 *data, size_t nblocks)
-{
-       if (static_branch_likely(&have_cpacf_sha256))
-               cpacf_kimd(CPACF_KIMD_SHA_256, state, data,
-                          nblocks * SHA256_BLOCK_SIZE);
-       else
-               sha256_blocks_generic(state, data, nblocks);
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-
-bool sha256_is_arch_optimized(void)
-{
-       return static_key_enabled(&have_cpacf_sha256);
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_s390_mod_init(void)
-{
-       if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
-           cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))
-               static_branch_enable(&have_cpacf_sha256);
-       return 0;
-}
-subsys_initcall(sha256_s390_mod_init);
-
-static void __exit sha256_s390_mod_exit(void)
-{
-}
-module_exit(sha256_s390_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 using the CP Assist for Cryptographic Functions (CPACF)");
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index a2b58ca2df0cc5a7cc9c6374fe9ab6cedc03f7f0..278b7ef5ec4f941a2e0bfdc92ed15d3bdb86aad5 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -205,7 +205,7 @@ if RISCV
 source "lib/crypto/riscv/Kconfig"
 endif
 if S390
-source "arch/s390/lib/crypto/Kconfig"
+source "lib/crypto/s390/Kconfig"
 endif
 if SPARC
 source "arch/sparc/lib/crypto/Kconfig"
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 05b7e29ea0e8fe58ee1c8e9951309cc67f364f1e..26f65bb4c8d81973da4706d522971bdeedde3d1b 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -112,3 +112,4 @@ obj-$(CONFIG_ARM64) += arm64/
 obj-$(CONFIG_MIPS) += mips/
 obj-$(CONFIG_PPC) += powerpc/
 obj-$(CONFIG_RISCV) += riscv/
+obj-$(CONFIG_S390) += s390/
diff --git a/lib/crypto/s390/Kconfig b/lib/crypto/s390/Kconfig
new file mode 100644
index 0000000..e3f855e
--- /dev/null
+++ b/lib/crypto/s390/Kconfig
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_CHACHA_S390
+       tristate
+       default CRYPTO_LIB_CHACHA
+       select CRYPTO_LIB_CHACHA_GENERIC
+       select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_SHA256_S390
+       tristate
+       default CRYPTO_LIB_SHA256
+       select CRYPTO_ARCH_HAVE_LIB_SHA256
+       select CRYPTO_LIB_SHA256_GENERIC
diff --git a/lib/crypto/s390/Makefile b/lib/crypto/s390/Makefile
new file mode 100644
index 0000000..5df30f1
--- /dev/null
+++ b/lib/crypto/s390/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_CHACHA_S390) += chacha_s390.o
+chacha_s390-y := chacha-glue.o chacha-s390.o
+
+obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256-s390.o
+sha256-s390-y := sha256.o
diff --git a/lib/crypto/s390/chacha-glue.c b/lib/crypto/s390/chacha-glue.c
new file mode 100644
index 0000000..f95ba34
--- /dev/null
+++ b/lib/crypto/s390/chacha-glue.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ChaCha stream cipher (s390 optimized)
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#define KMSG_COMPONENT "chacha_s390"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <crypto/chacha.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sizes.h>
+#include <asm/fpu.h>
+#include "chacha-s390.h"
+
+void hchacha_block_arch(const struct chacha_state *state,
+                       u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+       /* TODO: implement hchacha_block_arch() in assembly */
+       hchacha_block_generic(state, out, nrounds);
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+                      unsigned int bytes, int nrounds)
+{
+       /* The s390 ChaCha20 implementation has 20 rounds hard-coded;
+        * it cannot handle one block of data or less, but otherwise
+        * it can handle data of arbitrary size.
+        */
+       if (bytes <= CHACHA_BLOCK_SIZE || nrounds != 20 || !cpu_has_vx()) {
+               chacha_crypt_generic(state, dst, src, bytes, nrounds);
+       } else {
+               DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
+
+               kernel_fpu_begin(&vxstate, KERNEL_VXR);
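+               /* x[4..11] holds the key words; x[12..15] the counter and nonce */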
+               chacha20_vx(dst, src, bytes, &state->x[4], &state->x[12]);
+               kernel_fpu_end(&vxstate, KERNEL_VXR);
+
+               state->x[12] += round_up(bytes, CHACHA_BLOCK_SIZE) /
+                               CHACHA_BLOCK_SIZE;
+       }
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+       return cpu_has_vx();
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+MODULE_DESCRIPTION("ChaCha stream cipher (s390 optimized)");
+MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/s390/chacha-s390.S b/lib/crypto/s390/chacha-s390.S
new file mode 100644
index 0000000..63f3102
--- /dev/null
+++ b/lib/crypto/s390/chacha-s390.S
@@ -0,0 +1,909 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Original implementation written by Andy Polyakov, @dot-asm.
+ * This is an adaptation of the original code for kernel use.
+ *
+ * Copyright (C) 2006-2019 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+#include <linux/linkage.h>
+#include <asm/nospec-insn.h>
+#include <asm/fpu-insn.h>
+
+#define SP     %r15
+#define FRAME  (16 * 8 + 4 * 8)
+
+       .data
+       .balign 32
+
+SYM_DATA_START_LOCAL(sigma)
+       .long   0x61707865,0x3320646e,0x79622d32,0x6b206574     # endian-neutral
+       .long   1,0,0,0
+       .long   2,0,0,0
+       .long   3,0,0,0
+       .long   0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c     # byte swap
+
+       .long   0,1,2,3
+       .long   0x61707865,0x61707865,0x61707865,0x61707865     # smashed sigma
+       .long   0x3320646e,0x3320646e,0x3320646e,0x3320646e
+       .long   0x79622d32,0x79622d32,0x79622d32,0x79622d32
+       .long   0x6b206574,0x6b206574,0x6b206574,0x6b206574
+SYM_DATA_END(sigma)
+
+       .previous
+
+       GEN_BR_THUNK %r14
+
+       .text
+
+#############################################################################
+# void chacha20_vx_4x(u8 *out, const u8 *inp, size_t len,
+#                    const u32 *key, const u32 *counter)
+
+#define        OUT             %r2
+#define        INP             %r3
+#define        LEN             %r4
+#define        KEY             %r5
+#define        COUNTER         %r6
+
+#define BEPERM         %v31
+#define CTR            %v26
+
+#define K0             %v16
+#define K1             %v17
+#define K2             %v18
+#define K3             %v19
+
+#define XA0            %v0
+#define XA1            %v1
+#define XA2            %v2
+#define XA3            %v3
+
+#define XB0            %v4
+#define XB1            %v5
+#define XB2            %v6
+#define XB3            %v7
+
+#define XC0            %v8
+#define XC1            %v9
+#define XC2            %v10
+#define XC3            %v11
+
+#define XD0            %v12
+#define XD1            %v13
+#define XD2            %v14
+#define XD3            %v15
+
+#define XT0            %v27
+#define XT1            %v28
+#define XT2            %v29
+#define XT3            %v30
+
+SYM_FUNC_START(chacha20_vx_4x)
+       stmg    %r6,%r7,6*8(SP)
+
+       larl    %r7,sigma
+       lhi     %r0,10
+       lhi     %r1,0
+
+       VL      K0,0,,%r7               # load sigma
+       VL      K1,0,,KEY               # load key
+       VL      K2,16,,KEY
+       VL      K3,0,,COUNTER           # load counter
+
+       VL      BEPERM,0x40,,%r7
+       VL      CTR,0x50,,%r7
+
+       VLM     XA0,XA3,0x60,%r7,4      # load [smashed] sigma
+
+       VREPF   XB0,K1,0                # smash the key
+       VREPF   XB1,K1,1
+       VREPF   XB2,K1,2
+       VREPF   XB3,K1,3
+
+       VREPF   XD0,K3,0
+       VREPF   XD1,K3,1
+       VREPF   XD2,K3,2
+       VREPF   XD3,K3,3
+       VAF     XD0,XD0,CTR
+
+       VREPF   XC0,K2,0
+       VREPF   XC1,K2,1
+       VREPF   XC2,K2,2
+       VREPF   XC3,K2,3
+
+.Loop_4x:
+       VAF     XA0,XA0,XB0
+       VX      XD0,XD0,XA0
+       VERLLF  XD0,XD0,16
+
+       VAF     XA1,XA1,XB1
+       VX      XD1,XD1,XA1
+       VERLLF  XD1,XD1,16
+
+       VAF     XA2,XA2,XB2
+       VX      XD2,XD2,XA2
+       VERLLF  XD2,XD2,16
+
+       VAF     XA3,XA3,XB3
+       VX      XD3,XD3,XA3
+       VERLLF  XD3,XD3,16
+
+       VAF     XC0,XC0,XD0
+       VX      XB0,XB0,XC0
+       VERLLF  XB0,XB0,12
+
+       VAF     XC1,XC1,XD1
+       VX      XB1,XB1,XC1
+       VERLLF  XB1,XB1,12
+
+       VAF     XC2,XC2,XD2
+       VX      XB2,XB2,XC2
+       VERLLF  XB2,XB2,12
+
+       VAF     XC3,XC3,XD3
+       VX      XB3,XB3,XC3
+       VERLLF  XB3,XB3,12
+
+       VAF     XA0,XA0,XB0
+       VX      XD0,XD0,XA0
+       VERLLF  XD0,XD0,8
+
+       VAF     XA1,XA1,XB1
+       VX      XD1,XD1,XA1
+       VERLLF  XD1,XD1,8
+
+       VAF     XA2,XA2,XB2
+       VX      XD2,XD2,XA2
+       VERLLF  XD2,XD2,8
+
+       VAF     XA3,XA3,XB3
+       VX      XD3,XD3,XA3
+       VERLLF  XD3,XD3,8
+
+       VAF     XC0,XC0,XD0
+       VX      XB0,XB0,XC0
+       VERLLF  XB0,XB0,7
+
+       VAF     XC1,XC1,XD1
+       VX      XB1,XB1,XC1
+       VERLLF  XB1,XB1,7
+
+       VAF     XC2,XC2,XD2
+       VX      XB2,XB2,XC2
+       VERLLF  XB2,XB2,7
+
+       VAF     XC3,XC3,XD3
+       VX      XB3,XB3,XC3
+       VERLLF  XB3,XB3,7
+
+       VAF     XA0,XA0,XB1
+       VX      XD3,XD3,XA0
+       VERLLF  XD3,XD3,16
+
+       VAF     XA1,XA1,XB2
+       VX      XD0,XD0,XA1
+       VERLLF  XD0,XD0,16
+
+       VAF     XA2,XA2,XB3
+       VX      XD1,XD1,XA2
+       VERLLF  XD1,XD1,16
+
+       VAF     XA3,XA3,XB0
+       VX      XD2,XD2,XA3
+       VERLLF  XD2,XD2,16
+
+       VAF     XC2,XC2,XD3
+       VX      XB1,XB1,XC2
+       VERLLF  XB1,XB1,12
+
+       VAF     XC3,XC3,XD0
+       VX      XB2,XB2,XC3
+       VERLLF  XB2,XB2,12
+
+       VAF     XC0,XC0,XD1
+       VX      XB3,XB3,XC0
+       VERLLF  XB3,XB3,12
+
+       VAF     XC1,XC1,XD2
+       VX      XB0,XB0,XC1
+       VERLLF  XB0,XB0,12
+
+       VAF     XA0,XA0,XB1
+       VX      XD3,XD3,XA0
+       VERLLF  XD3,XD3,8
+
+       VAF     XA1,XA1,XB2
+       VX      XD0,XD0,XA1
+       VERLLF  XD0,XD0,8
+
+       VAF     XA2,XA2,XB3
+       VX      XD1,XD1,XA2
+       VERLLF  XD1,XD1,8
+
+       VAF     XA3,XA3,XB0
+       VX      XD2,XD2,XA3
+       VERLLF  XD2,XD2,8
+
+       VAF     XC2,XC2,XD3
+       VX      XB1,XB1,XC2
+       VERLLF  XB1,XB1,7
+
+       VAF     XC3,XC3,XD0
+       VX      XB2,XB2,XC3
+       VERLLF  XB2,XB2,7
+
+       VAF     XC0,XC0,XD1
+       VX      XB3,XB3,XC0
+       VERLLF  XB3,XB3,7
+
+       VAF     XC1,XC1,XD2
+       VX      XB0,XB0,XC1
+       VERLLF  XB0,XB0,7
+       brct    %r0,.Loop_4x
+
+       VAF     XD0,XD0,CTR
+
+       VMRHF   XT0,XA0,XA1             # transpose data
+       VMRHF   XT1,XA2,XA3
+       VMRLF   XT2,XA0,XA1
+       VMRLF   XT3,XA2,XA3
+       VPDI    XA0,XT0,XT1,0b0000
+       VPDI    XA1,XT0,XT1,0b0101
+       VPDI    XA2,XT2,XT3,0b0000
+       VPDI    XA3,XT2,XT3,0b0101
+
+       VMRHF   XT0,XB0,XB1
+       VMRHF   XT1,XB2,XB3
+       VMRLF   XT2,XB0,XB1
+       VMRLF   XT3,XB2,XB3
+       VPDI    XB0,XT0,XT1,0b0000
+       VPDI    XB1,XT0,XT1,0b0101
+       VPDI    XB2,XT2,XT3,0b0000
+       VPDI    XB3,XT2,XT3,0b0101
+
+       VMRHF   XT0,XC0,XC1
+       VMRHF   XT1,XC2,XC3
+       VMRLF   XT2,XC0,XC1
+       VMRLF   XT3,XC2,XC3
+       VPDI    XC0,XT0,XT1,0b0000
+       VPDI    XC1,XT0,XT1,0b0101
+       VPDI    XC2,XT2,XT3,0b0000
+       VPDI    XC3,XT2,XT3,0b0101
+
+       VMRHF   XT0,XD0,XD1
+       VMRHF   XT1,XD2,XD3
+       VMRLF   XT2,XD0,XD1
+       VMRLF   XT3,XD2,XD3
+       VPDI    XD0,XT0,XT1,0b0000
+       VPDI    XD1,XT0,XT1,0b0101
+       VPDI    XD2,XT2,XT3,0b0000
+       VPDI    XD3,XT2,XT3,0b0101
+
+       VAF     XA0,XA0,K0
+       VAF     XB0,XB0,K1
+       VAF     XC0,XC0,K2
+       VAF     XD0,XD0,K3
+
+       VPERM   XA0,XA0,XA0,BEPERM
+       VPERM   XB0,XB0,XB0,BEPERM
+       VPERM   XC0,XC0,XC0,BEPERM
+       VPERM   XD0,XD0,XD0,BEPERM
+
+       VLM     XT0,XT3,0,INP,0
+
+       VX      XT0,XT0,XA0
+       VX      XT1,XT1,XB0
+       VX      XT2,XT2,XC0
+       VX      XT3,XT3,XD0
+
+       VSTM    XT0,XT3,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+
+       VAF     XA0,XA1,K0
+       VAF     XB0,XB1,K1
+       VAF     XC0,XC1,K2
+       VAF     XD0,XD1,K3
+
+       VPERM   XA0,XA0,XA0,BEPERM
+       VPERM   XB0,XB0,XB0,BEPERM
+       VPERM   XC0,XC0,XC0,BEPERM
+       VPERM   XD0,XD0,XD0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_4x
+
+       VLM     XT0,XT3,0,INP,0
+
+       VX      XT0,XT0,XA0
+       VX      XT1,XT1,XB0
+       VX      XT2,XT2,XC0
+       VX      XT3,XT3,XD0
+
+       VSTM    XT0,XT3,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_4x
+
+       VAF     XA0,XA2,K0
+       VAF     XB0,XB2,K1
+       VAF     XC0,XC2,K2
+       VAF     XD0,XD2,K3
+
+       VPERM   XA0,XA0,XA0,BEPERM
+       VPERM   XB0,XB0,XB0,BEPERM
+       VPERM   XC0,XC0,XC0,BEPERM
+       VPERM   XD0,XD0,XD0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_4x
+
+       VLM     XT0,XT3,0,INP,0
+
+       VX      XT0,XT0,XA0
+       VX      XT1,XT1,XB0
+       VX      XT2,XT2,XC0
+       VX      XT3,XT3,XD0
+
+       VSTM    XT0,XT3,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_4x
+
+       VAF     XA0,XA3,K0
+       VAF     XB0,XB3,K1
+       VAF     XC0,XC3,K2
+       VAF     XD0,XD3,K3
+
+       VPERM   XA0,XA0,XA0,BEPERM
+       VPERM   XB0,XB0,XB0,BEPERM
+       VPERM   XC0,XC0,XC0,BEPERM
+       VPERM   XD0,XD0,XD0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_4x
+
+       VLM     XT0,XT3,0,INP,0
+
+       VX      XT0,XT0,XA0
+       VX      XT1,XT1,XB0
+       VX      XT2,XT2,XC0
+       VX      XT3,XT3,XD0
+
+       VSTM    XT0,XT3,0,OUT,0
+
+.Ldone_4x:
+       lmg     %r6,%r7,6*8(SP)
+       BR_EX   %r14
+
+.Ltail_4x:
+       VLR     XT0,XC0
+       VLR     XT1,XD0
+
+       VST     XA0,8*8+0x00,,SP
+       VST     XB0,8*8+0x10,,SP
+       VST     XT0,8*8+0x20,,SP
+       VST     XT1,8*8+0x30,,SP
+
+       lghi    %r1,0
+
+.Loop_tail_4x:
+       llgc    %r5,0(%r1,INP)
+       llgc    %r6,8*8(%r1,SP)
+       xr      %r6,%r5
+       stc     %r6,0(%r1,OUT)
+       la      %r1,1(%r1)
+       brct    LEN,.Loop_tail_4x
+
+       lmg     %r6,%r7,6*8(SP)
+       BR_EX   %r14
+SYM_FUNC_END(chacha20_vx_4x)
+
+#undef OUT
+#undef INP
+#undef LEN
+#undef KEY
+#undef COUNTER
+
+#undef BEPERM
+
+#undef K0
+#undef K1
+#undef K2
+#undef K3
+
+
+#############################################################################
+# void chacha20_vx(u8 *out, const u8 *inp, size_t len,
+#                 const u32 *key, const u32 *counter)
+
+#define        OUT             %r2
+#define        INP             %r3
+#define        LEN             %r4
+#define        KEY             %r5
+#define        COUNTER         %r6
+
+#define BEPERM         %v31
+
+#define K0             %v27
+#define K1             %v24
+#define K2             %v25
+#define K3             %v26
+
+#define A0             %v0
+#define B0             %v1
+#define C0             %v2
+#define D0             %v3
+
+#define A1             %v4
+#define B1             %v5
+#define C1             %v6
+#define D1             %v7
+
+#define A2             %v8
+#define B2             %v9
+#define C2             %v10
+#define D2             %v11
+
+#define A3             %v12
+#define B3             %v13
+#define C3             %v14
+#define D3             %v15
+
+#define A4             %v16
+#define B4             %v17
+#define C4             %v18
+#define D4             %v19
+
+#define A5             %v20
+#define B5             %v21
+#define C5             %v22
+#define D5             %v23
+
+#define T0             %v27
+#define T1             %v28
+#define T2             %v29
+#define T3             %v30
+
+SYM_FUNC_START(chacha20_vx)
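+       # lengths of 256 bytes or less take the four-block path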
+       clgfi   LEN,256
+       jle     chacha20_vx_4x
+       stmg    %r6,%r7,6*8(SP)
+
+       lghi    %r1,-FRAME
+       lgr     %r0,SP
+       la      SP,0(%r1,SP)
+       stg     %r0,0(SP)               # back-chain
+
+       larl    %r7,sigma
+       lhi     %r0,10
+
+       VLM     K1,K2,0,KEY,0           # load key
+       VL      K3,0,,COUNTER           # load counter
+
+       VLM     K0,BEPERM,0,%r7,4       # load sigma, increments, ...
+
+.Loop_outer_vx:
+       VLR     A0,K0
+       VLR     B0,K1
+       VLR     A1,K0
+       VLR     B1,K1
+       VLR     A2,K0
+       VLR     B2,K1
+       VLR     A3,K0
+       VLR     B3,K1
+       VLR     A4,K0
+       VLR     B4,K1
+       VLR     A5,K0
+       VLR     B5,K1
+
+       VLR     D0,K3
+       VAF     D1,K3,T1                # K[3]+1
+       VAF     D2,K3,T2                # K[3]+2
+       VAF     D3,K3,T3                # K[3]+3
+       VAF     D4,D2,T2                # K[3]+4
+       VAF     D5,D2,T3                # K[3]+5
+
+       VLR     C0,K2
+       VLR     C1,K2
+       VLR     C2,K2
+       VLR     C3,K2
+       VLR     C4,K2
+       VLR     C5,K2
+
+       VLR     T1,D1
+       VLR     T2,D2
+       VLR     T3,D3
+
+.Loop_vx:
+       VAF     A0,A0,B0
+       VAF     A1,A1,B1
+       VAF     A2,A2,B2
+       VAF     A3,A3,B3
+       VAF     A4,A4,B4
+       VAF     A5,A5,B5
+       VX      D0,D0,A0
+       VX      D1,D1,A1
+       VX      D2,D2,A2
+       VX      D3,D3,A3
+       VX      D4,D4,A4
+       VX      D5,D5,A5
+       VERLLF  D0,D0,16
+       VERLLF  D1,D1,16
+       VERLLF  D2,D2,16
+       VERLLF  D3,D3,16
+       VERLLF  D4,D4,16
+       VERLLF  D5,D5,16
+
+       VAF     C0,C0,D0
+       VAF     C1,C1,D1
+       VAF     C2,C2,D2
+       VAF     C3,C3,D3
+       VAF     C4,C4,D4
+       VAF     C5,C5,D5
+       VX      B0,B0,C0
+       VX      B1,B1,C1
+       VX      B2,B2,C2
+       VX      B3,B3,C3
+       VX      B4,B4,C4
+       VX      B5,B5,C5
+       VERLLF  B0,B0,12
+       VERLLF  B1,B1,12
+       VERLLF  B2,B2,12
+       VERLLF  B3,B3,12
+       VERLLF  B4,B4,12
+       VERLLF  B5,B5,12
+
+       VAF     A0,A0,B0
+       VAF     A1,A1,B1
+       VAF     A2,A2,B2
+       VAF     A3,A3,B3
+       VAF     A4,A4,B4
+       VAF     A5,A5,B5
+       VX      D0,D0,A0
+       VX      D1,D1,A1
+       VX      D2,D2,A2
+       VX      D3,D3,A3
+       VX      D4,D4,A4
+       VX      D5,D5,A5
+       VERLLF  D0,D0,8
+       VERLLF  D1,D1,8
+       VERLLF  D2,D2,8
+       VERLLF  D3,D3,8
+       VERLLF  D4,D4,8
+       VERLLF  D5,D5,8
+
+       VAF     C0,C0,D0
+       VAF     C1,C1,D1
+       VAF     C2,C2,D2
+       VAF     C3,C3,D3
+       VAF     C4,C4,D4
+       VAF     C5,C5,D5
+       VX      B0,B0,C0
+       VX      B1,B1,C1
+       VX      B2,B2,C2
+       VX      B3,B3,C3
+       VX      B4,B4,C4
+       VX      B5,B5,C5
+       VERLLF  B0,B0,7
+       VERLLF  B1,B1,7
+       VERLLF  B2,B2,7
+       VERLLF  B3,B3,7
+       VERLLF  B4,B4,7
+       VERLLF  B5,B5,7
+
+       VSLDB   C0,C0,C0,8
+       VSLDB   C1,C1,C1,8
+       VSLDB   C2,C2,C2,8
+       VSLDB   C3,C3,C3,8
+       VSLDB   C4,C4,C4,8
+       VSLDB   C5,C5,C5,8
+       VSLDB   B0,B0,B0,4
+       VSLDB   B1,B1,B1,4
+       VSLDB   B2,B2,B2,4
+       VSLDB   B3,B3,B3,4
+       VSLDB   B4,B4,B4,4
+       VSLDB   B5,B5,B5,4
+       VSLDB   D0,D0,D0,12
+       VSLDB   D1,D1,D1,12
+       VSLDB   D2,D2,D2,12
+       VSLDB   D3,D3,D3,12
+       VSLDB   D4,D4,D4,12
+       VSLDB   D5,D5,D5,12
+
+       VAF     A0,A0,B0
+       VAF     A1,A1,B1
+       VAF     A2,A2,B2
+       VAF     A3,A3,B3
+       VAF     A4,A4,B4
+       VAF     A5,A5,B5
+       VX      D0,D0,A0
+       VX      D1,D1,A1
+       VX      D2,D2,A2
+       VX      D3,D3,A3
+       VX      D4,D4,A4
+       VX      D5,D5,A5
+       VERLLF  D0,D0,16
+       VERLLF  D1,D1,16
+       VERLLF  D2,D2,16
+       VERLLF  D3,D3,16
+       VERLLF  D4,D4,16
+       VERLLF  D5,D5,16
+
+       VAF     C0,C0,D0
+       VAF     C1,C1,D1
+       VAF     C2,C2,D2
+       VAF     C3,C3,D3
+       VAF     C4,C4,D4
+       VAF     C5,C5,D5
+       VX      B0,B0,C0
+       VX      B1,B1,C1
+       VX      B2,B2,C2
+       VX      B3,B3,C3
+       VX      B4,B4,C4
+       VX      B5,B5,C5
+       VERLLF  B0,B0,12
+       VERLLF  B1,B1,12
+       VERLLF  B2,B2,12
+       VERLLF  B3,B3,12
+       VERLLF  B4,B4,12
+       VERLLF  B5,B5,12
+
+       VAF     A0,A0,B0
+       VAF     A1,A1,B1
+       VAF     A2,A2,B2
+       VAF     A3,A3,B3
+       VAF     A4,A4,B4
+       VAF     A5,A5,B5
+       VX      D0,D0,A0
+       VX      D1,D1,A1
+       VX      D2,D2,A2
+       VX      D3,D3,A3
+       VX      D4,D4,A4
+       VX      D5,D5,A5
+       VERLLF  D0,D0,8
+       VERLLF  D1,D1,8
+       VERLLF  D2,D2,8
+       VERLLF  D3,D3,8
+       VERLLF  D4,D4,8
+       VERLLF  D5,D5,8
+
+       VAF     C0,C0,D0
+       VAF     C1,C1,D1
+       VAF     C2,C2,D2
+       VAF     C3,C3,D3
+       VAF     C4,C4,D4
+       VAF     C5,C5,D5
+       VX      B0,B0,C0
+       VX      B1,B1,C1
+       VX      B2,B2,C2
+       VX      B3,B3,C3
+       VX      B4,B4,C4
+       VX      B5,B5,C5
+       VERLLF  B0,B0,7
+       VERLLF  B1,B1,7
+       VERLLF  B2,B2,7
+       VERLLF  B3,B3,7
+       VERLLF  B4,B4,7
+       VERLLF  B5,B5,7
+
+       VSLDB   C0,C0,C0,8
+       VSLDB   C1,C1,C1,8
+       VSLDB   C2,C2,C2,8
+       VSLDB   C3,C3,C3,8
+       VSLDB   C4,C4,C4,8
+       VSLDB   C5,C5,C5,8
+       VSLDB   B0,B0,B0,12
+       VSLDB   B1,B1,B1,12
+       VSLDB   B2,B2,B2,12
+       VSLDB   B3,B3,B3,12
+       VSLDB   B4,B4,B4,12
+       VSLDB   B5,B5,B5,12
+       VSLDB   D0,D0,D0,4
+       VSLDB   D1,D1,D1,4
+       VSLDB   D2,D2,D2,4
+       VSLDB   D3,D3,D3,4
+       VSLDB   D4,D4,D4,4
+       VSLDB   D5,D5,D5,4
+       brct    %r0,.Loop_vx
+
+       VAF     A0,A0,K0
+       VAF     B0,B0,K1
+       VAF     C0,C0,K2
+       VAF     D0,D0,K3
+       VAF     A1,A1,K0
+       VAF     D1,D1,T1                # +K[3]+1
+
+       VPERM   A0,A0,A0,BEPERM
+       VPERM   B0,B0,B0,BEPERM
+       VPERM   C0,C0,C0,BEPERM
+       VPERM   D0,D0,D0,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VAF     D2,D2,T2                # +K[3]+2
+       VAF     D3,D3,T3                # +K[3]+3
+       VLM     T0,T3,0,INP,0
+
+       VX      A0,A0,T0
+       VX      B0,B0,T1
+       VX      C0,C0,T2
+       VX      D0,D0,T3
+
+       VLM     K0,T3,0,%r7,4           # re-load sigma and increments
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       VAF     B1,B1,K1
+       VAF     C1,C1,K2
+
+       VPERM   A0,A1,A1,BEPERM
+       VPERM   B0,B1,B1,BEPERM
+       VPERM   C0,C1,C1,BEPERM
+       VPERM   D0,D1,D1,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VLM     A1,D1,0,INP,0
+
+       VX      A0,A0,A1
+       VX      B0,B0,B1
+       VX      C0,C0,C1
+       VX      D0,D0,D1
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       VAF     A2,A2,K0
+       VAF     B2,B2,K1
+       VAF     C2,C2,K2
+
+       VPERM   A0,A2,A2,BEPERM
+       VPERM   B0,B2,B2,BEPERM
+       VPERM   C0,C2,C2,BEPERM
+       VPERM   D0,D2,D2,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VLM     A1,D1,0,INP,0
+
+       VX      A0,A0,A1
+       VX      B0,B0,B1
+       VX      C0,C0,C1
+       VX      D0,D0,D1
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       VAF     A3,A3,K0
+       VAF     B3,B3,K1
+       VAF     C3,C3,K2
+       VAF     D2,K3,T3                # K[3]+3
+
+       VPERM   A0,A3,A3,BEPERM
+       VPERM   B0,B3,B3,BEPERM
+       VPERM   C0,C3,C3,BEPERM
+       VPERM   D0,D3,D3,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VAF     D3,D2,T1                # K[3]+4
+       VLM     A1,D1,0,INP,0
+
+       VX      A0,A0,A1
+       VX      B0,B0,B1
+       VX      C0,C0,C1
+       VX      D0,D0,D1
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       VAF     A4,A4,K0
+       VAF     B4,B4,K1
+       VAF     C4,C4,K2
+       VAF     D4,D4,D3                # +K[3]+4
+       VAF     D3,D3,T1                # K[3]+5
+       VAF     K3,D2,T3                # K[3]+=6
+
+       VPERM   A0,A4,A4,BEPERM
+       VPERM   B0,B4,B4,BEPERM
+       VPERM   C0,C4,C4,BEPERM
+       VPERM   D0,D4,D4,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VLM     A1,D1,0,INP,0
+
+       VX      A0,A0,A1
+       VX      B0,B0,B1
+       VX      C0,C0,C1
+       VX      D0,D0,D1
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       aghi    LEN,-0x40
+       je      .Ldone_vx
+
+       VAF     A5,A5,K0
+       VAF     B5,B5,K1
+       VAF     C5,C5,K2
+       VAF     D5,D5,D3                # +K[3]+5
+
+       VPERM   A0,A5,A5,BEPERM
+       VPERM   B0,B5,B5,BEPERM
+       VPERM   C0,C5,C5,BEPERM
+       VPERM   D0,D5,D5,BEPERM
+
+       clgfi   LEN,0x40
+       jl      .Ltail_vx
+
+       VLM     A1,D1,0,INP,0
+
+       VX      A0,A0,A1
+       VX      B0,B0,B1
+       VX      C0,C0,C1
+       VX      D0,D0,D1
+
+       VSTM    A0,D0,0,OUT,0
+
+       la      INP,0x40(INP)
+       la      OUT,0x40(OUT)
+       lhi     %r0,10
+       aghi    LEN,-0x40
+       jne     .Loop_outer_vx
+
+.Ldone_vx:
+       lmg     %r6,%r7,FRAME+6*8(SP)
+       la      SP,FRAME(SP)
+       BR_EX   %r14
+
+.Ltail_vx:
+       VSTM    A0,D0,8*8,SP,3
+       lghi    %r1,0
+
+.Loop_tail_vx:
+       llgc    %r5,0(%r1,INP)
+       llgc    %r6,8*8(%r1,SP)
+       xr      %r6,%r5
+       stc     %r6,0(%r1,OUT)
+       la      %r1,1(%r1)
+       brct    LEN,.Loop_tail_vx
+
+       lmg     %r6,%r7,FRAME+6*8(SP)
+       la      SP,FRAME(SP)
+       BR_EX   %r14
+SYM_FUNC_END(chacha20_vx)
+
+.previous
diff --git a/lib/crypto/s390/chacha-s390.h b/lib/crypto/s390/chacha-s390.h
new file mode 100644
index 0000000..733744c
--- /dev/null
+++ b/lib/crypto/s390/chacha-s390.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * s390 ChaCha stream cipher.
+ *
+ * Copyright IBM Corp. 2021
+ */
+
+#ifndef _CHACHA_S390_H
+#define _CHACHA_S390_H
+
+void chacha20_vx(u8 *out, const u8 *inp, size_t len, const u32 *key,
+                const u32 *counter);
+
+#endif /* _CHACHA_S390_H */
diff --git a/lib/crypto/s390/sha256.c b/lib/crypto/s390/sha256.c
new file mode 100644
index 0000000..7dfe120
--- /dev/null
+++ b/lib/crypto/s390/sha256.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 optimized using the CP Assist for Cryptographic Functions (CPACF)
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/cpacf.h>
+#include <crypto/internal/sha2.h>
+#include <linux/cpufeature.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_cpacf_sha256);
+
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+                       const u8 *data, size_t nblocks)
+{
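+       /* CPACF KIMD hashes all the given blocks and updates the state in place */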
+       if (static_branch_likely(&have_cpacf_sha256))
+               cpacf_kimd(CPACF_KIMD_SHA_256, state, data,
+                          nblocks * SHA256_BLOCK_SIZE);
+       else
+               sha256_blocks_generic(state, data, nblocks);
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+
+bool sha256_is_arch_optimized(void)
+{
+       return static_key_enabled(&have_cpacf_sha256);
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+static int __init sha256_s390_mod_init(void)
+{
+       if (cpu_have_feature(S390_CPU_FEATURE_MSA) &&
+           cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA_256))
+               static_branch_enable(&have_cpacf_sha256);
+       return 0;
+}
+subsys_initcall(sha256_s390_mod_init);
+
+static void __exit sha256_s390_mod_exit(void)
+{
+}
+module_exit(sha256_s390_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 using the CP Assist for Cryptographic Functions (CPACF)");