lib/crypto: arm/sha1: Migrate optimized code into library
author	Eric Biggers <ebiggers@kernel.org>
Sat, 12 Jul 2025 23:22:58 +0000 (16:22 -0700)
committer	Eric Biggers <ebiggers@kernel.org>
Mon, 14 Jul 2025 18:11:29 +0000 (11:11 -0700)
Instead of exposing the arm-optimized SHA-1 code via arm-specific
crypto_shash algorithms, just implement the sha1_blocks() library
function.  This is much simpler: it makes the SHA-1 library functions
arm-optimized, and it fixes the longstanding issue where the
arm-optimized SHA-1 code was disabled by default.  SHA-1 remains
available through crypto_shash, but individual architectures no longer
need to handle it.
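
A rough sketch of what the new lib/crypto/arm/sha1.h glue might look
like.  Its contents are not shown in this diff, so the struct name,
includes, and feature checks below are assumptions, and a real
implementation would likely cache the CPU-feature tests at init time;
sha1_blocks() simply dispatches to the best available assembly routine
(the count is already shown as size_t, per the note below):

    #include <linux/kernel.h>
    #include <linux/linkage.h>
    #include <linux/types.h>
    #include <asm/hwcap.h>
    #include <asm/neon.h>
    #include <crypto/internal/simd.h>
    #include <crypto/sha1.h>	/* assumed home of struct sha1_block_state */

    /* Same assembly entry points as before, now taking a size_t count
     * (prototypes are part of this sketch, not taken from the patch). */
    asmlinkage void sha1_block_data_order(struct sha1_block_state *state,
                                          const u8 *data, size_t nblocks);
    asmlinkage void sha1_transform_neon(struct sha1_block_state *state,
                                        const u8 *data, size_t nblocks);
    asmlinkage void sha1_ce_transform(struct sha1_block_state *state,
                                      const u8 *data, size_t nblocks);

    static void sha1_blocks(struct sha1_block_state *state,
                            const u8 *data, size_t nblocks)
    {
            /* Use NEON/CE only when the CPU has NEON and SIMD is usable
             * in the current context; otherwise fall back to the scalar
             * ARMv4 routine. */
            if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
                (elf_hwcap & HWCAP_NEON) && crypto_simd_usable()) {
                    kernel_neon_begin();
                    if (elf_hwcap2 & HWCAP2_SHA1)   /* ARMv8 Crypto Extensions */
                            sha1_ce_transform(state, data, nblocks);
                    else
                            sha1_transform_neon(state, data, nblocks);
                    kernel_neon_end();
            } else {
                    sha1_block_data_order(state, data, nblocks);
            }
    }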

To match sha1_blocks(), change the type of the nblocks parameter of the
assembly functions from int to size_t.  The assembly functions actually
already treated it as size_t.
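
For example, the old prototype visible in the deleted
arch/arm/crypto/sha1_glue.c below changes roughly as follows (the new
form is a sketch; only the int to size_t change is stated above):

    /* Before (deleted arch/arm/crypto/sha1_glue.c): */
    asmlinkage void sha1_block_data_order(struct sha1_state *digest,
                                          const u8 *data, int rounds);

    /* After, as it might appear in the new lib/crypto/arm/sha1.h: */
    asmlinkage void sha1_block_data_order(struct sha1_state *digest,
                                          const u8 *data, size_t nblocks);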

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250712232329.818226-8-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
19 files changed:
arch/arm/configs/exynos_defconfig
arch/arm/configs/milbeaut_m10v_defconfig
arch/arm/configs/multi_v7_defconfig
arch/arm/configs/omap2plus_defconfig
arch/arm/configs/pxa_defconfig
arch/arm/crypto/Kconfig
arch/arm/crypto/Makefile
arch/arm/crypto/sha1-armv4-large.S [deleted file]
arch/arm/crypto/sha1-armv7-neon.S [deleted file]
arch/arm/crypto/sha1-ce-core.S [deleted file]
arch/arm/crypto/sha1-ce-glue.c [deleted file]
arch/arm/crypto/sha1_glue.c [deleted file]
arch/arm/crypto/sha1_neon_glue.c [deleted file]
lib/crypto/Kconfig
lib/crypto/Makefile
lib/crypto/arm/sha1-armv4-large.S [new file with mode: 0644]
lib/crypto/arm/sha1-armv7-neon.S [new file with mode: 0644]
lib/crypto/arm/sha1-ce-core.S [new file with mode: 0644]
lib/crypto/arm/sha1.h [new file with mode: 0644]

index d58e300693045ad96d5236e5141da3ec417e4fd5..6915c766923a2faad63afa5045c94c1b2d19ddfd 100644
@@ -363,7 +363,6 @@ CONFIG_CRYPTO_USER_API_HASH=m
 CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
 CONFIG_CRYPTO_AES_ARM_BS=m
 CONFIG_CRYPTO_CHACHA20_NEON=m
 CONFIG_CRYPTO_DEV_EXYNOS_RNG=y
index 8ebf8bd872fe8a3693ccd507eb52a4cdb91eedab..a3be0b2ede09c7b32da0ce151d4ddebd606d39bc 100644
@@ -98,8 +98,6 @@ CONFIG_CRYPTO_SELFTESTS=y
 CONFIG_CRYPTO_AES=y
 CONFIG_CRYPTO_SEQIV=m
 CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA1_ARM_CE=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
 CONFIG_CRYPTO_AES_ARM_CE=m
index 3fd07e864ca8554e29c11e8e86bb320c8b8e3afc..fb63f487a623222ff07580797627227ab4e97605 100644
@@ -1280,8 +1280,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA1_ARM_CE=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
 CONFIG_CRYPTO_AES_ARM_CE=m
index 530dfb8338c98e6d2eb4936fb533f21cb56d8298..046467637901323ba4cbcc480712973edb265503 100644
@@ -704,7 +704,6 @@ CONFIG_NLS_ISO8859_1=y
 CONFIG_SECURITY=y
 CONFIG_CRYPTO_MICHAEL_MIC=y
 CONFIG_CRYPTO_GHASH_ARM_CE=m
-CONFIG_CRYPTO_SHA1_ARM_NEON=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
 CONFIG_CRYPTO_CHACHA20_NEON=m
index eaa44574d4a64603811f5983e509a4a7bd44c19e..1a80602c12845d2faad55468894d6f561c6f19de 100644
@@ -658,7 +658,6 @@ CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_LZO=y
-CONFIG_CRYPTO_SHA1_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_FONTS=y
 CONFIG_FONT_8x8=y
index a18f97f1597cbe7b46bfa9aed92b755862d65561..1e5f3cdf691c4f7390ca817b239cb61855bb848d 100644
@@ -62,37 +62,6 @@ config CRYPTO_BLAKE2B_NEON
          much faster than the SHA-2 family and slightly faster than
          SHA-1.
 
-config CRYPTO_SHA1_ARM
-       tristate "Hash functions: SHA-1"
-       select CRYPTO_SHA1
-       select CRYPTO_HASH
-       help
-         SHA-1 secure hash algorithm (FIPS 180)
-
-         Architecture: arm
-
-config CRYPTO_SHA1_ARM_NEON
-       tristate "Hash functions: SHA-1 (NEON)"
-       depends on KERNEL_MODE_NEON
-       select CRYPTO_SHA1_ARM
-       select CRYPTO_SHA1
-       select CRYPTO_HASH
-       help
-         SHA-1 secure hash algorithm (FIPS 180)
-
-         Architecture: arm using
-         - NEON (Advanced SIMD) extensions
-
-config CRYPTO_SHA1_ARM_CE
-       tristate "Hash functions: SHA-1 (ARMv8 Crypto Extensions)"
-       depends on KERNEL_MODE_NEON
-       select CRYPTO_SHA1_ARM
-       select CRYPTO_HASH
-       help
-         SHA-1 secure hash algorithm (FIPS 180)
-
-         Architecture: arm using ARMv8 Crypto Extensions
-
 config CRYPTO_AES_ARM
        tristate "Ciphers: AES"
        select CRYPTO_ALGAPI
index 78a4042d8761c14baf0fbfa85560e8262b35cd0c..4f23999ae17dfe3d27626bb04bb9c19d453605b4 100644
@@ -5,22 +5,16 @@
 
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
 obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
 
 aes-arm-y      := aes-cipher-core.o aes-cipher-glue.o
 aes-arm-bs-y   := aes-neonbs-core.o aes-neonbs-glue.o
-sha1-arm-y     := sha1-armv4-large.o sha1_glue.o
-sha1-arm-neon-y        := sha1-armv7-neon.o sha1_neon_glue.o
 blake2b-neon-y  := blake2b-neon-core.o blake2b-neon-glue.o
-sha1-arm-ce-y  := sha1-ce-core.o sha1-ce-glue.o
 aes-arm-ce-y   := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
diff --git a/arch/arm/crypto/sha1-armv4-large.S b/arch/arm/crypto/sha1-armv4-large.S
deleted file mode 100644
index 1c8b685..0000000
+++ /dev/null
@@ -1,507 +0,0 @@
-#define __ARM_ARCH__ __LINUX_ARM_ARCH__
-@ SPDX-License-Identifier: GPL-2.0
-
-@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
-@ has relicensed it under the GPLv2. Therefore this program is free software;
-@ you can redistribute it and/or modify it under the terms of the GNU General
-@ Public License version 2 as published by the Free Software Foundation.
-@
-@ The original headers, including the original license headers, are
-@ included below for completeness.
-
-@ ====================================================================
-@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-@ project. The module is, however, dual licensed under OpenSSL and
-@ CRYPTOGAMS licenses depending on where you obtain it. For further
-@ details see https://www.openssl.org/~appro/cryptogams/.
-@ ====================================================================
-
-@ sha1_block procedure for ARMv4.
-@
-@ January 2007.
-
-@ Size/performance trade-off
-@ ====================================================================
-@ impl         size in bytes   comp cycles[*]  measured performance
-@ ====================================================================
-@ thumb                304             3212            4420
-@ armv4-small  392/+29%        1958/+64%       2250/+96%
-@ armv4-compact        740/+89%        1552/+26%       1840/+22%
-@ armv4-large  1420/+92%       1307/+19%       1370/+34%[***]
-@ full unroll  ~5100/+260%     ~1260/+4%       ~1300/+5%
-@ ====================================================================
-@ thumb                = same as 'small' but in Thumb instructions[**] and
-@                with recurring code in two private functions;
-@ small                = detached Xload/update, loops are folded;
-@ compact      = detached Xload/update, 5x unroll;
-@ large                = interleaved Xload/update, 5x unroll;
-@ full unroll  = interleaved Xload/update, full unroll, estimated[!];
-@
-@ [*]  Manually counted instructions in "grand" loop body. Measured
-@      performance is affected by prologue and epilogue overhead,
-@      i-cache availability, branch penalties, etc.
-@ [**] While each Thumb instruction is twice smaller, they are not as
-@      diverse as ARM ones: e.g., there are only two arithmetic
-@      instructions with 3 arguments, no [fixed] rotate, addressing
-@      modes are limited. As result it takes more instructions to do
-@      the same job in Thumb, therefore the code is never twice as
-@      small and always slower.
-@ [***]        which is also ~35% better than compiler generated code. Dual-
-@      issue Cortex A8 core was measured to process input block in
-@      ~990 cycles.
-
-@ August 2010.
-@
-@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
-@ Cortex A8 core and in absolute terms ~870 cycles per input block
-@ [or 13.6 cycles per byte].
-
-@ February 2011.
-@
-@ Profiler-assisted and platform-specific optimization resulted in 10%
-@ improvement on Cortex A8 core and 12.2 cycles per byte.
-
-#include <linux/linkage.h>
-
-.text
-
-.align 2
-ENTRY(sha1_block_data_order)
-       stmdb   sp!,{r4-r12,lr}
-       add     r2,r1,r2,lsl#6  @ r2 to point at the end of r1
-       ldmia   r0,{r3,r4,r5,r6,r7}
-.Lloop:
-       ldr     r8,.LK_00_19
-       mov     r14,sp
-       sub     sp,sp,#15*4
-       mov     r5,r5,ror#30
-       mov     r6,r6,ror#30
-       mov     r7,r7,ror#30            @ [6]
-.L_00_15:
-#if __ARM_ARCH__<7
-       ldrb    r10,[r1,#2]
-       ldrb    r9,[r1,#3]
-       ldrb    r11,[r1,#1]
-       add     r7,r8,r7,ror#2                  @ E+=K_00_19
-       ldrb    r12,[r1],#4
-       orr     r9,r9,r10,lsl#8
-       eor     r10,r5,r6                       @ F_xx_xx
-       orr     r9,r9,r11,lsl#16
-       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
-       orr     r9,r9,r12,lsl#24
-#else
-       ldr     r9,[r1],#4                      @ handles unaligned
-       add     r7,r8,r7,ror#2                  @ E+=K_00_19
-       eor     r10,r5,r6                       @ F_xx_xx
-       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
-#ifdef __ARMEL__
-       rev     r9,r9                           @ byte swap
-#endif
-#endif
-       and     r10,r4,r10,ror#2
-       add     r7,r7,r9                        @ E+=X[i]
-       eor     r10,r10,r6,ror#2                @ F_00_19(B,C,D)
-       str     r9,[r14,#-4]!
-       add     r7,r7,r10                       @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
-       ldrb    r10,[r1,#2]
-       ldrb    r9,[r1,#3]
-       ldrb    r11,[r1,#1]
-       add     r6,r8,r6,ror#2                  @ E+=K_00_19
-       ldrb    r12,[r1],#4
-       orr     r9,r9,r10,lsl#8
-       eor     r10,r4,r5                       @ F_xx_xx
-       orr     r9,r9,r11,lsl#16
-       add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
-       orr     r9,r9,r12,lsl#24
-#else
-       ldr     r9,[r1],#4                      @ handles unaligned
-       add     r6,r8,r6,ror#2                  @ E+=K_00_19
-       eor     r10,r4,r5                       @ F_xx_xx
-       add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
-#ifdef __ARMEL__
-       rev     r9,r9                           @ byte swap
-#endif
-#endif
-       and     r10,r3,r10,ror#2
-       add     r6,r6,r9                        @ E+=X[i]
-       eor     r10,r10,r5,ror#2                @ F_00_19(B,C,D)
-       str     r9,[r14,#-4]!
-       add     r6,r6,r10                       @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
-       ldrb    r10,[r1,#2]
-       ldrb    r9,[r1,#3]
-       ldrb    r11,[r1,#1]
-       add     r5,r8,r5,ror#2                  @ E+=K_00_19
-       ldrb    r12,[r1],#4
-       orr     r9,r9,r10,lsl#8
-       eor     r10,r3,r4                       @ F_xx_xx
-       orr     r9,r9,r11,lsl#16
-       add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
-       orr     r9,r9,r12,lsl#24
-#else
-       ldr     r9,[r1],#4                      @ handles unaligned
-       add     r5,r8,r5,ror#2                  @ E+=K_00_19
-       eor     r10,r3,r4                       @ F_xx_xx
-       add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
-#ifdef __ARMEL__
-       rev     r9,r9                           @ byte swap
-#endif
-#endif
-       and     r10,r7,r10,ror#2
-       add     r5,r5,r9                        @ E+=X[i]
-       eor     r10,r10,r4,ror#2                @ F_00_19(B,C,D)
-       str     r9,[r14,#-4]!
-       add     r5,r5,r10                       @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
-       ldrb    r10,[r1,#2]
-       ldrb    r9,[r1,#3]
-       ldrb    r11,[r1,#1]
-       add     r4,r8,r4,ror#2                  @ E+=K_00_19
-       ldrb    r12,[r1],#4
-       orr     r9,r9,r10,lsl#8
-       eor     r10,r7,r3                       @ F_xx_xx
-       orr     r9,r9,r11,lsl#16
-       add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
-       orr     r9,r9,r12,lsl#24
-#else
-       ldr     r9,[r1],#4                      @ handles unaligned
-       add     r4,r8,r4,ror#2                  @ E+=K_00_19
-       eor     r10,r7,r3                       @ F_xx_xx
-       add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
-#ifdef __ARMEL__
-       rev     r9,r9                           @ byte swap
-#endif
-#endif
-       and     r10,r6,r10,ror#2
-       add     r4,r4,r9                        @ E+=X[i]
-       eor     r10,r10,r3,ror#2                @ F_00_19(B,C,D)
-       str     r9,[r14,#-4]!
-       add     r4,r4,r10                       @ E+=F_00_19(B,C,D)
-#if __ARM_ARCH__<7
-       ldrb    r10,[r1,#2]
-       ldrb    r9,[r1,#3]
-       ldrb    r11,[r1,#1]
-       add     r3,r8,r3,ror#2                  @ E+=K_00_19
-       ldrb    r12,[r1],#4
-       orr     r9,r9,r10,lsl#8
-       eor     r10,r6,r7                       @ F_xx_xx
-       orr     r9,r9,r11,lsl#16
-       add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
-       orr     r9,r9,r12,lsl#24
-#else
-       ldr     r9,[r1],#4                      @ handles unaligned
-       add     r3,r8,r3,ror#2                  @ E+=K_00_19
-       eor     r10,r6,r7                       @ F_xx_xx
-       add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
-#ifdef __ARMEL__
-       rev     r9,r9                           @ byte swap
-#endif
-#endif
-       and     r10,r5,r10,ror#2
-       add     r3,r3,r9                        @ E+=X[i]
-       eor     r10,r10,r7,ror#2                @ F_00_19(B,C,D)
-       str     r9,[r14,#-4]!
-       add     r3,r3,r10                       @ E+=F_00_19(B,C,D)
-       cmp     r14,sp
-       bne     .L_00_15                @ [((11+4)*5+2)*3]
-       sub     sp,sp,#25*4
-#if __ARM_ARCH__<7
-       ldrb    r10,[r1,#2]
-       ldrb    r9,[r1,#3]
-       ldrb    r11,[r1,#1]
-       add     r7,r8,r7,ror#2                  @ E+=K_00_19
-       ldrb    r12,[r1],#4
-       orr     r9,r9,r10,lsl#8
-       eor     r10,r5,r6                       @ F_xx_xx
-       orr     r9,r9,r11,lsl#16
-       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
-       orr     r9,r9,r12,lsl#24
-#else
-       ldr     r9,[r1],#4                      @ handles unaligned
-       add     r7,r8,r7,ror#2                  @ E+=K_00_19
-       eor     r10,r5,r6                       @ F_xx_xx
-       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
-#ifdef __ARMEL__
-       rev     r9,r9                           @ byte swap
-#endif
-#endif
-       and     r10,r4,r10,ror#2
-       add     r7,r7,r9                        @ E+=X[i]
-       eor     r10,r10,r6,ror#2                @ F_00_19(B,C,D)
-       str     r9,[r14,#-4]!
-       add     r7,r7,r10                       @ E+=F_00_19(B,C,D)
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r4,r5                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       and r10,r3,r10,ror#2                                    @ F_xx_xx
-                                               @ F_xx_xx
-       add     r6,r6,r9                        @ E+=X[i]
-       eor     r10,r10,r5,ror#2                @ F_00_19(B,C,D)
-       add     r6,r6,r10                       @ E+=F_00_19(B,C,D)
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r3,r4                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       and r10,r7,r10,ror#2                                    @ F_xx_xx
-                                               @ F_xx_xx
-       add     r5,r5,r9                        @ E+=X[i]
-       eor     r10,r10,r4,ror#2                @ F_00_19(B,C,D)
-       add     r5,r5,r10                       @ E+=F_00_19(B,C,D)
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r7,r3                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       and r10,r6,r10,ror#2                                    @ F_xx_xx
-                                               @ F_xx_xx
-       add     r4,r4,r9                        @ E+=X[i]
-       eor     r10,r10,r3,ror#2                @ F_00_19(B,C,D)
-       add     r4,r4,r10                       @ E+=F_00_19(B,C,D)
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r6,r7                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       and r10,r5,r10,ror#2                                    @ F_xx_xx
-                                               @ F_xx_xx
-       add     r3,r3,r9                        @ E+=X[i]
-       eor     r10,r10,r7,ror#2                @ F_00_19(B,C,D)
-       add     r3,r3,r10                       @ E+=F_00_19(B,C,D)
-
-       ldr     r8,.LK_20_39            @ [+15+16*4]
-       cmn     sp,#0                   @ [+3], clear carry to denote 20_39
-.L_20_39_or_60_79:
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r7,r8,r7,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r5,r6                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       eor r10,r4,r10,ror#2                                    @ F_xx_xx
-                                               @ F_xx_xx
-       add     r7,r7,r9                        @ E+=X[i]
-       add     r7,r7,r10                       @ E+=F_20_39(B,C,D)
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r4,r5                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       eor r10,r3,r10,ror#2                                    @ F_xx_xx
-                                               @ F_xx_xx
-       add     r6,r6,r9                        @ E+=X[i]
-       add     r6,r6,r10                       @ E+=F_20_39(B,C,D)
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r3,r4                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       eor r10,r7,r10,ror#2                                    @ F_xx_xx
-                                               @ F_xx_xx
-       add     r5,r5,r9                        @ E+=X[i]
-       add     r5,r5,r10                       @ E+=F_20_39(B,C,D)
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r7,r3                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       eor r10,r6,r10,ror#2                                    @ F_xx_xx
-                                               @ F_xx_xx
-       add     r4,r4,r9                        @ E+=X[i]
-       add     r4,r4,r10                       @ E+=F_20_39(B,C,D)
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r6,r7                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       eor r10,r5,r10,ror#2                                    @ F_xx_xx
-                                               @ F_xx_xx
-       add     r3,r3,r9                        @ E+=X[i]
-       add     r3,r3,r10                       @ E+=F_20_39(B,C,D)
- ARM(  teq     r14,sp          )       @ preserve carry
- THUMB(        mov     r11,sp          )
- THUMB(        teq     r14,r11         )       @ preserve carry
-       bne     .L_20_39_or_60_79       @ [+((12+3)*5+2)*4]
-       bcs     .L_done                 @ [+((12+3)*5+2)*4], spare 300 bytes
-
-       ldr     r8,.LK_40_59
-       sub     sp,sp,#20*4             @ [+2]
-.L_40_59:
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r7,r8,r7,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r5,r6                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       and r10,r4,r10,ror#2                                    @ F_xx_xx
-       and r11,r5,r6                                   @ F_xx_xx
-       add     r7,r7,r9                        @ E+=X[i]
-       add     r7,r7,r10                       @ E+=F_40_59(B,C,D)
-       add     r7,r7,r11,ror#2
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r4,r5                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       and r10,r3,r10,ror#2                                    @ F_xx_xx
-       and r11,r4,r5                                   @ F_xx_xx
-       add     r6,r6,r9                        @ E+=X[i]
-       add     r6,r6,r10                       @ E+=F_40_59(B,C,D)
-       add     r6,r6,r11,ror#2
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r3,r4                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       and r10,r7,r10,ror#2                                    @ F_xx_xx
-       and r11,r3,r4                                   @ F_xx_xx
-       add     r5,r5,r9                        @ E+=X[i]
-       add     r5,r5,r10                       @ E+=F_40_59(B,C,D)
-       add     r5,r5,r11,ror#2
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r7,r3                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       and r10,r6,r10,ror#2                                    @ F_xx_xx
-       and r11,r7,r3                                   @ F_xx_xx
-       add     r4,r4,r9                        @ E+=X[i]
-       add     r4,r4,r10                       @ E+=F_40_59(B,C,D)
-       add     r4,r4,r11,ror#2
-       ldr     r9,[r14,#15*4]
-       ldr     r10,[r14,#13*4]
-       ldr     r11,[r14,#7*4]
-       add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
-       ldr     r12,[r14,#2*4]
-       eor     r9,r9,r10
-       eor     r11,r11,r12                     @ 1 cycle stall
-       eor     r10,r6,r7                       @ F_xx_xx
-       mov     r9,r9,ror#31
-       add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
-       eor     r9,r9,r11,ror#31
-       str     r9,[r14,#-4]!
-       and r10,r5,r10,ror#2                                    @ F_xx_xx
-       and r11,r6,r7                                   @ F_xx_xx
-       add     r3,r3,r9                        @ E+=X[i]
-       add     r3,r3,r10                       @ E+=F_40_59(B,C,D)
-       add     r3,r3,r11,ror#2
-       cmp     r14,sp
-       bne     .L_40_59                @ [+((12+5)*5+2)*4]
-
-       ldr     r8,.LK_60_79
-       sub     sp,sp,#20*4
-       cmp     sp,#0                   @ set carry to denote 60_79
-       b       .L_20_39_or_60_79       @ [+4], spare 300 bytes
-.L_done:
-       add     sp,sp,#80*4             @ "deallocate" stack frame
-       ldmia   r0,{r8,r9,r10,r11,r12}
-       add     r3,r8,r3
-       add     r4,r9,r4
-       add     r5,r10,r5,ror#2
-       add     r6,r11,r6,ror#2
-       add     r7,r12,r7,ror#2
-       stmia   r0,{r3,r4,r5,r6,r7}
-       teq     r1,r2
-       bne     .Lloop                  @ [+18], total 1307
-
-       ldmia   sp!,{r4-r12,pc}
-.align 2
-.LK_00_19:     .word   0x5a827999
-.LK_20_39:     .word   0x6ed9eba1
-.LK_40_59:     .word   0x8f1bbcdc
-.LK_60_79:     .word   0xca62c1d6
-ENDPROC(sha1_block_data_order)
-.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
-.align 2
diff --git a/arch/arm/crypto/sha1-armv7-neon.S b/arch/arm/crypto/sha1-armv7-neon.S
deleted file mode 100644
index 28d816a..0000000
+++ /dev/null
@@ -1,634 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
- *
- * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-.syntax unified
-.fpu neon
-
-.text
-
-
-/* Context structure */
-
-#define state_h0 0
-#define state_h1 4
-#define state_h2 8
-#define state_h3 12
-#define state_h4 16
-
-
-/* Constants */
-
-#define K1  0x5A827999
-#define K2  0x6ED9EBA1
-#define K3  0x8F1BBCDC
-#define K4  0xCA62C1D6
-.align 4
-.LK_VEC:
-.LK1:  .long K1, K1, K1, K1
-.LK2:  .long K2, K2, K2, K2
-.LK3:  .long K3, K3, K3, K3
-.LK4:  .long K4, K4, K4, K4
-
-
-/* Register macros */
-
-#define RSTATE r0
-#define RDATA r1
-#define RNBLKS r2
-#define ROLDSTACK r3
-#define RWK lr
-
-#define _a r4
-#define _b r5
-#define _c r6
-#define _d r7
-#define _e r8
-
-#define RT0 r9
-#define RT1 r10
-#define RT2 r11
-#define RT3 r12
-
-#define W0 q0
-#define W1 q7
-#define W2 q2
-#define W3 q3
-#define W4 q4
-#define W5 q6
-#define W6 q5
-#define W7 q1
-
-#define tmp0 q8
-#define tmp1 q9
-#define tmp2 q10
-#define tmp3 q11
-
-#define qK1 q12
-#define qK2 q13
-#define qK3 q14
-#define qK4 q15
-
-#ifdef CONFIG_CPU_BIG_ENDIAN
-#define ARM_LE(code...)
-#else
-#define ARM_LE(code...)                code
-#endif
-
-/* Round function macros. */
-
-#define WK_offs(i) (((i) & 15) * 4)
-
-#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
-             W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       ldr RT3, [sp, WK_offs(i)]; \
-               pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
-       bic RT0, d, b; \
-       add e, e, a, ror #(32 - 5); \
-       and RT1, c, b; \
-               pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
-       add RT0, RT0, RT3; \
-       add e, e, RT1; \
-       ror b, #(32 - 30); \
-               pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
-       add e, e, RT0;
-
-#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
-             W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       ldr RT3, [sp, WK_offs(i)]; \
-               pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
-       eor RT0, d, b; \
-       add e, e, a, ror #(32 - 5); \
-       eor RT0, RT0, c; \
-               pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
-       add e, e, RT3; \
-       ror b, #(32 - 30); \
-               pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
-       add e, e, RT0; \
-
-#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
-             W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       ldr RT3, [sp, WK_offs(i)]; \
-               pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
-       eor RT0, b, c; \
-       and RT1, b, c; \
-       add e, e, a, ror #(32 - 5); \
-               pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
-       and RT0, RT0, d; \
-       add RT1, RT1, RT3; \
-       add e, e, RT0; \
-       ror b, #(32 - 30); \
-               pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
-       add e, e, RT1;
-
-#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
-             W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
-             W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
-           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
-              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define R(a,b,c,d,e,f,i) \
-       _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
-              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
-
-#define dummy(...)
-
-
-/* Input expansion macros. */
-
-/********* Precalc macros for rounds 0-15 *************************************/
-
-#define W_PRECALC_00_15() \
-       add       RWK, sp, #(WK_offs(0));                       \
-       \
-       vld1.32   {W0, W7}, [RDATA]!;                           \
- ARM_LE(vrev32.8  W0, W0;      )       /* big => little */     \
-       vld1.32   {W6, W5}, [RDATA]!;                           \
-       vadd.u32  tmp0, W0, curK;                               \
- ARM_LE(vrev32.8  W7, W7;      )       /* big => little */     \
- ARM_LE(vrev32.8  W6, W6;      )       /* big => little */     \
-       vadd.u32  tmp1, W7, curK;                               \
- ARM_LE(vrev32.8  W5, W5;      )       /* big => little */     \
-       vadd.u32  tmp2, W6, curK;                               \
-       vst1.32   {tmp0, tmp1}, [RWK]!;                         \
-       vadd.u32  tmp3, W5, curK;                               \
-       vst1.32   {tmp2, tmp3}, [RWK];                          \
-
-#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vld1.32   {W0, W7}, [RDATA]!;                           \
-
-#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       add       RWK, sp, #(WK_offs(0));                       \
-
-#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8  W0, W0;      )       /* big => little */     \
-
-#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vld1.32   {W6, W5}, [RDATA]!;                           \
-
-#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vadd.u32  tmp0, W0, curK;                               \
-
-#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8  W7, W7;      )       /* big => little */     \
-
-#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8  W6, W6;      )       /* big => little */     \
-
-#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vadd.u32  tmp1, W7, curK;                               \
-
-#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
- ARM_LE(vrev32.8  W5, W5;      )       /* big => little */     \
-
-#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vadd.u32  tmp2, W6, curK;                               \
-
-#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vst1.32   {tmp0, tmp1}, [RWK]!;                         \
-
-#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vadd.u32  tmp3, W5, curK;                               \
-
-#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vst1.32   {tmp2, tmp3}, [RWK];                          \
-
-
-/********* Precalc macros for rounds 16-31 ************************************/
-
-#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       veor      tmp0, tmp0;                   \
-       vext.8    W, W_m16, W_m12, #8;          \
-
-#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       add       RWK, sp, #(WK_offs(i));       \
-       vext.8    tmp0, W_m04, tmp0, #4;        \
-
-#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       veor      tmp0, tmp0, W_m16;            \
-       veor.32   W, W, W_m08;                  \
-
-#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       veor      tmp1, tmp1;                   \
-       veor      W, W, tmp0;                   \
-
-#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vshl.u32  tmp0, W, #1;                  \
-
-#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vext.8    tmp1, tmp1, W, #(16-12);      \
-       vshr.u32  W, W, #31;                    \
-
-#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vorr      tmp0, tmp0, W;                \
-       vshr.u32  W, tmp1, #30;                 \
-
-#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vshl.u32  tmp1, tmp1, #2;               \
-
-#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       veor      tmp0, tmp0, W;                \
-
-#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       veor      W, tmp0, tmp1;                \
-
-#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vadd.u32  tmp0, W, curK;                \
-
-#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vst1.32   {tmp0}, [RWK];
-
-
-/********* Precalc macros for rounds 32-79 ************************************/
-
-#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       veor W, W_m28; \
-
-#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vext.8 tmp0, W_m08, W_m04, #8; \
-
-#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       veor W, W_m16; \
-
-#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       veor W, tmp0; \
-
-#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       add RWK, sp, #(WK_offs(i&~3)); \
-
-#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vshl.u32 tmp1, W, #2; \
-
-#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vshr.u32 tmp0, W, #30; \
-
-#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vorr W, tmp0, tmp1; \
-
-#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vadd.u32 tmp0, W, curK; \
-
-#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
-       vst1.32 {tmp0}, [RWK];
-
-
-/*
- * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
- *
- * unsigned int
- * sha1_transform_neon (void *ctx, const unsigned char *data,
- *                      unsigned int nblks)
- */
-.align 3
-ENTRY(sha1_transform_neon)
-  /* input:
-   *   r0: ctx, CTX
-   *   r1: data (64*nblks bytes)
-   *   r2: nblks
-   */
-
-  cmp RNBLKS, #0;
-  beq .Ldo_nothing;
-
-  push {r4-r12, lr};
-  /*vpush {q4-q7};*/
-
-  adr RT3, .LK_VEC;
-
-  mov ROLDSTACK, sp;
-
-  /* Align stack. */
-  sub RT0, sp, #(16*4);
-  and RT0, #(~(16-1));
-  mov sp, RT0;
-
-  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
-
-  /* Get the values of the chaining variables. */
-  ldm RSTATE, {_a-_e};
-
-  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
-
-#undef curK
-#define curK qK1
-  /* Precalc 0-15. */
-  W_PRECALC_00_15();
-
-.Loop:
-  /* Transform 0-15 + Precalc 16-31. */
-  _R( _a, _b, _c, _d, _e, F1,  0,
-      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
-      W4, W5, W6, W7, W0, _, _, _ );
-  _R( _e, _a, _b, _c, _d, F1,  1,
-      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
-      W4, W5, W6, W7, W0, _, _, _ );
-  _R( _d, _e, _a, _b, _c, F1,  2,
-      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
-      W4, W5, W6, W7, W0, _, _, _ );
-  _R( _c, _d, _e, _a, _b, F1,  3,
-      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
-      W4, W5, W6, W7, W0, _, _, _ );
-
-#undef curK
-#define curK qK2
-  _R( _b, _c, _d, _e, _a, F1,  4,
-      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
-      W3, W4, W5, W6, W7, _, _, _ );
-  _R( _a, _b, _c, _d, _e, F1,  5,
-      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
-      W3, W4, W5, W6, W7, _, _, _ );
-  _R( _e, _a, _b, _c, _d, F1,  6,
-      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
-      W3, W4, W5, W6, W7, _, _, _ );
-  _R( _d, _e, _a, _b, _c, F1,  7,
-      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
-      W3, W4, W5, W6, W7, _, _, _ );
-
-  _R( _c, _d, _e, _a, _b, F1,  8,
-      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
-      W2, W3, W4, W5, W6, _, _, _ );
-  _R( _b, _c, _d, _e, _a, F1,  9,
-      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
-      W2, W3, W4, W5, W6, _, _, _ );
-  _R( _a, _b, _c, _d, _e, F1, 10,
-      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
-      W2, W3, W4, W5, W6, _, _, _ );
-  _R( _e, _a, _b, _c, _d, F1, 11,
-      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
-      W2, W3, W4, W5, W6, _, _, _ );
-
-  _R( _d, _e, _a, _b, _c, F1, 12,
-      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
-      W1, W2, W3, W4, W5, _, _, _ );
-  _R( _c, _d, _e, _a, _b, F1, 13,
-      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
-      W1, W2, W3, W4, W5, _, _, _ );
-  _R( _b, _c, _d, _e, _a, F1, 14,
-      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
-      W1, W2, W3, W4, W5, _, _, _ );
-  _R( _a, _b, _c, _d, _e, F1, 15,
-      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
-      W1, W2, W3, W4, W5, _, _, _ );
-
-  /* Transform 16-63 + Precalc 32-79. */
-  _R( _e, _a, _b, _c, _d, F1, 16,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
-      W0, W1, W2, W3, W4, W5, W6, W7);
-  _R( _d, _e, _a, _b, _c, F1, 17,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
-      W0, W1, W2, W3, W4, W5, W6, W7);
-  _R( _c, _d, _e, _a, _b, F1, 18,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
-      W0, W1, W2, W3, W4, W5, W6, W7);
-  _R( _b, _c, _d, _e, _a, F1, 19,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
-      W0, W1, W2, W3, W4, W5, W6, W7);
-
-  _R( _a, _b, _c, _d, _e, F2, 20,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
-      W7, W0, W1, W2, W3, W4, W5, W6);
-  _R( _e, _a, _b, _c, _d, F2, 21,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
-      W7, W0, W1, W2, W3, W4, W5, W6);
-  _R( _d, _e, _a, _b, _c, F2, 22,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
-      W7, W0, W1, W2, W3, W4, W5, W6);
-  _R( _c, _d, _e, _a, _b, F2, 23,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
-      W7, W0, W1, W2, W3, W4, W5, W6);
-
-#undef curK
-#define curK qK3
-  _R( _b, _c, _d, _e, _a, F2, 24,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
-      W6, W7, W0, W1, W2, W3, W4, W5);
-  _R( _a, _b, _c, _d, _e, F2, 25,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
-      W6, W7, W0, W1, W2, W3, W4, W5);
-  _R( _e, _a, _b, _c, _d, F2, 26,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
-      W6, W7, W0, W1, W2, W3, W4, W5);
-  _R( _d, _e, _a, _b, _c, F2, 27,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
-      W6, W7, W0, W1, W2, W3, W4, W5);
-
-  _R( _c, _d, _e, _a, _b, F2, 28,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
-      W5, W6, W7, W0, W1, W2, W3, W4);
-  _R( _b, _c, _d, _e, _a, F2, 29,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
-      W5, W6, W7, W0, W1, W2, W3, W4);
-  _R( _a, _b, _c, _d, _e, F2, 30,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
-      W5, W6, W7, W0, W1, W2, W3, W4);
-  _R( _e, _a, _b, _c, _d, F2, 31,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
-      W5, W6, W7, W0, W1, W2, W3, W4);
-
-  _R( _d, _e, _a, _b, _c, F2, 32,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
-      W4, W5, W6, W7, W0, W1, W2, W3);
-  _R( _c, _d, _e, _a, _b, F2, 33,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
-      W4, W5, W6, W7, W0, W1, W2, W3);
-  _R( _b, _c, _d, _e, _a, F2, 34,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
-      W4, W5, W6, W7, W0, W1, W2, W3);
-  _R( _a, _b, _c, _d, _e, F2, 35,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
-      W4, W5, W6, W7, W0, W1, W2, W3);
-
-  _R( _e, _a, _b, _c, _d, F2, 36,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
-      W3, W4, W5, W6, W7, W0, W1, W2);
-  _R( _d, _e, _a, _b, _c, F2, 37,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
-      W3, W4, W5, W6, W7, W0, W1, W2);
-  _R( _c, _d, _e, _a, _b, F2, 38,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
-      W3, W4, W5, W6, W7, W0, W1, W2);
-  _R( _b, _c, _d, _e, _a, F2, 39,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
-      W3, W4, W5, W6, W7, W0, W1, W2);
-
-  _R( _a, _b, _c, _d, _e, F3, 40,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
-      W2, W3, W4, W5, W6, W7, W0, W1);
-  _R( _e, _a, _b, _c, _d, F3, 41,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
-      W2, W3, W4, W5, W6, W7, W0, W1);
-  _R( _d, _e, _a, _b, _c, F3, 42,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
-      W2, W3, W4, W5, W6, W7, W0, W1);
-  _R( _c, _d, _e, _a, _b, F3, 43,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
-      W2, W3, W4, W5, W6, W7, W0, W1);
-
-#undef curK
-#define curK qK4
-  _R( _b, _c, _d, _e, _a, F3, 44,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
-      W1, W2, W3, W4, W5, W6, W7, W0);
-  _R( _a, _b, _c, _d, _e, F3, 45,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
-      W1, W2, W3, W4, W5, W6, W7, W0);
-  _R( _e, _a, _b, _c, _d, F3, 46,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
-      W1, W2, W3, W4, W5, W6, W7, W0);
-  _R( _d, _e, _a, _b, _c, F3, 47,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
-      W1, W2, W3, W4, W5, W6, W7, W0);
-
-  _R( _c, _d, _e, _a, _b, F3, 48,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
-      W0, W1, W2, W3, W4, W5, W6, W7);
-  _R( _b, _c, _d, _e, _a, F3, 49,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
-      W0, W1, W2, W3, W4, W5, W6, W7);
-  _R( _a, _b, _c, _d, _e, F3, 50,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
-      W0, W1, W2, W3, W4, W5, W6, W7);
-  _R( _e, _a, _b, _c, _d, F3, 51,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
-      W0, W1, W2, W3, W4, W5, W6, W7);
-
-  _R( _d, _e, _a, _b, _c, F3, 52,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
-      W7, W0, W1, W2, W3, W4, W5, W6);
-  _R( _c, _d, _e, _a, _b, F3, 53,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
-      W7, W0, W1, W2, W3, W4, W5, W6);
-  _R( _b, _c, _d, _e, _a, F3, 54,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
-      W7, W0, W1, W2, W3, W4, W5, W6);
-  _R( _a, _b, _c, _d, _e, F3, 55,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
-      W7, W0, W1, W2, W3, W4, W5, W6);
-
-  _R( _e, _a, _b, _c, _d, F3, 56,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
-      W6, W7, W0, W1, W2, W3, W4, W5);
-  _R( _d, _e, _a, _b, _c, F3, 57,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
-      W6, W7, W0, W1, W2, W3, W4, W5);
-  _R( _c, _d, _e, _a, _b, F3, 58,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
-      W6, W7, W0, W1, W2, W3, W4, W5);
-  _R( _b, _c, _d, _e, _a, F3, 59,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
-      W6, W7, W0, W1, W2, W3, W4, W5);
-
-  subs RNBLKS, #1;
-
-  _R( _a, _b, _c, _d, _e, F4, 60,
-      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
-      W5, W6, W7, W0, W1, W2, W3, W4);
-  _R( _e, _a, _b, _c, _d, F4, 61,
-      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
-      W5, W6, W7, W0, W1, W2, W3, W4);
-  _R( _d, _e, _a, _b, _c, F4, 62,
-      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
-      W5, W6, W7, W0, W1, W2, W3, W4);
-  _R( _c, _d, _e, _a, _b, F4, 63,
-      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
-      W5, W6, W7, W0, W1, W2, W3, W4);
-
-  beq .Lend;
-
-  /* Transform 64-79 + Precalc 0-15 of next block. */
-#undef curK
-#define curK qK1
-  _R( _b, _c, _d, _e, _a, F4, 64,
-      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _a, _b, _c, _d, _e, F4, 65,
-      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _e, _a, _b, _c, _d, F4, 66,
-      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _d, _e, _a, _b, _c, F4, 67,
-      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
-  _R( _c, _d, _e, _a, _b, F4, 68,
-      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _b, _c, _d, _e, _a, F4, 69,
-      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _a, _b, _c, _d, _e, F4, 70,
-      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _e, _a, _b, _c, _d, F4, 71,
-      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
-  _R( _d, _e, _a, _b, _c, F4, 72,
-      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _c, _d, _e, _a, _b, F4, 73,
-      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _b, _c, _d, _e, _a, F4, 74,
-      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _a, _b, _c, _d, _e, F4, 75,
-      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-
-  _R( _e, _a, _b, _c, _d, F4, 76,
-      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _d, _e, _a, _b, _c, F4, 77,
-      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _c, _d, _e, _a, _b, F4, 78,
-      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
-  _R( _b, _c, _d, _e, _a, F4, 79,
-      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
-
-  /* Update the chaining variables. */
-  ldm RSTATE, {RT0-RT3};
-  add _a, RT0;
-  ldr RT0, [RSTATE, #state_h4];
-  add _b, RT1;
-  add _c, RT2;
-  add _d, RT3;
-  add _e, RT0;
-  stm RSTATE, {_a-_e};
-
-  b .Loop;
-
-.Lend:
-  /* Transform 64-79 */
-  R( _b, _c, _d, _e, _a, F4, 64 );
-  R( _a, _b, _c, _d, _e, F4, 65 );
-  R( _e, _a, _b, _c, _d, F4, 66 );
-  R( _d, _e, _a, _b, _c, F4, 67 );
-  R( _c, _d, _e, _a, _b, F4, 68 );
-  R( _b, _c, _d, _e, _a, F4, 69 );
-  R( _a, _b, _c, _d, _e, F4, 70 );
-  R( _e, _a, _b, _c, _d, F4, 71 );
-  R( _d, _e, _a, _b, _c, F4, 72 );
-  R( _c, _d, _e, _a, _b, F4, 73 );
-  R( _b, _c, _d, _e, _a, F4, 74 );
-  R( _a, _b, _c, _d, _e, F4, 75 );
-  R( _e, _a, _b, _c, _d, F4, 76 );
-  R( _d, _e, _a, _b, _c, F4, 77 );
-  R( _c, _d, _e, _a, _b, F4, 78 );
-  R( _b, _c, _d, _e, _a, F4, 79 );
-
-  mov sp, ROLDSTACK;
-
-  /* Update the chaining variables. */
-  ldm RSTATE, {RT0-RT3};
-  add _a, RT0;
-  ldr RT0, [RSTATE, #state_h4];
-  add _b, RT1;
-  add _c, RT2;
-  add _d, RT3;
-  /*vpop {q4-q7};*/
-  add _e, RT0;
-  stm RSTATE, {_a-_e};
-
-  pop {r4-r12, pc};
-
-.Ldo_nothing:
-  bx lr
-ENDPROC(sha1_transform_neon)
diff --git a/arch/arm/crypto/sha1-ce-core.S b/arch/arm/crypto/sha1-ce-core.S
deleted file mode 100644
index 8a702e0..0000000
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-       .text
-       .arch           armv8-a
-       .fpu            crypto-neon-fp-armv8
-
-       k0              .req    q0
-       k1              .req    q1
-       k2              .req    q2
-       k3              .req    q3
-
-       ta0             .req    q4
-       ta1             .req    q5
-       tb0             .req    q5
-       tb1             .req    q4
-
-       dga             .req    q6
-       dgb             .req    q7
-       dgbs            .req    s28
-
-       dg0             .req    q12
-       dg1a0           .req    q13
-       dg1a1           .req    q14
-       dg1b0           .req    q14
-       dg1b1           .req    q13
-
-       .macro          add_only, op, ev, rc, s0, dg1
-       .ifnb           \s0
-       vadd.u32        tb\ev, q\s0, \rc
-       .endif
-       sha1h.32        dg1b\ev, dg0
-       .ifb            \dg1
-       sha1\op\().32   dg0, dg1a\ev, ta\ev
-       .else
-       sha1\op\().32   dg0, \dg1, ta\ev
-       .endif
-       .endm
-
-       .macro          add_update, op, ev, rc, s0, s1, s2, s3, dg1
-       sha1su0.32      q\s0, q\s1, q\s2
-       add_only        \op, \ev, \rc, \s1, \dg1
-       sha1su1.32      q\s0, q\s3
-       .endm
-
-       .align          6
-.Lsha1_rcon:
-       .word           0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
-       .word           0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
-       .word           0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
-       .word           0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
-
-       /*
-        * void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
-        *                        int blocks);
-        */
-ENTRY(sha1_ce_transform)
-       /* load round constants */
-       adr             ip, .Lsha1_rcon
-       vld1.32         {k0-k1}, [ip, :128]!
-       vld1.32         {k2-k3}, [ip, :128]
-
-       /* load state */
-       vld1.32         {dga}, [r0]
-       vldr            dgbs, [r0, #16]
-
-       /* load input */
-0:     vld1.32         {q8-q9}, [r1]!
-       vld1.32         {q10-q11}, [r1]!
-       subs            r2, r2, #1
-
-#ifndef CONFIG_CPU_BIG_ENDIAN
-       vrev32.8        q8, q8
-       vrev32.8        q9, q9
-       vrev32.8        q10, q10
-       vrev32.8        q11, q11
-#endif
-
-       vadd.u32        ta0, q8, k0
-       vmov            dg0, dga
-
-       add_update      c, 0, k0,  8,  9, 10, 11, dgb
-       add_update      c, 1, k0,  9, 10, 11,  8
-       add_update      c, 0, k0, 10, 11,  8,  9
-       add_update      c, 1, k0, 11,  8,  9, 10
-       add_update      c, 0, k1,  8,  9, 10, 11
-
-       add_update      p, 1, k1,  9, 10, 11,  8
-       add_update      p, 0, k1, 10, 11,  8,  9
-       add_update      p, 1, k1, 11,  8,  9, 10
-       add_update      p, 0, k1,  8,  9, 10, 11
-       add_update      p, 1, k2,  9, 10, 11,  8
-
-       add_update      m, 0, k2, 10, 11,  8,  9
-       add_update      m, 1, k2, 11,  8,  9, 10
-       add_update      m, 0, k2,  8,  9, 10, 11
-       add_update      m, 1, k2,  9, 10, 11,  8
-       add_update      m, 0, k3, 10, 11,  8,  9
-
-       add_update      p, 1, k3, 11,  8,  9, 10
-       add_only        p, 0, k3,  9
-       add_only        p, 1, k3, 10
-       add_only        p, 0, k3, 11
-       add_only        p, 1
-
-       /* update state */
-       vadd.u32        dga, dga, dg0
-       vadd.u32        dgb, dgb, dg1a0
-       bne             0b
-
-       /* store new state */
-       vst1.32         {dga}, [r0]
-       vstr            dgbs, [r0, #16]
-       bx              lr
-ENDPROC(sha1_ce_transform)
diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c
deleted file mode 100644
index fac07a4..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-asmlinkage void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
-                                 int blocks);
-
-static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
-                         unsigned int len)
-{
-       int remain;
-
-       kernel_neon_begin();
-       remain = sha1_base_do_update_blocks(desc, data, len, sha1_ce_transform);
-       kernel_neon_end();
-
-       return remain;
-}
-
-static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
-                        unsigned int len, u8 *out)
-{
-       kernel_neon_begin();
-       sha1_base_do_finup(desc, data, len, sha1_ce_transform);
-       kernel_neon_end();
-
-       return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
-       .init                   = sha1_base_init,
-       .update                 = sha1_ce_update,
-       .finup                  = sha1_ce_finup,
-       .descsize               = SHA1_STATE_SIZE,
-       .digestsize             = SHA1_DIGEST_SIZE,
-       .base                   = {
-               .cra_name               = "sha1",
-               .cra_driver_name        = "sha1-ce",
-               .cra_priority           = 200,
-               .cra_flags              = CRYPTO_AHASH_ALG_BLOCK_ONLY,
-               .cra_blocksize          = SHA1_BLOCK_SIZE,
-               .cra_module             = THIS_MODULE,
-       }
-};
-
-static int __init sha1_ce_mod_init(void)
-{
-       return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_ce_mod_fini(void)
-{
-       crypto_unregister_shash(&alg);
-}
-
-module_cpu_feature_match(SHA1, sha1_ce_mod_init);
-module_exit(sha1_ce_mod_fini);
diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c
deleted file mode 100644 (file)
index 255da00..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Cryptographic API.
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementation
- *
- * This file is based on sha1_generic.c and sha1_ssse3_glue.c
- *
- * Copyright (c) Alan Smithee.
- * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- * Copyright (c) Mathias Krause <minipli@googlemail.com>
- */
-
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha1_block_data_order(struct sha1_state *digest,
-               const u8 *data, int rounds);
-
-static int sha1_update_arm(struct shash_desc *desc, const u8 *data,
-                          unsigned int len)
-{
-       /* make sure signature matches sha1_block_fn() */
-       BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
-
-       return sha1_base_do_update_blocks(desc, data, len,
-                                         sha1_block_data_order);
-}
-
-static int sha1_finup_arm(struct shash_desc *desc, const u8 *data,
-                         unsigned int len, u8 *out)
-{
-       sha1_base_do_finup(desc, data, len, sha1_block_data_order);
-       return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
-       .digestsize     =       SHA1_DIGEST_SIZE,
-       .init           =       sha1_base_init,
-       .update         =       sha1_update_arm,
-       .finup          =       sha1_finup_arm,
-       .descsize       =       SHA1_STATE_SIZE,
-       .base           =       {
-               .cra_name       =       "sha1",
-               .cra_driver_name=       "sha1-asm",
-               .cra_priority   =       150,
-               .cra_flags      =       CRYPTO_AHASH_ALG_BLOCK_ONLY,
-               .cra_blocksize  =       SHA1_BLOCK_SIZE,
-               .cra_module     =       THIS_MODULE,
-       }
-};
-
-
-static int __init sha1_mod_init(void)
-{
-       return crypto_register_shash(&alg);
-}
-
-
-static void __exit sha1_mod_fini(void)
-{
-       crypto_unregister_shash(&alg);
-}
-
-
-module_init(sha1_mod_init);
-module_exit(sha1_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (ARM)");
-MODULE_ALIAS_CRYPTO("sha1");
-MODULE_AUTHOR("David McCullough <ucdevel@gmail.com>");
diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c
deleted file mode 100644 (file)
index d321850..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
- * ARM NEON instructions.
- *
- * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This file is based on sha1_generic.c and sha1_ssse3_glue.c:
- *  Copyright (c) Alan Smithee.
- *  Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
- *  Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
- *  Copyright (c) Mathias Krause <minipli@googlemail.com>
- *  Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha1.h>
-#include <crypto/sha1_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha1_transform_neon(struct sha1_state *state_h,
-                                   const u8 *data, int rounds);
-
-static int sha1_neon_update(struct shash_desc *desc, const u8 *data,
-                         unsigned int len)
-{
-       int remain;
-
-       kernel_neon_begin();
-       remain = sha1_base_do_update_blocks(desc, data, len,
-                                           sha1_transform_neon);
-       kernel_neon_end();
-
-       return remain;
-}
-
-static int sha1_neon_finup(struct shash_desc *desc, const u8 *data,
-                          unsigned int len, u8 *out)
-{
-       kernel_neon_begin();
-       sha1_base_do_finup(desc, data, len, sha1_transform_neon);
-       kernel_neon_end();
-
-       return sha1_base_finish(desc, out);
-}
-
-static struct shash_alg alg = {
-       .digestsize     =       SHA1_DIGEST_SIZE,
-       .init           =       sha1_base_init,
-       .update         =       sha1_neon_update,
-       .finup          =       sha1_neon_finup,
-       .descsize               = SHA1_STATE_SIZE,
-       .base           =       {
-               .cra_name               = "sha1",
-               .cra_driver_name        = "sha1-neon",
-               .cra_priority           = 250,
-               .cra_flags              = CRYPTO_AHASH_ALG_BLOCK_ONLY,
-               .cra_blocksize          = SHA1_BLOCK_SIZE,
-               .cra_module             = THIS_MODULE,
-       }
-};
-
-static int __init sha1_neon_mod_init(void)
-{
-       if (!cpu_has_neon())
-               return -ENODEV;
-
-       return crypto_register_shash(&alg);
-}
-
-static void __exit sha1_neon_mod_fini(void)
-{
-       crypto_unregister_shash(&alg);
-}
-
-module_init(sha1_neon_mod_init);
-module_exit(sha1_neon_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, NEON accelerated");
-MODULE_ALIAS_CRYPTO("sha1");
index 5aaf484fc9de43654deca5b60368a8a7beb76456..519c5d6a050fddff2941b83202f8f99b0d55b1e1 100644 (file)
@@ -146,6 +146,7 @@ config CRYPTO_LIB_SHA1
 config CRYPTO_LIB_SHA1_ARCH
        bool
        depends on CRYPTO_LIB_SHA1 && !UML
+       default y if ARM
 
 config CRYPTO_LIB_SHA256
        tristate
index 0eb0906d693f2f0de6d7f51beb2a536594fee807..699a421339271cf926b3b646df116a67a4691d98 100644 (file)
@@ -71,6 +71,11 @@ obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
 libsha1-y := sha1.o
 ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y)
 CFLAGS_sha1.o += -I$(src)/$(SRCARCH)
+ifeq ($(CONFIG_ARM),y)
+libsha1-y += arm/sha1-armv4-large.o
+libsha1-$(CONFIG_KERNEL_MODE_NEON) += arm/sha1-armv7-neon.o \
+                                     arm/sha1-ce-core.o
+endif
 endif # CONFIG_CRYPTO_LIB_SHA1_ARCH
 
 ################################################################################
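Editor's note: with CONFIG_CRYPTO_LIB_SHA1_ARCH=y, the Makefile hunk above both links the ARM assembly objects into libsha1.o and adds -I$(src)/$(SRCARCH), so the generic lib/crypto/sha1.c can include the per-architecture header added below. A minimal sketch of that include pattern, assuming it follows the same convention as the other lib/crypto hash code — lib/crypto/sha1.c itself is not part of this diff, so the guard and the fallback name shown here are assumptions:

/*
 * Sketch only: how the generic SHA-1 library code is assumed to pull in
 * the arch hook.  The -I$(src)/$(SRCARCH) flag added above makes "sha1.h"
 * resolve to lib/crypto/arm/sha1.h on ARM builds.
 */
#ifdef CONFIG_CRYPTO_LIB_SHA1_ARCH
#include "sha1.h"                       /* lib/crypto/$(SRCARCH)/sha1.h */
#else
#define sha1_blocks sha1_blocks_generic /* assumed name of the C fallback */
#endif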
diff --git a/lib/crypto/arm/sha1-armv4-large.S b/lib/crypto/arm/sha1-armv4-large.S
new file mode 100644 (file)
index 0000000..1c8b685
--- /dev/null
@@ -0,0 +1,507 @@
+#define __ARM_ARCH__ __LINUX_ARM_ARCH__
+@ SPDX-License-Identifier: GPL-2.0
+
+@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
+@ has relicensed it under the GPLv2. Therefore this program is free software;
+@ you can redistribute it and/or modify it under the terms of the GNU General
+@ Public License version 2 as published by the Free Software Foundation.
+@
+@ The original headers, including the original license headers, are
+@ included below for completeness.
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see https://www.openssl.org/~appro/cryptogams/.
+@ ====================================================================
+
+@ sha1_block procedure for ARMv4.
+@
+@ January 2007.
+
+@ Size/performance trade-off
+@ ====================================================================
+@ impl         size in bytes   comp cycles[*]  measured performance
+@ ====================================================================
+@ thumb                304             3212            4420
+@ armv4-small  392/+29%        1958/+64%       2250/+96%
+@ armv4-compact        740/+89%        1552/+26%       1840/+22%
+@ armv4-large  1420/+92%       1307/+19%       1370/+34%[***]
+@ full unroll  ~5100/+260%     ~1260/+4%       ~1300/+5%
+@ ====================================================================
+@ thumb                = same as 'small' but in Thumb instructions[**] and
+@                with recurring code in two private functions;
+@ small                = detached Xload/update, loops are folded;
+@ compact      = detached Xload/update, 5x unroll;
+@ large                = interleaved Xload/update, 5x unroll;
+@ full unroll  = interleaved Xload/update, full unroll, estimated[!];
+@
+@ [*]  Manually counted instructions in "grand" loop body. Measured
+@      performance is affected by prologue and epilogue overhead,
+@      i-cache availability, branch penalties, etc.
+@ [**] While each Thumb instruction is twice smaller, they are not as
+@      diverse as ARM ones: e.g., there are only two arithmetic
+@      instructions with 3 arguments, no [fixed] rotate, addressing
+@      modes are limited. As result it takes more instructions to do
+@      the same job in Thumb, therefore the code is never twice as
+@      small and always slower.
+@ [***]        which is also ~35% better than compiler generated code. Dual-
+@      issue Cortex A8 core was measured to process input block in
+@      ~990 cycles.
+
+@ August 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
+@ Cortex A8 core and in absolute terms ~870 cycles per input block
+@ [or 13.6 cycles per byte].
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 10%
+@ improvement on Cortex A8 core and 12.2 cycles per byte.
+
+#include <linux/linkage.h>
+
+.text
+
+.align 2
+ENTRY(sha1_block_data_order)
+       stmdb   sp!,{r4-r12,lr}
+       add     r2,r1,r2,lsl#6  @ r2 to point at the end of r1
+       ldmia   r0,{r3,r4,r5,r6,r7}
+.Lloop:
+       ldr     r8,.LK_00_19
+       mov     r14,sp
+       sub     sp,sp,#15*4
+       mov     r5,r5,ror#30
+       mov     r6,r6,ror#30
+       mov     r7,r7,ror#30            @ [6]
+.L_00_15:
+#if __ARM_ARCH__<7
+       ldrb    r10,[r1,#2]
+       ldrb    r9,[r1,#3]
+       ldrb    r11,[r1,#1]
+       add     r7,r8,r7,ror#2                  @ E+=K_00_19
+       ldrb    r12,[r1],#4
+       orr     r9,r9,r10,lsl#8
+       eor     r10,r5,r6                       @ F_xx_xx
+       orr     r9,r9,r11,lsl#16
+       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
+       orr     r9,r9,r12,lsl#24
+#else
+       ldr     r9,[r1],#4                      @ handles unaligned
+       add     r7,r8,r7,ror#2                  @ E+=K_00_19
+       eor     r10,r5,r6                       @ F_xx_xx
+       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+       rev     r9,r9                           @ byte swap
+#endif
+#endif
+       and     r10,r4,r10,ror#2
+       add     r7,r7,r9                        @ E+=X[i]
+       eor     r10,r10,r6,ror#2                @ F_00_19(B,C,D)
+       str     r9,[r14,#-4]!
+       add     r7,r7,r10                       @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+       ldrb    r10,[r1,#2]
+       ldrb    r9,[r1,#3]
+       ldrb    r11,[r1,#1]
+       add     r6,r8,r6,ror#2                  @ E+=K_00_19
+       ldrb    r12,[r1],#4
+       orr     r9,r9,r10,lsl#8
+       eor     r10,r4,r5                       @ F_xx_xx
+       orr     r9,r9,r11,lsl#16
+       add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
+       orr     r9,r9,r12,lsl#24
+#else
+       ldr     r9,[r1],#4                      @ handles unaligned
+       add     r6,r8,r6,ror#2                  @ E+=K_00_19
+       eor     r10,r4,r5                       @ F_xx_xx
+       add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+       rev     r9,r9                           @ byte swap
+#endif
+#endif
+       and     r10,r3,r10,ror#2
+       add     r6,r6,r9                        @ E+=X[i]
+       eor     r10,r10,r5,ror#2                @ F_00_19(B,C,D)
+       str     r9,[r14,#-4]!
+       add     r6,r6,r10                       @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+       ldrb    r10,[r1,#2]
+       ldrb    r9,[r1,#3]
+       ldrb    r11,[r1,#1]
+       add     r5,r8,r5,ror#2                  @ E+=K_00_19
+       ldrb    r12,[r1],#4
+       orr     r9,r9,r10,lsl#8
+       eor     r10,r3,r4                       @ F_xx_xx
+       orr     r9,r9,r11,lsl#16
+       add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
+       orr     r9,r9,r12,lsl#24
+#else
+       ldr     r9,[r1],#4                      @ handles unaligned
+       add     r5,r8,r5,ror#2                  @ E+=K_00_19
+       eor     r10,r3,r4                       @ F_xx_xx
+       add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+       rev     r9,r9                           @ byte swap
+#endif
+#endif
+       and     r10,r7,r10,ror#2
+       add     r5,r5,r9                        @ E+=X[i]
+       eor     r10,r10,r4,ror#2                @ F_00_19(B,C,D)
+       str     r9,[r14,#-4]!
+       add     r5,r5,r10                       @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+       ldrb    r10,[r1,#2]
+       ldrb    r9,[r1,#3]
+       ldrb    r11,[r1,#1]
+       add     r4,r8,r4,ror#2                  @ E+=K_00_19
+       ldrb    r12,[r1],#4
+       orr     r9,r9,r10,lsl#8
+       eor     r10,r7,r3                       @ F_xx_xx
+       orr     r9,r9,r11,lsl#16
+       add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
+       orr     r9,r9,r12,lsl#24
+#else
+       ldr     r9,[r1],#4                      @ handles unaligned
+       add     r4,r8,r4,ror#2                  @ E+=K_00_19
+       eor     r10,r7,r3                       @ F_xx_xx
+       add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+       rev     r9,r9                           @ byte swap
+#endif
+#endif
+       and     r10,r6,r10,ror#2
+       add     r4,r4,r9                        @ E+=X[i]
+       eor     r10,r10,r3,ror#2                @ F_00_19(B,C,D)
+       str     r9,[r14,#-4]!
+       add     r4,r4,r10                       @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+       ldrb    r10,[r1,#2]
+       ldrb    r9,[r1,#3]
+       ldrb    r11,[r1,#1]
+       add     r3,r8,r3,ror#2                  @ E+=K_00_19
+       ldrb    r12,[r1],#4
+       orr     r9,r9,r10,lsl#8
+       eor     r10,r6,r7                       @ F_xx_xx
+       orr     r9,r9,r11,lsl#16
+       add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
+       orr     r9,r9,r12,lsl#24
+#else
+       ldr     r9,[r1],#4                      @ handles unaligned
+       add     r3,r8,r3,ror#2                  @ E+=K_00_19
+       eor     r10,r6,r7                       @ F_xx_xx
+       add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+       rev     r9,r9                           @ byte swap
+#endif
+#endif
+       and     r10,r5,r10,ror#2
+       add     r3,r3,r9                        @ E+=X[i]
+       eor     r10,r10,r7,ror#2                @ F_00_19(B,C,D)
+       str     r9,[r14,#-4]!
+       add     r3,r3,r10                       @ E+=F_00_19(B,C,D)
+       cmp     r14,sp
+       bne     .L_00_15                @ [((11+4)*5+2)*3]
+       sub     sp,sp,#25*4
+#if __ARM_ARCH__<7
+       ldrb    r10,[r1,#2]
+       ldrb    r9,[r1,#3]
+       ldrb    r11,[r1,#1]
+       add     r7,r8,r7,ror#2                  @ E+=K_00_19
+       ldrb    r12,[r1],#4
+       orr     r9,r9,r10,lsl#8
+       eor     r10,r5,r6                       @ F_xx_xx
+       orr     r9,r9,r11,lsl#16
+       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
+       orr     r9,r9,r12,lsl#24
+#else
+       ldr     r9,[r1],#4                      @ handles unaligned
+       add     r7,r8,r7,ror#2                  @ E+=K_00_19
+       eor     r10,r5,r6                       @ F_xx_xx
+       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+       rev     r9,r9                           @ byte swap
+#endif
+#endif
+       and     r10,r4,r10,ror#2
+       add     r7,r7,r9                        @ E+=X[i]
+       eor     r10,r10,r6,ror#2                @ F_00_19(B,C,D)
+       str     r9,[r14,#-4]!
+       add     r7,r7,r10                       @ E+=F_00_19(B,C,D)
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r4,r5                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       and r10,r3,r10,ror#2                                    @ F_xx_xx
+                                               @ F_xx_xx
+       add     r6,r6,r9                        @ E+=X[i]
+       eor     r10,r10,r5,ror#2                @ F_00_19(B,C,D)
+       add     r6,r6,r10                       @ E+=F_00_19(B,C,D)
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r3,r4                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       and r10,r7,r10,ror#2                                    @ F_xx_xx
+                                               @ F_xx_xx
+       add     r5,r5,r9                        @ E+=X[i]
+       eor     r10,r10,r4,ror#2                @ F_00_19(B,C,D)
+       add     r5,r5,r10                       @ E+=F_00_19(B,C,D)
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r7,r3                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       and r10,r6,r10,ror#2                                    @ F_xx_xx
+                                               @ F_xx_xx
+       add     r4,r4,r9                        @ E+=X[i]
+       eor     r10,r10,r3,ror#2                @ F_00_19(B,C,D)
+       add     r4,r4,r10                       @ E+=F_00_19(B,C,D)
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r6,r7                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       and r10,r5,r10,ror#2                                    @ F_xx_xx
+                                               @ F_xx_xx
+       add     r3,r3,r9                        @ E+=X[i]
+       eor     r10,r10,r7,ror#2                @ F_00_19(B,C,D)
+       add     r3,r3,r10                       @ E+=F_00_19(B,C,D)
+
+       ldr     r8,.LK_20_39            @ [+15+16*4]
+       cmn     sp,#0                   @ [+3], clear carry to denote 20_39
+.L_20_39_or_60_79:
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r7,r8,r7,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r5,r6                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       eor r10,r4,r10,ror#2                                    @ F_xx_xx
+                                               @ F_xx_xx
+       add     r7,r7,r9                        @ E+=X[i]
+       add     r7,r7,r10                       @ E+=F_20_39(B,C,D)
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r4,r5                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       eor r10,r3,r10,ror#2                                    @ F_xx_xx
+                                               @ F_xx_xx
+       add     r6,r6,r9                        @ E+=X[i]
+       add     r6,r6,r10                       @ E+=F_20_39(B,C,D)
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r3,r4                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       eor r10,r7,r10,ror#2                                    @ F_xx_xx
+                                               @ F_xx_xx
+       add     r5,r5,r9                        @ E+=X[i]
+       add     r5,r5,r10                       @ E+=F_20_39(B,C,D)
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r7,r3                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       eor r10,r6,r10,ror#2                                    @ F_xx_xx
+                                               @ F_xx_xx
+       add     r4,r4,r9                        @ E+=X[i]
+       add     r4,r4,r10                       @ E+=F_20_39(B,C,D)
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r6,r7                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       eor r10,r5,r10,ror#2                                    @ F_xx_xx
+                                               @ F_xx_xx
+       add     r3,r3,r9                        @ E+=X[i]
+       add     r3,r3,r10                       @ E+=F_20_39(B,C,D)
+ ARM(  teq     r14,sp          )       @ preserve carry
+ THUMB(        mov     r11,sp          )
+ THUMB(        teq     r14,r11         )       @ preserve carry
+       bne     .L_20_39_or_60_79       @ [+((12+3)*5+2)*4]
+       bcs     .L_done                 @ [+((12+3)*5+2)*4], spare 300 bytes
+
+       ldr     r8,.LK_40_59
+       sub     sp,sp,#20*4             @ [+2]
+.L_40_59:
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r7,r8,r7,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r5,r6                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r7,r7,r3,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       and r10,r4,r10,ror#2                                    @ F_xx_xx
+       and r11,r5,r6                                   @ F_xx_xx
+       add     r7,r7,r9                        @ E+=X[i]
+       add     r7,r7,r10                       @ E+=F_40_59(B,C,D)
+       add     r7,r7,r11,ror#2
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r6,r8,r6,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r4,r5                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r6,r6,r7,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       and r10,r3,r10,ror#2                                    @ F_xx_xx
+       and r11,r4,r5                                   @ F_xx_xx
+       add     r6,r6,r9                        @ E+=X[i]
+       add     r6,r6,r10                       @ E+=F_40_59(B,C,D)
+       add     r6,r6,r11,ror#2
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r5,r8,r5,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r3,r4                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r5,r5,r6,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       and r10,r7,r10,ror#2                                    @ F_xx_xx
+       and r11,r3,r4                                   @ F_xx_xx
+       add     r5,r5,r9                        @ E+=X[i]
+       add     r5,r5,r10                       @ E+=F_40_59(B,C,D)
+       add     r5,r5,r11,ror#2
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r4,r8,r4,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r7,r3                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r4,r4,r5,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       and r10,r6,r10,ror#2                                    @ F_xx_xx
+       and r11,r7,r3                                   @ F_xx_xx
+       add     r4,r4,r9                        @ E+=X[i]
+       add     r4,r4,r10                       @ E+=F_40_59(B,C,D)
+       add     r4,r4,r11,ror#2
+       ldr     r9,[r14,#15*4]
+       ldr     r10,[r14,#13*4]
+       ldr     r11,[r14,#7*4]
+       add     r3,r8,r3,ror#2                  @ E+=K_xx_xx
+       ldr     r12,[r14,#2*4]
+       eor     r9,r9,r10
+       eor     r11,r11,r12                     @ 1 cycle stall
+       eor     r10,r6,r7                       @ F_xx_xx
+       mov     r9,r9,ror#31
+       add     r3,r3,r4,ror#27                 @ E+=ROR(A,27)
+       eor     r9,r9,r11,ror#31
+       str     r9,[r14,#-4]!
+       and r10,r5,r10,ror#2                                    @ F_xx_xx
+       and r11,r6,r7                                   @ F_xx_xx
+       add     r3,r3,r9                        @ E+=X[i]
+       add     r3,r3,r10                       @ E+=F_40_59(B,C,D)
+       add     r3,r3,r11,ror#2
+       cmp     r14,sp
+       bne     .L_40_59                @ [+((12+5)*5+2)*4]
+
+       ldr     r8,.LK_60_79
+       sub     sp,sp,#20*4
+       cmp     sp,#0                   @ set carry to denote 60_79
+       b       .L_20_39_or_60_79       @ [+4], spare 300 bytes
+.L_done:
+       add     sp,sp,#80*4             @ "deallocate" stack frame
+       ldmia   r0,{r8,r9,r10,r11,r12}
+       add     r3,r8,r3
+       add     r4,r9,r4
+       add     r5,r10,r5,ror#2
+       add     r6,r11,r6,ror#2
+       add     r7,r12,r7,ror#2
+       stmia   r0,{r3,r4,r5,r6,r7}
+       teq     r1,r2
+       bne     .Lloop                  @ [+18], total 1307
+
+       ldmia   sp!,{r4-r12,pc}
+.align 2
+.LK_00_19:     .word   0x5a827999
+.LK_20_39:     .word   0x6ed9eba1
+.LK_40_59:     .word   0x8f1bbcdc
+.LK_60_79:     .word   0xca62c1d6
+ENDPROC(sha1_block_data_order)
+.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S
new file mode 100644 (file)
index 0000000..6edba3a
--- /dev/null
@@ -0,0 +1,633 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.syntax unified
+.fpu neon
+
+.text
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+#define K1  0x5A827999
+#define K2  0x6ED9EBA1
+#define K3  0x8F1BBCDC
+#define K4  0xCA62C1D6
+.align 4
+.LK_VEC:
+.LK1:  .long K1, K1, K1, K1
+.LK2:  .long K2, K2, K2, K2
+.LK3:  .long K3, K3, K3, K3
+.LK4:  .long K4, K4, K4, K4
+
+
+/* Register macros */
+
+#define RSTATE r0
+#define RDATA r1
+#define RNBLKS r2
+#define ROLDSTACK r3
+#define RWK lr
+
+#define _a r4
+#define _b r5
+#define _c r6
+#define _d r7
+#define _e r8
+
+#define RT0 r9
+#define RT1 r10
+#define RT2 r11
+#define RT3 r12
+
+#define W0 q0
+#define W1 q7
+#define W2 q2
+#define W3 q3
+#define W4 q4
+#define W5 q6
+#define W6 q5
+#define W7 q1
+
+#define tmp0 q8
+#define tmp1 q9
+#define tmp2 q10
+#define tmp3 q11
+
+#define qK1 q12
+#define qK2 q13
+#define qK3 q14
+#define qK4 q15
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+#define ARM_LE(code...)
+#else
+#define ARM_LE(code...)                code
+#endif
+
+/* Round function macros. */
+
+#define WK_offs(i) (((i) & 15) * 4)
+
+#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+             W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       ldr RT3, [sp, WK_offs(i)]; \
+               pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+       bic RT0, d, b; \
+       add e, e, a, ror #(32 - 5); \
+       and RT1, c, b; \
+               pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+       add RT0, RT0, RT3; \
+       add e, e, RT1; \
+       ror b, #(32 - 30); \
+               pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+       add e, e, RT0;
+
+#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+             W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       ldr RT3, [sp, WK_offs(i)]; \
+               pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+       eor RT0, d, b; \
+       add e, e, a, ror #(32 - 5); \
+       eor RT0, RT0, c; \
+               pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+       add e, e, RT3; \
+       ror b, #(32 - 30); \
+               pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+       add e, e, RT0; \
+
+#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+             W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       ldr RT3, [sp, WK_offs(i)]; \
+               pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+       eor RT0, b, c; \
+       and RT1, b, c; \
+       add e, e, a, ror #(32 - 5); \
+               pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+       and RT0, RT0, d; \
+       add RT1, RT1, RT3; \
+       add e, e, RT0; \
+       ror b, #(32 - 30); \
+               pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+       add e, e, RT1;
+
+#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+             W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+             W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
+           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       _R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define R(a,b,c,d,e,f,i) \
+       _R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
+              W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define dummy(...)
+
+
+/* Input expansion macros. */
+
+/********* Precalc macros for rounds 0-15 *************************************/
+
+#define W_PRECALC_00_15() \
+       add       RWK, sp, #(WK_offs(0));                       \
+       \
+       vld1.32   {W0, W7}, [RDATA]!;                           \
+ ARM_LE(vrev32.8  W0, W0;      )       /* big => little */     \
+       vld1.32   {W6, W5}, [RDATA]!;                           \
+       vadd.u32  tmp0, W0, curK;                               \
+ ARM_LE(vrev32.8  W7, W7;      )       /* big => little */     \
+ ARM_LE(vrev32.8  W6, W6;      )       /* big => little */     \
+       vadd.u32  tmp1, W7, curK;                               \
+ ARM_LE(vrev32.8  W5, W5;      )       /* big => little */     \
+       vadd.u32  tmp2, W6, curK;                               \
+       vst1.32   {tmp0, tmp1}, [RWK]!;                         \
+       vadd.u32  tmp3, W5, curK;                               \
+       vst1.32   {tmp2, tmp3}, [RWK];                          \
+
+#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vld1.32   {W0, W7}, [RDATA]!;                           \
+
+#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       add       RWK, sp, #(WK_offs(0));                       \
+
+#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8  W0, W0;      )       /* big => little */     \
+
+#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vld1.32   {W6, W5}, [RDATA]!;                           \
+
+#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vadd.u32  tmp0, W0, curK;                               \
+
+#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8  W7, W7;      )       /* big => little */     \
+
+#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8  W6, W6;      )       /* big => little */     \
+
+#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vadd.u32  tmp1, W7, curK;                               \
+
+#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+ ARM_LE(vrev32.8  W5, W5;      )       /* big => little */     \
+
+#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vadd.u32  tmp2, W6, curK;                               \
+
+#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vst1.32   {tmp0, tmp1}, [RWK]!;                         \
+
+#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vadd.u32  tmp3, W5, curK;                               \
+
+#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vst1.32   {tmp2, tmp3}, [RWK];                          \
+
+
+/********* Precalc macros for rounds 16-31 ************************************/
+
+#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       veor      tmp0, tmp0;                   \
+       vext.8    W, W_m16, W_m12, #8;          \
+
+#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       add       RWK, sp, #(WK_offs(i));       \
+       vext.8    tmp0, W_m04, tmp0, #4;        \
+
+#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       veor      tmp0, tmp0, W_m16;            \
+       veor.32   W, W, W_m08;                  \
+
+#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       veor      tmp1, tmp1;                   \
+       veor      W, W, tmp0;                   \
+
+#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vshl.u32  tmp0, W, #1;                  \
+
+#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vext.8    tmp1, tmp1, W, #(16-12);      \
+       vshr.u32  W, W, #31;                    \
+
+#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vorr      tmp0, tmp0, W;                \
+       vshr.u32  W, tmp1, #30;                 \
+
+#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vshl.u32  tmp1, tmp1, #2;               \
+
+#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       veor      tmp0, tmp0, W;                \
+
+#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       veor      W, tmp0, tmp1;                \
+
+#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vadd.u32  tmp0, W, curK;                \
+
+#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vst1.32   {tmp0}, [RWK];
+
+
+/********* Precalc macros for rounds 32-79 ************************************/
+
+#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       veor W, W_m28; \
+
+#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vext.8 tmp0, W_m08, W_m04, #8; \
+
+#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       veor W, W_m16; \
+
+#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       veor W, tmp0; \
+
+#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       add RWK, sp, #(WK_offs(i&~3)); \
+
+#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vshl.u32 tmp1, W, #2; \
+
+#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vshr.u32 tmp0, W, #30; \
+
+#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vorr W, tmp0, tmp1; \
+
+#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+       vst1.32 {tmp0}, [RWK];
+
+
+/*
+ * Transform nblocks*64 bytes (nblocks*16 32-bit words) at DATA.
+ *
+ * void sha1_transform_neon(struct sha1_block_state *state,
+ *                         const u8 *data, size_t nblocks);
+ */
+.align 3
+ENTRY(sha1_transform_neon)
+  /* input:
+   *   r0: state
+   *   r1: data (64*nblocks bytes)
+   *   r2: nblocks
+   */
+
+  cmp RNBLKS, #0;
+  beq .Ldo_nothing;
+
+  push {r4-r12, lr};
+  /*vpush {q4-q7};*/
+
+  adr RT3, .LK_VEC;
+
+  mov ROLDSTACK, sp;
+
+  /* Align stack. */
+  sub RT0, sp, #(16*4);
+  and RT0, #(~(16-1));
+  mov sp, RT0;
+
+  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
+
+  /* Get the values of the chaining variables. */
+  ldm RSTATE, {_a-_e};
+
+  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
+
+#undef curK
+#define curK qK1
+  /* Precalc 0-15. */
+  W_PRECALC_00_15();
+
+.Loop:
+  /* Transform 0-15 + Precalc 16-31. */
+  _R( _a, _b, _c, _d, _e, F1,  0,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
+      W4, W5, W6, W7, W0, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F1,  1,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
+      W4, W5, W6, W7, W0, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F1,  2,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
+      W4, W5, W6, W7, W0, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F1,  3,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
+      W4, W5, W6, W7, W0, _, _, _ );
+
+#undef curK
+#define curK qK2
+  _R( _b, _c, _d, _e, _a, F1,  4,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
+      W3, W4, W5, W6, W7, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F1,  5,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
+      W3, W4, W5, W6, W7, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F1,  6,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
+      W3, W4, W5, W6, W7, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F1,  7,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
+      W3, W4, W5, W6, W7, _, _, _ );
+
+  _R( _c, _d, _e, _a, _b, F1,  8,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
+      W2, W3, W4, W5, W6, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F1,  9,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
+      W2, W3, W4, W5, W6, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F1, 10,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
+      W2, W3, W4, W5, W6, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F1, 11,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
+      W2, W3, W4, W5, W6, _, _, _ );
+
+  _R( _d, _e, _a, _b, _c, F1, 12,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
+      W1, W2, W3, W4, W5, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F1, 13,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
+      W1, W2, W3, W4, W5, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F1, 14,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
+      W1, W2, W3, W4, W5, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F1, 15,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
+      W1, W2, W3, W4, W5, _, _, _ );
+
+  /* Transform 16-63 + Precalc 32-79. */
+  _R( _e, _a, _b, _c, _d, F1, 16,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _d, _e, _a, _b, _c, F1, 17,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _c, _d, _e, _a, _b, F1, 18,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _b, _c, _d, _e, _a, F1, 19,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+
+  _R( _a, _b, _c, _d, _e, F2, 20,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _e, _a, _b, _c, _d, F2, 21,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _d, _e, _a, _b, _c, F2, 22,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _c, _d, _e, _a, _b, F2, 23,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+
+#undef curK
+#define curK qK3
+  _R( _b, _c, _d, _e, _a, F2, 24,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _a, _b, _c, _d, _e, F2, 25,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _e, _a, _b, _c, _d, F2, 26,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _d, _e, _a, _b, _c, F2, 27,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+
+  _R( _c, _d, _e, _a, _b, F2, 28,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _b, _c, _d, _e, _a, F2, 29,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _a, _b, _c, _d, _e, F2, 30,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _e, _a, _b, _c, _d, F2, 31,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+
+  _R( _d, _e, _a, _b, _c, F2, 32,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+  _R( _c, _d, _e, _a, _b, F2, 33,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+  _R( _b, _c, _d, _e, _a, F2, 34,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+  _R( _a, _b, _c, _d, _e, F2, 35,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+
+  _R( _e, _a, _b, _c, _d, F2, 36,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+  _R( _d, _e, _a, _b, _c, F2, 37,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+  _R( _c, _d, _e, _a, _b, F2, 38,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+  _R( _b, _c, _d, _e, _a, F2, 39,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+
+  _R( _a, _b, _c, _d, _e, F3, 40,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+  _R( _e, _a, _b, _c, _d, F3, 41,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+  _R( _d, _e, _a, _b, _c, F3, 42,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+  _R( _c, _d, _e, _a, _b, F3, 43,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+
+#undef curK
+#define curK qK4
+  _R( _b, _c, _d, _e, _a, F3, 44,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+  _R( _a, _b, _c, _d, _e, F3, 45,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+  _R( _e, _a, _b, _c, _d, F3, 46,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+  _R( _d, _e, _a, _b, _c, F3, 47,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+
+  _R( _c, _d, _e, _a, _b, F3, 48,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _b, _c, _d, _e, _a, F3, 49,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _a, _b, _c, _d, _e, F3, 50,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _e, _a, _b, _c, _d, F3, 51,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+
+  _R( _d, _e, _a, _b, _c, F3, 52,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _c, _d, _e, _a, _b, F3, 53,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _b, _c, _d, _e, _a, F3, 54,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _a, _b, _c, _d, _e, F3, 55,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+
+  _R( _e, _a, _b, _c, _d, F3, 56,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _d, _e, _a, _b, _c, F3, 57,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _c, _d, _e, _a, _b, F3, 58,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _b, _c, _d, _e, _a, F3, 59,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+
+  subs RNBLKS, #1;
+
+  _R( _a, _b, _c, _d, _e, F4, 60,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _e, _a, _b, _c, _d, F4, 61,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _d, _e, _a, _b, _c, F4, 62,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _c, _d, _e, _a, _b, F4, 63,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+
+  beq .Lend;
+
+  /* Transform 64-79 + Precalc 0-15 of next block. */
+#undef curK
+#define curK qK1
+  _R( _b, _c, _d, _e, _a, F4, 64,
+      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F4, 65,
+      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F4, 66,
+      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F4, 67,
+      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+  _R( _c, _d, _e, _a, _b, F4, 68,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F4, 69,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F4, 70,
+      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F4, 71,
+      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+  _R( _d, _e, _a, _b, _c, F4, 72,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F4, 73,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F4, 74,
+      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F4, 75,
+      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+  _R( _e, _a, _b, _c, _d, F4, 76,
+      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F4, 77,
+      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F4, 78,
+      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F4, 79,
+      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
+
+  /* Update the chaining variables. */
+  ldm RSTATE, {RT0-RT3};
+  add _a, RT0;
+  ldr RT0, [RSTATE, #state_h4];
+  add _b, RT1;
+  add _c, RT2;
+  add _d, RT3;
+  add _e, RT0;
+  stm RSTATE, {_a-_e};
+
+  b .Loop;
+
+.Lend:
+  /* Transform 64-79 */
+  R( _b, _c, _d, _e, _a, F4, 64 );
+  R( _a, _b, _c, _d, _e, F4, 65 );
+  R( _e, _a, _b, _c, _d, F4, 66 );
+  R( _d, _e, _a, _b, _c, F4, 67 );
+  R( _c, _d, _e, _a, _b, F4, 68 );
+  R( _b, _c, _d, _e, _a, F4, 69 );
+  R( _a, _b, _c, _d, _e, F4, 70 );
+  R( _e, _a, _b, _c, _d, F4, 71 );
+  R( _d, _e, _a, _b, _c, F4, 72 );
+  R( _c, _d, _e, _a, _b, F4, 73 );
+  R( _b, _c, _d, _e, _a, F4, 74 );
+  R( _a, _b, _c, _d, _e, F4, 75 );
+  R( _e, _a, _b, _c, _d, F4, 76 );
+  R( _d, _e, _a, _b, _c, F4, 77 );
+  R( _c, _d, _e, _a, _b, F4, 78 );
+  R( _b, _c, _d, _e, _a, F4, 79 );
+
+  mov sp, ROLDSTACK;
+
+  /* Update the chaining variables. */
+  ldm RSTATE, {RT0-RT3};
+  add _a, RT0;
+  ldr RT0, [RSTATE, #state_h4];
+  add _b, RT1;
+  add _c, RT2;
+  add _d, RT3;
+  /*vpop {q4-q7};*/
+  add _e, RT0;
+  stm RSTATE, {_a-_e};
+
+  pop {r4-r12, pc};
+
+.Ldo_nothing:
+  bx lr
+ENDPROC(sha1_transform_neon)
diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S
new file mode 100644 (file)
index 0000000..2de40dd
--- /dev/null
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       .text
+       .arch           armv8-a
+       .fpu            crypto-neon-fp-armv8
+
+       k0              .req    q0
+       k1              .req    q1
+       k2              .req    q2
+       k3              .req    q3
+
+       ta0             .req    q4
+       ta1             .req    q5
+       tb0             .req    q5
+       tb1             .req    q4
+
+       dga             .req    q6
+       dgb             .req    q7
+       dgbs            .req    s28
+
+       dg0             .req    q12
+       dg1a0           .req    q13
+       dg1a1           .req    q14
+       dg1b0           .req    q14
+       dg1b1           .req    q13
+
+       .macro          add_only, op, ev, rc, s0, dg1
+       .ifnb           \s0
+       vadd.u32        tb\ev, q\s0, \rc
+       .endif
+       sha1h.32        dg1b\ev, dg0
+       .ifb            \dg1
+       sha1\op\().32   dg0, dg1a\ev, ta\ev
+       .else
+       sha1\op\().32   dg0, \dg1, ta\ev
+       .endif
+       .endm
+
+       .macro          add_update, op, ev, rc, s0, s1, s2, s3, dg1
+       sha1su0.32      q\s0, q\s1, q\s2
+       add_only        \op, \ev, \rc, \s1, \dg1
+       sha1su1.32      q\s0, q\s3
+       .endm
+
+       .align          6
+.Lsha1_rcon:
+       .word           0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999
+       .word           0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1
+       .word           0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc
+       .word           0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6
+
+       /*
+        * void sha1_ce_transform(struct sha1_block_state *state,
+        *                        const u8 *data, size_t nblocks);
+        */
+ENTRY(sha1_ce_transform)
+       /* load round constants */
+       adr             ip, .Lsha1_rcon
+       vld1.32         {k0-k1}, [ip, :128]!
+       vld1.32         {k2-k3}, [ip, :128]
+
+       /* load state */
+       vld1.32         {dga}, [r0]
+       vldr            dgbs, [r0, #16]
+
+       /* load input */
+0:     vld1.32         {q8-q9}, [r1]!
+       vld1.32         {q10-q11}, [r1]!
+       subs            r2, r2, #1
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+       vrev32.8        q8, q8
+       vrev32.8        q9, q9
+       vrev32.8        q10, q10
+       vrev32.8        q11, q11
+#endif
+
+       vadd.u32        ta0, q8, k0
+       vmov            dg0, dga
+
+       add_update      c, 0, k0,  8,  9, 10, 11, dgb
+       add_update      c, 1, k0,  9, 10, 11,  8
+       add_update      c, 0, k0, 10, 11,  8,  9
+       add_update      c, 1, k0, 11,  8,  9, 10
+       add_update      c, 0, k1,  8,  9, 10, 11
+
+       add_update      p, 1, k1,  9, 10, 11,  8
+       add_update      p, 0, k1, 10, 11,  8,  9
+       add_update      p, 1, k1, 11,  8,  9, 10
+       add_update      p, 0, k1,  8,  9, 10, 11
+       add_update      p, 1, k2,  9, 10, 11,  8
+
+       add_update      m, 0, k2, 10, 11,  8,  9
+       add_update      m, 1, k2, 11,  8,  9, 10
+       add_update      m, 0, k2,  8,  9, 10, 11
+       add_update      m, 1, k2,  9, 10, 11,  8
+       add_update      m, 0, k3, 10, 11,  8,  9
+
+       add_update      p, 1, k3, 11,  8,  9, 10
+       add_only        p, 0, k3,  9
+       add_only        p, 1, k3, 10
+       add_only        p, 0, k3, 11
+       add_only        p, 1
+
+       /* update state */
+       vadd.u32        dga, dga, dg0
+       vadd.u32        dgb, dgb, dg1a0
+       bne             0b
+
+       /* store new state */
+       vst1.32         {dga}, [r0]
+       vstr            dgbs, [r0, #16]
+       bx              lr
+ENDPROC(sha1_ce_transform)
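Editor's note: for orientation, the c/p/m arguments passed to the add_update/add_only macros above select the sha1c ("choose"), sha1p ("parity") and sha1m ("majority") instructions, matching the round functions and the four constants in .Lsha1_rcon that plain SHA-1 uses for rounds 0-19, 20-39, 40-59 and 60-79. A reference-only C sketch of that schedule (standard FIPS 180-4 SHA-1, not code from this patch):

/*
 * Reference sketch of the SHA-1 round-function schedule that the k0-k3
 * constants and the c/p/m macro arguments correspond to.
 */
#include <stdint.h>

static uint32_t sha1_f(int round, uint32_t b, uint32_t c, uint32_t d)
{
	if (round < 20)
		return (b & c) | (~b & d);              /* "choose"   -> sha1c, K = 0x5a827999 */
	if (round < 40)
		return b ^ c ^ d;                       /* "parity"   -> sha1p, K = 0x6ed9eba1 */
	if (round < 60)
		return (b & c) | (b & d) | (c & d);     /* "majority" -> sha1m, K = 0x8f1bbcdc */
	return b ^ c ^ d;                               /* "parity"   -> sha1p, K = 0xca62c1d6 */
}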
diff --git a/lib/crypto/arm/sha1.h b/lib/crypto/arm/sha1.h
new file mode 100644 (file)
index 0000000..fa1e924
--- /dev/null
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * SHA-1 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+asmlinkage void sha1_block_data_order(struct sha1_block_state *state,
+                                     const u8 *data, size_t nblocks);
+asmlinkage void sha1_transform_neon(struct sha1_block_state *state,
+                                   const u8 *data, size_t nblocks);
+asmlinkage void sha1_ce_transform(struct sha1_block_state *state,
+                                 const u8 *data, size_t nblocks);
+
+static void sha1_blocks(struct sha1_block_state *state,
+                       const u8 *data, size_t nblocks)
+{
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+           static_branch_likely(&have_neon) && likely(may_use_simd())) {
+               kernel_neon_begin();
+               if (static_branch_likely(&have_ce))
+                       sha1_ce_transform(state, data, nblocks);
+               else
+                       sha1_transform_neon(state, data, nblocks);
+               kernel_neon_end();
+       } else {
+               sha1_block_data_order(state, data, nblocks);
+       }
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha1_mod_init_arch sha1_mod_init_arch
+static inline void sha1_mod_init_arch(void)
+{
+       if (elf_hwcap & HWCAP_NEON) {
+               static_branch_enable(&have_neon);
+               if (elf_hwcap2 & HWCAP2_SHA1)
+                       static_branch_enable(&have_ce);
+       }
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
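Editor's note: sha1_blocks() consumes whole 64-byte blocks only; buffering a partial tail and applying the final padding remain the job of the generic SHA-1 library code, which is outside this diff. A minimal illustrative caller honoring that contract, assuming <crypto/sha1.h> provides SHA1_BLOCK_SIZE and struct sha1_block_state:

/*
 * Sketch only: feed sha1_blocks() with full 64-byte blocks; any remainder
 * must be buffered by the caller (the generic library does this) until a
 * complete block is available.
 */
#include <crypto/sha1.h>

static void sha1_consume(struct sha1_block_state *state,
			 const u8 *data, size_t len)
{
	size_t nblocks = len / SHA1_BLOCK_SIZE;

	if (nblocks)
		sha1_blocks(state, data, nblocks);
	/* len % SHA1_BLOCK_SIZE bytes are left for the caller to buffer */
}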