lib/crypto: arm/sha512: Migrate optimized SHA-512 code to library
author     Eric Biggers <ebiggers@kernel.org>   Mon, 30 Jun 2025 16:03:11 +0000 (09:03 -0700)
committer  Eric Biggers <ebiggers@kernel.org>   Mon, 30 Jun 2025 16:26:19 +0000 (09:26 -0700)
Instead of exposing the arm-optimized SHA-512 code via arm-specific
crypto_shash algorithms, just implement the sha512_blocks() library
function.  This is much simpler, makes the SHA-512 (and SHA-384)
library functions arm-optimized, and fixes the longstanding issue
where the arm-optimized SHA-512 code was disabled by default.  SHA-512
remains available through crypto_shash, but individual architectures no
longer need to handle it.
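
As an aside, here is a rough sketch of how a caller now reaches the
optimized code: through the generic library API instead of an
arm-specific crypto_shash.  This is illustrative only; example_hash()
is hypothetical, and it assumes the sha512() one-shot helper declared
in <crypto/sha2.h>.

	#include <crypto/sha2.h>

	/* Hypothetical caller: hash a buffer with the library API.  On arm,
	 * sha512() now ends up in sha512_blocks() and hence in the assembly
	 * migrated by this patch; the caller needs no arch-specific code. */
	static void example_hash(const u8 *buf, size_t len)
	{
		u8 digest[SHA512_DIGEST_SIZE];

		sha512(buf, len, digest);
		/* ... use digest ... */
	}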

To match sha512_blocks(), change the type of the nblocks parameter of
the assembly functions from int to size_t.  The assembly functions
already treated it as size_t.
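
Concretely, the declaration changes as follows (both sides are taken
from this diff; note that the context argument also moves to the
library's struct sha512_block_state):

	/* Before, in arch/arm/crypto/sha512-glue.c (deleted below): */
	asmlinkage void sha512_block_data_order(struct sha512_state *state,
						u8 const *src, int blocks);

	/* After, in lib/crypto/arm/sha512.h (added below): */
	asmlinkage void sha512_block_data_order(struct sha512_block_state *state,
						const u8 *data, size_t nblocks);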

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250630160320.2888-8-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
16 files changed:
arch/arm/configs/exynos_defconfig
arch/arm/configs/milbeaut_m10v_defconfig
arch/arm/configs/multi_v7_defconfig
arch/arm/configs/omap2plus_defconfig
arch/arm/configs/pxa_defconfig
arch/arm/crypto/Kconfig
arch/arm/crypto/Makefile
arch/arm/crypto/sha512-armv4.pl [deleted file]
arch/arm/crypto/sha512-glue.c [deleted file]
arch/arm/crypto/sha512-neon-glue.c [deleted file]
arch/arm/crypto/sha512.h [deleted file]
lib/crypto/Kconfig
lib/crypto/Makefile
lib/crypto/arm/.gitignore [new file with mode: 0644]
lib/crypto/arm/sha512-armv4.pl [new file with mode: 0644]
lib/crypto/arm/sha512.h [new file with mode: 0644]

diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index f71af368674cf13609635e7c37165bab8116320f..d58e300693045ad96d5236e5141da3ec417e4fd5 100644
@@ -364,7 +364,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
 CONFIG_CRYPTO_CHACHA20_NEON=m
 CONFIG_CRYPTO_DEV_EXYNOS_RNG=y
diff --git a/arch/arm/configs/milbeaut_m10v_defconfig b/arch/arm/configs/milbeaut_m10v_defconfig
index 242e7d5a3f6820405d1dc5be0ce1baf3420bdb0d..8ebf8bd872fe8a3693ccd507eb52a4cdb91eedab 100644
@@ -100,7 +100,6 @@ CONFIG_CRYPTO_SEQIV=m
 CONFIG_CRYPTO_GHASH_ARM_CE=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
 CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
 CONFIG_CRYPTO_AES_ARM_CE=m
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index 50c170b4619f72db1933f7b692c4655e2a9130f0..3fd07e864ca8554e29c11e8e86bb320c8b8e3afc 100644
@@ -1282,7 +1282,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_GHASH_ARM_CE=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
 CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
 CONFIG_CRYPTO_AES_ARM_CE=m
diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig
index 9f9780c8e62aadea51de9546805ae5d7c28f8cde..530dfb8338c98e6d2eb4936fb533f21cb56d8298 100644
@@ -705,7 +705,6 @@ CONFIG_SECURITY=y
 CONFIG_CRYPTO_MICHAEL_MIC=y
 CONFIG_CRYPTO_GHASH_ARM_CE=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
 CONFIG_CRYPTO_CHACHA20_NEON=m
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index ff29c5b0e9c93670dda5f13481dac08f26ed3b5e..eaa44574d4a64603811f5983e509a4a7bd44c19e 100644
@@ -659,7 +659,6 @@ CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_LZO=y
 CONFIG_CRYPTO_SHA1_ARM=m
-CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_FONTS=y
 CONFIG_FONT_8x8=y
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 7efb9a8596e4e58302144c4b16cfa90f9afac465..a18f97f1597cbe7b46bfa9aed92b755862d65561 100644
@@ -93,16 +93,6 @@ config CRYPTO_SHA1_ARM_CE
 
          Architecture: arm using ARMv8 Crypto Extensions
 
-config CRYPTO_SHA512_ARM
-       tristate "Hash functions: SHA-384 and SHA-512 (NEON)"
-       select CRYPTO_HASH
-       depends on !CPU_V7M
-       help
-         SHA-384 and SHA-512 secure hash algorithms (FIPS 180)
-
-         Architecture: arm using
-         - NEON (Advanced SIMD) extensions
-
 config CRYPTO_AES_ARM
        tristate "Ciphers: AES"
        select CRYPTO_ALGAPI
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 8479137c6e80022766342694d4ee9dbca12205f4..78a4042d8761c14baf0fbfa85560e8262b35cd0c 100644
@@ -7,7 +7,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
-obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
@@ -20,23 +19,9 @@ aes-arm-y    := aes-cipher-core.o aes-cipher-glue.o
 aes-arm-bs-y   := aes-neonbs-core.o aes-neonbs-glue.o
 sha1-arm-y     := sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y        := sha1-armv7-neon.o sha1_neon_glue.o
-sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
-sha512-arm-y   := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
 blake2b-neon-y  := blake2b-neon-core.o blake2b-neon-glue.o
 sha1-arm-ce-y  := sha1-ce-core.o sha1-ce-glue.o
 aes-arm-ce-y   := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
 curve25519-neon-y := curve25519-core.o curve25519-glue.o
-
-quiet_cmd_perl = PERL    $@
-      cmd_perl = $(PERL) $(<) > $(@)
-
-$(obj)/%-core.S: $(src)/%-armv4.pl
-       $(call cmd,perl)
-
-clean-files += sha512-core.S
-
-aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
-
-AFLAGS_sha512-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl
deleted file mode 100644
index 2fc3516..0000000
+++ /dev/null
@@ -1,657 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# SHA512 block procedure for ARMv4. September 2007.
-
-# This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
-# Xscale PXA250 core].
-#
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 6% improvement on
-# Cortex A8 core and ~40 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 7%
-# improvement on Coxtex A8 core and ~38 cycles per byte.
-
-# March 2011.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process
-# one byte in 23.3 cycles or ~60% faster than integer-only code.
-
-# August 2012.
-#
-# Improve NEON performance by 12% on Snapdragon S4. In absolute
-# terms it's 22.6 cycles per byte, which is disappointing result.
-# Technical writers asserted that 3-way S4 pipeline can sustain
-# multiple NEON instructions per cycle, but dual NEON issue could
-# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
-# for further details. On side note Cortex-A15 processes one byte in
-# 16 cycles.
-
-# Byte order [in]dependence. =========================================
-#
-# Originally caller was expected to maintain specific *dword* order in
-# h[0-7], namely with most significant dword at *lower* address, which
-# was reflected in below two parameters as 0 and 4. Now caller is
-# expected to maintain native byte order for whole 64-bit values.
-$hi="HI";
-$lo="LO";
-# ====================================================================
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$ctx="r0";     # parameter block
-$inp="r1";
-$len="r2";
-
-$Tlo="r3";
-$Thi="r4";
-$Alo="r5";
-$Ahi="r6";
-$Elo="r7";
-$Ehi="r8";
-$t0="r9";
-$t1="r10";
-$t2="r11";
-$t3="r12";
-############   r13 is stack pointer
-$Ktbl="r14";
-############   r15 is program counter
-
-$Aoff=8*0;
-$Boff=8*1;
-$Coff=8*2;
-$Doff=8*3;
-$Eoff=8*4;
-$Foff=8*5;
-$Goff=8*6;
-$Hoff=8*7;
-$Xoff=8*8;
-
-sub BODY_00_15() {
-my $magic = shift;
-$code.=<<___;
-       @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
-       @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
-       @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
-       mov     $t0,$Elo,lsr#14
-       str     $Tlo,[sp,#$Xoff+0]
-       mov     $t1,$Ehi,lsr#14
-       str     $Thi,[sp,#$Xoff+4]
-       eor     $t0,$t0,$Ehi,lsl#18
-       ldr     $t2,[sp,#$Hoff+0]       @ h.lo
-       eor     $t1,$t1,$Elo,lsl#18
-       ldr     $t3,[sp,#$Hoff+4]       @ h.hi
-       eor     $t0,$t0,$Elo,lsr#18
-       eor     $t1,$t1,$Ehi,lsr#18
-       eor     $t0,$t0,$Ehi,lsl#14
-       eor     $t1,$t1,$Elo,lsl#14
-       eor     $t0,$t0,$Ehi,lsr#9
-       eor     $t1,$t1,$Elo,lsr#9
-       eor     $t0,$t0,$Elo,lsl#23
-       eor     $t1,$t1,$Ehi,lsl#23     @ Sigma1(e)
-       adds    $Tlo,$Tlo,$t0
-       ldr     $t0,[sp,#$Foff+0]       @ f.lo
-       adc     $Thi,$Thi,$t1           @ T += Sigma1(e)
-       ldr     $t1,[sp,#$Foff+4]       @ f.hi
-       adds    $Tlo,$Tlo,$t2
-       ldr     $t2,[sp,#$Goff+0]       @ g.lo
-       adc     $Thi,$Thi,$t3           @ T += h
-       ldr     $t3,[sp,#$Goff+4]       @ g.hi
-
-       eor     $t0,$t0,$t2
-       str     $Elo,[sp,#$Eoff+0]
-       eor     $t1,$t1,$t3
-       str     $Ehi,[sp,#$Eoff+4]
-       and     $t0,$t0,$Elo
-       str     $Alo,[sp,#$Aoff+0]
-       and     $t1,$t1,$Ehi
-       str     $Ahi,[sp,#$Aoff+4]
-       eor     $t0,$t0,$t2
-       ldr     $t2,[$Ktbl,#$lo]        @ K[i].lo
-       eor     $t1,$t1,$t3             @ Ch(e,f,g)
-       ldr     $t3,[$Ktbl,#$hi]        @ K[i].hi
-
-       adds    $Tlo,$Tlo,$t0
-       ldr     $Elo,[sp,#$Doff+0]      @ d.lo
-       adc     $Thi,$Thi,$t1           @ T += Ch(e,f,g)
-       ldr     $Ehi,[sp,#$Doff+4]      @ d.hi
-       adds    $Tlo,$Tlo,$t2
-       and     $t0,$t2,#0xff
-       adc     $Thi,$Thi,$t3           @ T += K[i]
-       adds    $Elo,$Elo,$Tlo
-       ldr     $t2,[sp,#$Boff+0]       @ b.lo
-       adc     $Ehi,$Ehi,$Thi          @ d += T
-       teq     $t0,#$magic
-
-       ldr     $t3,[sp,#$Coff+0]       @ c.lo
-#if __ARM_ARCH__>=7
-       it      eq                      @ Thumb2 thing, sanity check in ARM
-#endif
-       orreq   $Ktbl,$Ktbl,#1
-       @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
-       @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
-       @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
-       mov     $t0,$Alo,lsr#28
-       mov     $t1,$Ahi,lsr#28
-       eor     $t0,$t0,$Ahi,lsl#4
-       eor     $t1,$t1,$Alo,lsl#4
-       eor     $t0,$t0,$Ahi,lsr#2
-       eor     $t1,$t1,$Alo,lsr#2
-       eor     $t0,$t0,$Alo,lsl#30
-       eor     $t1,$t1,$Ahi,lsl#30
-       eor     $t0,$t0,$Ahi,lsr#7
-       eor     $t1,$t1,$Alo,lsr#7
-       eor     $t0,$t0,$Alo,lsl#25
-       eor     $t1,$t1,$Ahi,lsl#25     @ Sigma0(a)
-       adds    $Tlo,$Tlo,$t0
-       and     $t0,$Alo,$t2
-       adc     $Thi,$Thi,$t1           @ T += Sigma0(a)
-
-       ldr     $t1,[sp,#$Boff+4]       @ b.hi
-       orr     $Alo,$Alo,$t2
-       ldr     $t2,[sp,#$Coff+4]       @ c.hi
-       and     $Alo,$Alo,$t3
-       and     $t3,$Ahi,$t1
-       orr     $Ahi,$Ahi,$t1
-       orr     $Alo,$Alo,$t0           @ Maj(a,b,c).lo
-       and     $Ahi,$Ahi,$t2
-       adds    $Alo,$Alo,$Tlo
-       orr     $Ahi,$Ahi,$t3           @ Maj(a,b,c).hi
-       sub     sp,sp,#8
-       adc     $Ahi,$Ahi,$Thi          @ h += T
-       tst     $Ktbl,#1
-       add     $Ktbl,$Ktbl,#8
-___
-}
-$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-# define VFP_ABI_PUSH  vstmdb  sp!,{d8-d15}
-# define VFP_ABI_POP   vldmia  sp!,{d8-d15}
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-# define VFP_ABI_PUSH
-# define VFP_ABI_POP
-#endif
-
-#ifdef __ARMEL__
-# define LO 0
-# define HI 4
-# define WORD64(hi0,lo0,hi1,lo1)       .word   lo0,hi0, lo1,hi1
-#else
-# define HI 0
-# define LO 4
-# define WORD64(hi0,lo0,hi1,lo1)       .word   hi0,lo0, hi1,lo1
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code  32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
-.code   32
-# endif
-#endif
-
-.type  K512,%object
-.align 5
-K512:
-WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
-WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
-WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
-WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
-WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
-WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
-WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
-WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
-WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
-WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
-WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
-WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
-WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
-WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
-WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
-WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
-WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
-WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
-WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
-WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
-WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
-WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
-WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
-WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
-WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
-WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
-WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
-WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
-WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
-WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
-WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
-WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
-WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
-WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
-WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
-WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
-WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
-WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
-WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
-WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
-.size  K512,.-K512
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word  OPENSSL_armcap_P-sha512_block_data_order
-.skip  32-4
-#else
-.skip  32
-#endif
-
-.global        sha512_block_data_order
-.type  sha512_block_data_order,%function
-sha512_block_data_order:
-.Lsha512_block_data_order:
-#if __ARM_ARCH__<7
-       sub     r3,pc,#8                @ sha512_block_data_order
-#else
-       adr     r3,.Lsha512_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-       ldr     r12,.LOPENSSL_armcap
-       ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
-       tst     r12,#1
-       bne     .LNEON
-#endif
-       add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
-       stmdb   sp!,{r4-r12,lr}
-       sub     $Ktbl,r3,#672           @ K512
-       sub     sp,sp,#9*8
-
-       ldr     $Elo,[$ctx,#$Eoff+$lo]
-       ldr     $Ehi,[$ctx,#$Eoff+$hi]
-       ldr     $t0, [$ctx,#$Goff+$lo]
-       ldr     $t1, [$ctx,#$Goff+$hi]
-       ldr     $t2, [$ctx,#$Hoff+$lo]
-       ldr     $t3, [$ctx,#$Hoff+$hi]
-.Loop:
-       str     $t0, [sp,#$Goff+0]
-       str     $t1, [sp,#$Goff+4]
-       str     $t2, [sp,#$Hoff+0]
-       str     $t3, [sp,#$Hoff+4]
-       ldr     $Alo,[$ctx,#$Aoff+$lo]
-       ldr     $Ahi,[$ctx,#$Aoff+$hi]
-       ldr     $Tlo,[$ctx,#$Boff+$lo]
-       ldr     $Thi,[$ctx,#$Boff+$hi]
-       ldr     $t0, [$ctx,#$Coff+$lo]
-       ldr     $t1, [$ctx,#$Coff+$hi]
-       ldr     $t2, [$ctx,#$Doff+$lo]
-       ldr     $t3, [$ctx,#$Doff+$hi]
-       str     $Tlo,[sp,#$Boff+0]
-       str     $Thi,[sp,#$Boff+4]
-       str     $t0, [sp,#$Coff+0]
-       str     $t1, [sp,#$Coff+4]
-       str     $t2, [sp,#$Doff+0]
-       str     $t3, [sp,#$Doff+4]
-       ldr     $Tlo,[$ctx,#$Foff+$lo]
-       ldr     $Thi,[$ctx,#$Foff+$hi]
-       str     $Tlo,[sp,#$Foff+0]
-       str     $Thi,[sp,#$Foff+4]
-
-.L00_15:
-#if __ARM_ARCH__<7
-       ldrb    $Tlo,[$inp,#7]
-       ldrb    $t0, [$inp,#6]
-       ldrb    $t1, [$inp,#5]
-       ldrb    $t2, [$inp,#4]
-       ldrb    $Thi,[$inp,#3]
-       ldrb    $t3, [$inp,#2]
-       orr     $Tlo,$Tlo,$t0,lsl#8
-       ldrb    $t0, [$inp,#1]
-       orr     $Tlo,$Tlo,$t1,lsl#16
-       ldrb    $t1, [$inp],#8
-       orr     $Tlo,$Tlo,$t2,lsl#24
-       orr     $Thi,$Thi,$t3,lsl#8
-       orr     $Thi,$Thi,$t0,lsl#16
-       orr     $Thi,$Thi,$t1,lsl#24
-#else
-       ldr     $Tlo,[$inp,#4]
-       ldr     $Thi,[$inp],#8
-#ifdef __ARMEL__
-       rev     $Tlo,$Tlo
-       rev     $Thi,$Thi
-#endif
-#endif
-___
-       &BODY_00_15(0x94);
-$code.=<<___;
-       tst     $Ktbl,#1
-       beq     .L00_15
-       ldr     $t0,[sp,#`$Xoff+8*(16-1)`+0]
-       ldr     $t1,[sp,#`$Xoff+8*(16-1)`+4]
-       bic     $Ktbl,$Ktbl,#1
-.L16_79:
-       @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
-       @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
-       @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
-       mov     $Tlo,$t0,lsr#1
-       ldr     $t2,[sp,#`$Xoff+8*(16-14)`+0]
-       mov     $Thi,$t1,lsr#1
-       ldr     $t3,[sp,#`$Xoff+8*(16-14)`+4]
-       eor     $Tlo,$Tlo,$t1,lsl#31
-       eor     $Thi,$Thi,$t0,lsl#31
-       eor     $Tlo,$Tlo,$t0,lsr#8
-       eor     $Thi,$Thi,$t1,lsr#8
-       eor     $Tlo,$Tlo,$t1,lsl#24
-       eor     $Thi,$Thi,$t0,lsl#24
-       eor     $Tlo,$Tlo,$t0,lsr#7
-       eor     $Thi,$Thi,$t1,lsr#7
-       eor     $Tlo,$Tlo,$t1,lsl#25
-
-       @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
-       @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
-       @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
-       mov     $t0,$t2,lsr#19
-       mov     $t1,$t3,lsr#19
-       eor     $t0,$t0,$t3,lsl#13
-       eor     $t1,$t1,$t2,lsl#13
-       eor     $t0,$t0,$t3,lsr#29
-       eor     $t1,$t1,$t2,lsr#29
-       eor     $t0,$t0,$t2,lsl#3
-       eor     $t1,$t1,$t3,lsl#3
-       eor     $t0,$t0,$t2,lsr#6
-       eor     $t1,$t1,$t3,lsr#6
-       ldr     $t2,[sp,#`$Xoff+8*(16-9)`+0]
-       eor     $t0,$t0,$t3,lsl#26
-
-       ldr     $t3,[sp,#`$Xoff+8*(16-9)`+4]
-       adds    $Tlo,$Tlo,$t0
-       ldr     $t0,[sp,#`$Xoff+8*16`+0]
-       adc     $Thi,$Thi,$t1
-
-       ldr     $t1,[sp,#`$Xoff+8*16`+4]
-       adds    $Tlo,$Tlo,$t2
-       adc     $Thi,$Thi,$t3
-       adds    $Tlo,$Tlo,$t0
-       adc     $Thi,$Thi,$t1
-___
-       &BODY_00_15(0x17);
-$code.=<<___;
-#if __ARM_ARCH__>=7
-       ittt    eq                      @ Thumb2 thing, sanity check in ARM
-#endif
-       ldreq   $t0,[sp,#`$Xoff+8*(16-1)`+0]
-       ldreq   $t1,[sp,#`$Xoff+8*(16-1)`+4]
-       beq     .L16_79
-       bic     $Ktbl,$Ktbl,#1
-
-       ldr     $Tlo,[sp,#$Boff+0]
-       ldr     $Thi,[sp,#$Boff+4]
-       ldr     $t0, [$ctx,#$Aoff+$lo]
-       ldr     $t1, [$ctx,#$Aoff+$hi]
-       ldr     $t2, [$ctx,#$Boff+$lo]
-       ldr     $t3, [$ctx,#$Boff+$hi]
-       adds    $t0,$Alo,$t0
-       str     $t0, [$ctx,#$Aoff+$lo]
-       adc     $t1,$Ahi,$t1
-       str     $t1, [$ctx,#$Aoff+$hi]
-       adds    $t2,$Tlo,$t2
-       str     $t2, [$ctx,#$Boff+$lo]
-       adc     $t3,$Thi,$t3
-       str     $t3, [$ctx,#$Boff+$hi]
-
-       ldr     $Alo,[sp,#$Coff+0]
-       ldr     $Ahi,[sp,#$Coff+4]
-       ldr     $Tlo,[sp,#$Doff+0]
-       ldr     $Thi,[sp,#$Doff+4]
-       ldr     $t0, [$ctx,#$Coff+$lo]
-       ldr     $t1, [$ctx,#$Coff+$hi]
-       ldr     $t2, [$ctx,#$Doff+$lo]
-       ldr     $t3, [$ctx,#$Doff+$hi]
-       adds    $t0,$Alo,$t0
-       str     $t0, [$ctx,#$Coff+$lo]
-       adc     $t1,$Ahi,$t1
-       str     $t1, [$ctx,#$Coff+$hi]
-       adds    $t2,$Tlo,$t2
-       str     $t2, [$ctx,#$Doff+$lo]
-       adc     $t3,$Thi,$t3
-       str     $t3, [$ctx,#$Doff+$hi]
-
-       ldr     $Tlo,[sp,#$Foff+0]
-       ldr     $Thi,[sp,#$Foff+4]
-       ldr     $t0, [$ctx,#$Eoff+$lo]
-       ldr     $t1, [$ctx,#$Eoff+$hi]
-       ldr     $t2, [$ctx,#$Foff+$lo]
-       ldr     $t3, [$ctx,#$Foff+$hi]
-       adds    $Elo,$Elo,$t0
-       str     $Elo,[$ctx,#$Eoff+$lo]
-       adc     $Ehi,$Ehi,$t1
-       str     $Ehi,[$ctx,#$Eoff+$hi]
-       adds    $t2,$Tlo,$t2
-       str     $t2, [$ctx,#$Foff+$lo]
-       adc     $t3,$Thi,$t3
-       str     $t3, [$ctx,#$Foff+$hi]
-
-       ldr     $Alo,[sp,#$Goff+0]
-       ldr     $Ahi,[sp,#$Goff+4]
-       ldr     $Tlo,[sp,#$Hoff+0]
-       ldr     $Thi,[sp,#$Hoff+4]
-       ldr     $t0, [$ctx,#$Goff+$lo]
-       ldr     $t1, [$ctx,#$Goff+$hi]
-       ldr     $t2, [$ctx,#$Hoff+$lo]
-       ldr     $t3, [$ctx,#$Hoff+$hi]
-       adds    $t0,$Alo,$t0
-       str     $t0, [$ctx,#$Goff+$lo]
-       adc     $t1,$Ahi,$t1
-       str     $t1, [$ctx,#$Goff+$hi]
-       adds    $t2,$Tlo,$t2
-       str     $t2, [$ctx,#$Hoff+$lo]
-       adc     $t3,$Thi,$t3
-       str     $t3, [$ctx,#$Hoff+$hi]
-
-       add     sp,sp,#640
-       sub     $Ktbl,$Ktbl,#640
-
-       teq     $inp,$len
-       bne     .Loop
-
-       add     sp,sp,#8*9              @ destroy frame
-#if __ARM_ARCH__>=5
-       ldmia   sp!,{r4-r12,pc}
-#else
-       ldmia   sp!,{r4-r12,lr}
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       bx      lr                      @ interoperable with Thumb ISA:-)
-#endif
-.size  sha512_block_data_order,.-sha512_block_data_order
-___
-
-{
-my @Sigma0=(28,34,39);
-my @Sigma1=(14,18,41);
-my @sigma0=(1, 8, 7);
-my @sigma1=(19,61,6);
-
-my $Ktbl="r3";
-my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
-
-my @X=map("d$_",(0..15));
-my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
-
-sub NEON_00_15() {
-my $i=shift;
-my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
-my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));  # temps
-
-$code.=<<___ if ($i<16 || $i&1);
-       vshr.u64        $t0,$e,#@Sigma1[0]      @ $i
-#if $i<16
-       vld1.64         {@X[$i%16]},[$inp]!     @ handles unaligned
-#endif
-       vshr.u64        $t1,$e,#@Sigma1[1]
-#if $i>0
-        vadd.i64       $a,$Maj                 @ h+=Maj from the past
-#endif
-       vshr.u64        $t2,$e,#@Sigma1[2]
-___
-$code.=<<___;
-       vld1.64         {$K},[$Ktbl,:64]!       @ K[i++]
-       vsli.64         $t0,$e,#`64-@Sigma1[0]`
-       vsli.64         $t1,$e,#`64-@Sigma1[1]`
-       vmov            $Ch,$e
-       vsli.64         $t2,$e,#`64-@Sigma1[2]`
-#if $i<16 && defined(__ARMEL__)
-       vrev64.8        @X[$i],@X[$i]
-#endif
-       veor            $t1,$t0
-       vbsl            $Ch,$f,$g               @ Ch(e,f,g)
-       vshr.u64        $t0,$a,#@Sigma0[0]
-       veor            $t2,$t1                 @ Sigma1(e)
-       vadd.i64        $T1,$Ch,$h
-       vshr.u64        $t1,$a,#@Sigma0[1]
-       vsli.64         $t0,$a,#`64-@Sigma0[0]`
-       vadd.i64        $T1,$t2
-       vshr.u64        $t2,$a,#@Sigma0[2]
-       vadd.i64        $K,@X[$i%16]
-       vsli.64         $t1,$a,#`64-@Sigma0[1]`
-       veor            $Maj,$a,$b
-       vsli.64         $t2,$a,#`64-@Sigma0[2]`
-       veor            $h,$t0,$t1
-       vadd.i64        $T1,$K
-       vbsl            $Maj,$c,$b              @ Maj(a,b,c)
-       veor            $h,$t2                  @ Sigma0(a)
-       vadd.i64        $d,$T1
-       vadd.i64        $Maj,$T1
-       @ vadd.i64      $h,$Maj
-___
-}
-
-sub NEON_16_79() {
-my $i=shift;
-
-if ($i&1)      { &NEON_00_15($i,@_); return; }
-
-# 2x-vectorized, therefore runs every 2nd round
-my @X=map("q$_",(0..7));                       # view @X as 128-bit vector
-my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));    # temps
-my ($d0,$d1,$d2) = map("d$_",(24..26));                # temps from NEON_00_15
-my $e=@_[4];                                   # $e from NEON_00_15
-$i /= 2;
-$code.=<<___;
-       vshr.u64        $t0,@X[($i+7)%8],#@sigma1[0]
-       vshr.u64        $t1,@X[($i+7)%8],#@sigma1[1]
-        vadd.i64       @_[0],d30                       @ h+=Maj from the past
-       vshr.u64        $s1,@X[($i+7)%8],#@sigma1[2]
-       vsli.64         $t0,@X[($i+7)%8],#`64-@sigma1[0]`
-       vext.8          $s0,@X[$i%8],@X[($i+1)%8],#8    @ X[i+1]
-       vsli.64         $t1,@X[($i+7)%8],#`64-@sigma1[1]`
-       veor            $s1,$t0
-       vshr.u64        $t0,$s0,#@sigma0[0]
-       veor            $s1,$t1                         @ sigma1(X[i+14])
-       vshr.u64        $t1,$s0,#@sigma0[1]
-       vadd.i64        @X[$i%8],$s1
-       vshr.u64        $s1,$s0,#@sigma0[2]
-       vsli.64         $t0,$s0,#`64-@sigma0[0]`
-       vsli.64         $t1,$s0,#`64-@sigma0[1]`
-       vext.8          $s0,@X[($i+4)%8],@X[($i+5)%8],#8        @ X[i+9]
-       veor            $s1,$t0
-       vshr.u64        $d0,$e,#@Sigma1[0]              @ from NEON_00_15
-       vadd.i64        @X[$i%8],$s0
-       vshr.u64        $d1,$e,#@Sigma1[1]              @ from NEON_00_15
-       veor            $s1,$t1                         @ sigma0(X[i+1])
-       vshr.u64        $d2,$e,#@Sigma1[2]              @ from NEON_00_15
-       vadd.i64        @X[$i%8],$s1
-___
-       &NEON_00_15(2*$i,@_);
-}
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.arch  armv7-a
-.fpu   neon
-
-.global        sha512_block_data_order_neon
-.type  sha512_block_data_order_neon,%function
-.align 4
-sha512_block_data_order_neon:
-.LNEON:
-       dmb                             @ errata #451034 on early Cortex A8
-       add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
-       VFP_ABI_PUSH
-       adr     $Ktbl,.Lsha512_block_data_order
-       sub     $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
-       vldmia  $ctx,{$A-$H}            @ load context
-.Loop_neon:
-___
-for($i=0;$i<16;$i++)   { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
-       mov             $cnt,#4
-.L16_79_neon:
-       subs            $cnt,#1
-___
-for(;$i<32;$i++)       { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
-       bne             .L16_79_neon
-
-        vadd.i64       $A,d30          @ h+=Maj from the past
-       vldmia          $ctx,{d24-d31}  @ load context to temp
-       vadd.i64        q8,q12          @ vectorized accumulate
-       vadd.i64        q9,q13
-       vadd.i64        q10,q14
-       vadd.i64        q11,q15
-       vstmia          $ctx,{$A-$H}    @ save context
-       teq             $inp,$len
-       sub             $Ktbl,#640      @ rewind K512
-       bne             .Loop_neon
-
-       VFP_ABI_POP
-       ret                             @ bx lr
-.size  sha512_block_data_order_neon,.-sha512_block_data_order_neon
-#endif
-___
-}
-$code.=<<___;
-.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm  OPENSSL_armcap_P,4,4
-#endif
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
-$code =~ s/\bret\b/bx  lr/gm;
-
-open SELF,$0;
-while(<SELF>) {
-       next if (/^#!/);
-       last if (!s/^#/@/ and !/^$/);
-       print;
-}
-close SELF;
-
-print $code;
-close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c
deleted file mode 100644
index f8a6480..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha512-glue.c - accelerated SHA-384/512 for ARM
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha512.h"
-
-MODULE_DESCRIPTION("Accelerated SHA-384/SHA-512 secure hash for ARM");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-MODULE_ALIAS_CRYPTO("sha384");
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha384-arm");
-MODULE_ALIAS_CRYPTO("sha512-arm");
-
-asmlinkage void sha512_block_data_order(struct sha512_state *state,
-                                       u8 const *src, int blocks);
-
-static int sha512_arm_update(struct shash_desc *desc, const u8 *data,
-                            unsigned int len)
-{
-       return sha512_base_do_update_blocks(desc, data, len,
-                                           sha512_block_data_order);
-}
-
-static int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
-                           unsigned int len, u8 *out)
-{
-       sha512_base_do_finup(desc, data, len, sha512_block_data_order);
-       return sha512_base_finish(desc, out);
-}
-
-static struct shash_alg sha512_arm_algs[] = { {
-       .init                   = sha384_base_init,
-       .update                 = sha512_arm_update,
-       .finup                  = sha512_arm_finup,
-       .descsize               = SHA512_STATE_SIZE,
-       .digestsize             = SHA384_DIGEST_SIZE,
-       .base                   = {
-               .cra_name               = "sha384",
-               .cra_driver_name        = "sha384-arm",
-               .cra_priority           = 250,
-               .cra_flags              = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-                                         CRYPTO_AHASH_ALG_FINUP_MAX,
-               .cra_blocksize          = SHA512_BLOCK_SIZE,
-               .cra_module             = THIS_MODULE,
-       }
-},  {
-       .init                   = sha512_base_init,
-       .update                 = sha512_arm_update,
-       .finup                  = sha512_arm_finup,
-       .descsize               = SHA512_STATE_SIZE,
-       .digestsize             = SHA512_DIGEST_SIZE,
-       .base                   = {
-               .cra_name               = "sha512",
-               .cra_driver_name        = "sha512-arm",
-               .cra_priority           = 250,
-               .cra_flags              = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-                                         CRYPTO_AHASH_ALG_FINUP_MAX,
-               .cra_blocksize          = SHA512_BLOCK_SIZE,
-               .cra_module             = THIS_MODULE,
-       }
-} };
-
-static int __init sha512_arm_mod_init(void)
-{
-       int err;
-
-       err = crypto_register_shashes(sha512_arm_algs,
-                                     ARRAY_SIZE(sha512_arm_algs));
-       if (err)
-               return err;
-
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
-               err = crypto_register_shashes(sha512_neon_algs,
-                                             ARRAY_SIZE(sha512_neon_algs));
-               if (err)
-                       goto err_unregister;
-       }
-       return 0;
-
-err_unregister:
-       crypto_unregister_shashes(sha512_arm_algs,
-                                 ARRAY_SIZE(sha512_arm_algs));
-
-       return err;
-}
-
-static void __exit sha512_arm_mod_fini(void)
-{
-       crypto_unregister_shashes(sha512_arm_algs,
-                                 ARRAY_SIZE(sha512_arm_algs));
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
-               crypto_unregister_shashes(sha512_neon_algs,
-                                         ARRAY_SIZE(sha512_neon_algs));
-}
-
-module_init(sha512_arm_mod_init);
-module_exit(sha512_arm_mod_fini);
diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c
deleted file mode 100644
index bd52807..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha512-neon-glue.c - accelerated SHA-384/512 for ARM NEON
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha512_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha512.h"
-
-MODULE_ALIAS_CRYPTO("sha384-neon");
-MODULE_ALIAS_CRYPTO("sha512-neon");
-
-asmlinkage void sha512_block_data_order_neon(struct sha512_state *state,
-                                            const u8 *src, int blocks);
-
-static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
-                             unsigned int len)
-{
-       int remain;
-
-       kernel_neon_begin();
-       remain = sha512_base_do_update_blocks(desc, data, len,
-                                             sha512_block_data_order_neon);
-       kernel_neon_end();
-       return remain;
-}
-
-static int sha512_neon_finup(struct shash_desc *desc, const u8 *data,
-                            unsigned int len, u8 *out)
-{
-       kernel_neon_begin();
-       sha512_base_do_finup(desc, data, len, sha512_block_data_order_neon);
-       kernel_neon_end();
-       return sha512_base_finish(desc, out);
-}
-
-struct shash_alg sha512_neon_algs[] = { {
-       .init                   = sha384_base_init,
-       .update                 = sha512_neon_update,
-       .finup                  = sha512_neon_finup,
-       .descsize               = SHA512_STATE_SIZE,
-       .digestsize             = SHA384_DIGEST_SIZE,
-       .base                   = {
-               .cra_name               = "sha384",
-               .cra_driver_name        = "sha384-neon",
-               .cra_priority           = 300,
-               .cra_flags              = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-                                         CRYPTO_AHASH_ALG_FINUP_MAX,
-               .cra_blocksize          = SHA384_BLOCK_SIZE,
-               .cra_module             = THIS_MODULE,
-
-       }
-},  {
-       .init                   = sha512_base_init,
-       .update                 = sha512_neon_update,
-       .finup                  = sha512_neon_finup,
-       .descsize               = SHA512_STATE_SIZE,
-       .digestsize             = SHA512_DIGEST_SIZE,
-       .base                   = {
-               .cra_name               = "sha512",
-               .cra_driver_name        = "sha512-neon",
-               .cra_priority           = 300,
-               .cra_flags              = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-                                         CRYPTO_AHASH_ALG_FINUP_MAX,
-               .cra_blocksize          = SHA512_BLOCK_SIZE,
-               .cra_module             = THIS_MODULE,
-       }
-} };
diff --git a/arch/arm/crypto/sha512.h b/arch/arm/crypto/sha512.h
deleted file mode 100644
index eeaee52..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-extern struct shash_alg sha512_neon_algs[2];
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index d1bee3787eb3cea03d77b30628f57d46d93cb8d2..dac6356ba0aace355879f1781885c7f9db5c2cd2 100644
@@ -177,6 +177,7 @@ config CRYPTO_LIB_SHA512
 config CRYPTO_LIB_SHA512_ARCH
        bool
        depends on CRYPTO_LIB_SHA512 && !UML
+       default y if ARM && !CPU_V7M
 
 config CRYPTO_LIB_SM3
        tristate
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index f6b6f370451ecc5926c9520a29df692948c47e55..67008a1612c6f68c0795e53c4570e9a23c459778 100644
@@ -1,5 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
 
+aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $(<) > $(@)
+
 obj-$(CONFIG_CRYPTO_LIB_UTILS)                 += libcryptoutils.o
 libcryptoutils-y                               := memneq.o utils.o
 
@@ -68,6 +73,15 @@ obj-$(CONFIG_CRYPTO_LIB_SHA512) += libsha512.o
 libsha512-y := sha512.o
 ifeq ($(CONFIG_CRYPTO_LIB_SHA512_ARCH),y)
 CFLAGS_sha512.o += -I$(src)/$(SRCARCH)
+
+ifeq ($(CONFIG_ARM),y)
+libsha512-y += arm/sha512-core.o
+$(obj)/arm/sha512-core.S: $(src)/arm/sha512-armv4.pl
+       $(call cmd,perlasm)
+clean-files += arm/sha512-core.S
+AFLAGS_arm/sha512-core.o += $(aflags-thumb2-y)
+endif
+
 endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
 
 obj-$(CONFIG_MPILIB) += mpi/
diff --git a/lib/crypto/arm/.gitignore b/lib/crypto/arm/.gitignore
new file mode 100644
index 0000000..670a4d9
--- /dev/null
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+sha512-core.S
diff --git a/lib/crypto/arm/sha512-armv4.pl b/lib/crypto/arm/sha512-armv4.pl
new file mode 100644
index 0000000..2fc3516
--- /dev/null
@@ -0,0 +1,657 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA512 block procedure for ARMv4. September 2007.
+
+# This code is ~4.5 (four and a half) times faster than code generated
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Coxtex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+# August 2012.
+#
+# Improve NEON performance by 12% on Snapdragon S4. In absolute
+# terms it's 22.6 cycles per byte, which is disappointing result.
+# Technical writers asserted that 3-way S4 pipeline can sustain
+# multiple NEON instructions per cycle, but dual NEON issue could
+# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
+# for further details. On side note Cortex-A15 processes one byte in
+# 16 cycles.
+
+# Byte order [in]dependence. =========================================
+#
+# Originally caller was expected to maintain specific *dword* order in
+# h[0-7], namely with most significant dword at *lower* address, which
+# was reflected in below two parameters as 0 and 4. Now caller is
+# expected to maintain native byte order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
+# ====================================================================
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";     # parameter block
+$inp="r1";
+$len="r2";
+
+$Tlo="r3";
+$Thi="r4";
+$Alo="r5";
+$Ahi="r6";
+$Elo="r7";
+$Ehi="r8";
+$t0="r9";
+$t1="r10";
+$t2="r11";
+$t3="r12";
+############   r13 is stack pointer
+$Ktbl="r14";
+############   r15 is program counter
+
+$Aoff=8*0;
+$Boff=8*1;
+$Coff=8*2;
+$Doff=8*3;
+$Eoff=8*4;
+$Foff=8*5;
+$Goff=8*6;
+$Hoff=8*7;
+$Xoff=8*8;
+
+sub BODY_00_15() {
+my $magic = shift;
+$code.=<<___;
+       @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+       @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+       @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+       mov     $t0,$Elo,lsr#14
+       str     $Tlo,[sp,#$Xoff+0]
+       mov     $t1,$Ehi,lsr#14
+       str     $Thi,[sp,#$Xoff+4]
+       eor     $t0,$t0,$Ehi,lsl#18
+       ldr     $t2,[sp,#$Hoff+0]       @ h.lo
+       eor     $t1,$t1,$Elo,lsl#18
+       ldr     $t3,[sp,#$Hoff+4]       @ h.hi
+       eor     $t0,$t0,$Elo,lsr#18
+       eor     $t1,$t1,$Ehi,lsr#18
+       eor     $t0,$t0,$Ehi,lsl#14
+       eor     $t1,$t1,$Elo,lsl#14
+       eor     $t0,$t0,$Ehi,lsr#9
+       eor     $t1,$t1,$Elo,lsr#9
+       eor     $t0,$t0,$Elo,lsl#23
+       eor     $t1,$t1,$Ehi,lsl#23     @ Sigma1(e)
+       adds    $Tlo,$Tlo,$t0
+       ldr     $t0,[sp,#$Foff+0]       @ f.lo
+       adc     $Thi,$Thi,$t1           @ T += Sigma1(e)
+       ldr     $t1,[sp,#$Foff+4]       @ f.hi
+       adds    $Tlo,$Tlo,$t2
+       ldr     $t2,[sp,#$Goff+0]       @ g.lo
+       adc     $Thi,$Thi,$t3           @ T += h
+       ldr     $t3,[sp,#$Goff+4]       @ g.hi
+
+       eor     $t0,$t0,$t2
+       str     $Elo,[sp,#$Eoff+0]
+       eor     $t1,$t1,$t3
+       str     $Ehi,[sp,#$Eoff+4]
+       and     $t0,$t0,$Elo
+       str     $Alo,[sp,#$Aoff+0]
+       and     $t1,$t1,$Ehi
+       str     $Ahi,[sp,#$Aoff+4]
+       eor     $t0,$t0,$t2
+       ldr     $t2,[$Ktbl,#$lo]        @ K[i].lo
+       eor     $t1,$t1,$t3             @ Ch(e,f,g)
+       ldr     $t3,[$Ktbl,#$hi]        @ K[i].hi
+
+       adds    $Tlo,$Tlo,$t0
+       ldr     $Elo,[sp,#$Doff+0]      @ d.lo
+       adc     $Thi,$Thi,$t1           @ T += Ch(e,f,g)
+       ldr     $Ehi,[sp,#$Doff+4]      @ d.hi
+       adds    $Tlo,$Tlo,$t2
+       and     $t0,$t2,#0xff
+       adc     $Thi,$Thi,$t3           @ T += K[i]
+       adds    $Elo,$Elo,$Tlo
+       ldr     $t2,[sp,#$Boff+0]       @ b.lo
+       adc     $Ehi,$Ehi,$Thi          @ d += T
+       teq     $t0,#$magic
+
+       ldr     $t3,[sp,#$Coff+0]       @ c.lo
+#if __ARM_ARCH__>=7
+       it      eq                      @ Thumb2 thing, sanity check in ARM
+#endif
+       orreq   $Ktbl,$Ktbl,#1
+       @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+       @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+       @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+       mov     $t0,$Alo,lsr#28
+       mov     $t1,$Ahi,lsr#28
+       eor     $t0,$t0,$Ahi,lsl#4
+       eor     $t1,$t1,$Alo,lsl#4
+       eor     $t0,$t0,$Ahi,lsr#2
+       eor     $t1,$t1,$Alo,lsr#2
+       eor     $t0,$t0,$Alo,lsl#30
+       eor     $t1,$t1,$Ahi,lsl#30
+       eor     $t0,$t0,$Ahi,lsr#7
+       eor     $t1,$t1,$Alo,lsr#7
+       eor     $t0,$t0,$Alo,lsl#25
+       eor     $t1,$t1,$Ahi,lsl#25     @ Sigma0(a)
+       adds    $Tlo,$Tlo,$t0
+       and     $t0,$Alo,$t2
+       adc     $Thi,$Thi,$t1           @ T += Sigma0(a)
+
+       ldr     $t1,[sp,#$Boff+4]       @ b.hi
+       orr     $Alo,$Alo,$t2
+       ldr     $t2,[sp,#$Coff+4]       @ c.hi
+       and     $Alo,$Alo,$t3
+       and     $t3,$Ahi,$t1
+       orr     $Ahi,$Ahi,$t1
+       orr     $Alo,$Alo,$t0           @ Maj(a,b,c).lo
+       and     $Ahi,$Ahi,$t2
+       adds    $Alo,$Alo,$Tlo
+       orr     $Ahi,$Ahi,$t3           @ Maj(a,b,c).hi
+       sub     sp,sp,#8
+       adc     $Ahi,$Ahi,$Thi          @ h += T
+       tst     $Ktbl,#1
+       add     $Ktbl,$Ktbl,#8
+___
+}
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+# define VFP_ABI_PUSH  vstmdb  sp!,{d8-d15}
+# define VFP_ABI_POP   vldmia  sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)       .word   lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)       .word   hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code  32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type  K512,%object
+.align 5
+K512:
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size  K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word  OPENSSL_armcap_P-sha512_block_data_order
+.skip  32-4
+#else
+.skip  32
+#endif
+
+.global        sha512_block_data_order
+.type  sha512_block_data_order,%function
+sha512_block_data_order:
+.Lsha512_block_data_order:
+#if __ARM_ARCH__<7
+       sub     r3,pc,#8                @ sha512_block_data_order
+#else
+       adr     r3,.Lsha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+       ldr     r12,.LOPENSSL_armcap
+       ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
+       tst     r12,#1
+       bne     .LNEON
+#endif
+       add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
+       stmdb   sp!,{r4-r12,lr}
+       sub     $Ktbl,r3,#672           @ K512
+       sub     sp,sp,#9*8
+
+       ldr     $Elo,[$ctx,#$Eoff+$lo]
+       ldr     $Ehi,[$ctx,#$Eoff+$hi]
+       ldr     $t0, [$ctx,#$Goff+$lo]
+       ldr     $t1, [$ctx,#$Goff+$hi]
+       ldr     $t2, [$ctx,#$Hoff+$lo]
+       ldr     $t3, [$ctx,#$Hoff+$hi]
+.Loop:
+       str     $t0, [sp,#$Goff+0]
+       str     $t1, [sp,#$Goff+4]
+       str     $t2, [sp,#$Hoff+0]
+       str     $t3, [sp,#$Hoff+4]
+       ldr     $Alo,[$ctx,#$Aoff+$lo]
+       ldr     $Ahi,[$ctx,#$Aoff+$hi]
+       ldr     $Tlo,[$ctx,#$Boff+$lo]
+       ldr     $Thi,[$ctx,#$Boff+$hi]
+       ldr     $t0, [$ctx,#$Coff+$lo]
+       ldr     $t1, [$ctx,#$Coff+$hi]
+       ldr     $t2, [$ctx,#$Doff+$lo]
+       ldr     $t3, [$ctx,#$Doff+$hi]
+       str     $Tlo,[sp,#$Boff+0]
+       str     $Thi,[sp,#$Boff+4]
+       str     $t0, [sp,#$Coff+0]
+       str     $t1, [sp,#$Coff+4]
+       str     $t2, [sp,#$Doff+0]
+       str     $t3, [sp,#$Doff+4]
+       ldr     $Tlo,[$ctx,#$Foff+$lo]
+       ldr     $Thi,[$ctx,#$Foff+$hi]
+       str     $Tlo,[sp,#$Foff+0]
+       str     $Thi,[sp,#$Foff+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+       ldrb    $Tlo,[$inp,#7]
+       ldrb    $t0, [$inp,#6]
+       ldrb    $t1, [$inp,#5]
+       ldrb    $t2, [$inp,#4]
+       ldrb    $Thi,[$inp,#3]
+       ldrb    $t3, [$inp,#2]
+       orr     $Tlo,$Tlo,$t0,lsl#8
+       ldrb    $t0, [$inp,#1]
+       orr     $Tlo,$Tlo,$t1,lsl#16
+       ldrb    $t1, [$inp],#8
+       orr     $Tlo,$Tlo,$t2,lsl#24
+       orr     $Thi,$Thi,$t3,lsl#8
+       orr     $Thi,$Thi,$t0,lsl#16
+       orr     $Thi,$Thi,$t1,lsl#24
+#else
+       ldr     $Tlo,[$inp,#4]
+       ldr     $Thi,[$inp],#8
+#ifdef __ARMEL__
+       rev     $Tlo,$Tlo
+       rev     $Thi,$Thi
+#endif
+#endif
+___
+       &BODY_00_15(0x94);
+$code.=<<___;
+       tst     $Ktbl,#1
+       beq     .L00_15
+       ldr     $t0,[sp,#`$Xoff+8*(16-1)`+0]
+       ldr     $t1,[sp,#`$Xoff+8*(16-1)`+4]
+       bic     $Ktbl,$Ktbl,#1
+.L16_79:
+       @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+       @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+       @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
+       mov     $Tlo,$t0,lsr#1
+       ldr     $t2,[sp,#`$Xoff+8*(16-14)`+0]
+       mov     $Thi,$t1,lsr#1
+       ldr     $t3,[sp,#`$Xoff+8*(16-14)`+4]
+       eor     $Tlo,$Tlo,$t1,lsl#31
+       eor     $Thi,$Thi,$t0,lsl#31
+       eor     $Tlo,$Tlo,$t0,lsr#8
+       eor     $Thi,$Thi,$t1,lsr#8
+       eor     $Tlo,$Tlo,$t1,lsl#24
+       eor     $Thi,$Thi,$t0,lsl#24
+       eor     $Tlo,$Tlo,$t0,lsr#7
+       eor     $Thi,$Thi,$t1,lsr#7
+       eor     $Tlo,$Tlo,$t1,lsl#25
+
+       @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+       @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+       @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+       mov     $t0,$t2,lsr#19
+       mov     $t1,$t3,lsr#19
+       eor     $t0,$t0,$t3,lsl#13
+       eor     $t1,$t1,$t2,lsl#13
+       eor     $t0,$t0,$t3,lsr#29
+       eor     $t1,$t1,$t2,lsr#29
+       eor     $t0,$t0,$t2,lsl#3
+       eor     $t1,$t1,$t3,lsl#3
+       eor     $t0,$t0,$t2,lsr#6
+       eor     $t1,$t1,$t3,lsr#6
+       ldr     $t2,[sp,#`$Xoff+8*(16-9)`+0]
+       eor     $t0,$t0,$t3,lsl#26
+
+       ldr     $t3,[sp,#`$Xoff+8*(16-9)`+4]
+       adds    $Tlo,$Tlo,$t0
+       ldr     $t0,[sp,#`$Xoff+8*16`+0]
+       adc     $Thi,$Thi,$t1
+
+       ldr     $t1,[sp,#`$Xoff+8*16`+4]
+       adds    $Tlo,$Tlo,$t2
+       adc     $Thi,$Thi,$t3
+       adds    $Tlo,$Tlo,$t0
+       adc     $Thi,$Thi,$t1
+___
+       &BODY_00_15(0x17);
+$code.=<<___;
+#if __ARM_ARCH__>=7
+       ittt    eq                      @ Thumb2 thing, sanity check in ARM
+#endif
+       ldreq   $t0,[sp,#`$Xoff+8*(16-1)`+0]
+       ldreq   $t1,[sp,#`$Xoff+8*(16-1)`+4]
+       beq     .L16_79
+       bic     $Ktbl,$Ktbl,#1
+
+       ldr     $Tlo,[sp,#$Boff+0]
+       ldr     $Thi,[sp,#$Boff+4]
+       ldr     $t0, [$ctx,#$Aoff+$lo]
+       ldr     $t1, [$ctx,#$Aoff+$hi]
+       ldr     $t2, [$ctx,#$Boff+$lo]
+       ldr     $t3, [$ctx,#$Boff+$hi]
+       adds    $t0,$Alo,$t0
+       str     $t0, [$ctx,#$Aoff+$lo]
+       adc     $t1,$Ahi,$t1
+       str     $t1, [$ctx,#$Aoff+$hi]
+       adds    $t2,$Tlo,$t2
+       str     $t2, [$ctx,#$Boff+$lo]
+       adc     $t3,$Thi,$t3
+       str     $t3, [$ctx,#$Boff+$hi]
+
+       ldr     $Alo,[sp,#$Coff+0]
+       ldr     $Ahi,[sp,#$Coff+4]
+       ldr     $Tlo,[sp,#$Doff+0]
+       ldr     $Thi,[sp,#$Doff+4]
+       ldr     $t0, [$ctx,#$Coff+$lo]
+       ldr     $t1, [$ctx,#$Coff+$hi]
+       ldr     $t2, [$ctx,#$Doff+$lo]
+       ldr     $t3, [$ctx,#$Doff+$hi]
+       adds    $t0,$Alo,$t0
+       str     $t0, [$ctx,#$Coff+$lo]
+       adc     $t1,$Ahi,$t1
+       str     $t1, [$ctx,#$Coff+$hi]
+       adds    $t2,$Tlo,$t2
+       str     $t2, [$ctx,#$Doff+$lo]
+       adc     $t3,$Thi,$t3
+       str     $t3, [$ctx,#$Doff+$hi]
+
+       ldr     $Tlo,[sp,#$Foff+0]
+       ldr     $Thi,[sp,#$Foff+4]
+       ldr     $t0, [$ctx,#$Eoff+$lo]
+       ldr     $t1, [$ctx,#$Eoff+$hi]
+       ldr     $t2, [$ctx,#$Foff+$lo]
+       ldr     $t3, [$ctx,#$Foff+$hi]
+       adds    $Elo,$Elo,$t0
+       str     $Elo,[$ctx,#$Eoff+$lo]
+       adc     $Ehi,$Ehi,$t1
+       str     $Ehi,[$ctx,#$Eoff+$hi]
+       adds    $t2,$Tlo,$t2
+       str     $t2, [$ctx,#$Foff+$lo]
+       adc     $t3,$Thi,$t3
+       str     $t3, [$ctx,#$Foff+$hi]
+
+       ldr     $Alo,[sp,#$Goff+0]
+       ldr     $Ahi,[sp,#$Goff+4]
+       ldr     $Tlo,[sp,#$Hoff+0]
+       ldr     $Thi,[sp,#$Hoff+4]
+       ldr     $t0, [$ctx,#$Goff+$lo]
+       ldr     $t1, [$ctx,#$Goff+$hi]
+       ldr     $t2, [$ctx,#$Hoff+$lo]
+       ldr     $t3, [$ctx,#$Hoff+$hi]
+       adds    $t0,$Alo,$t0
+       str     $t0, [$ctx,#$Goff+$lo]
+       adc     $t1,$Ahi,$t1
+       str     $t1, [$ctx,#$Goff+$hi]
+       adds    $t2,$Tlo,$t2
+       str     $t2, [$ctx,#$Hoff+$lo]
+       adc     $t3,$Thi,$t3
+       str     $t3, [$ctx,#$Hoff+$hi]
+
+       add     sp,sp,#640
+       sub     $Ktbl,$Ktbl,#640
+
+       teq     $inp,$len
+       bne     .Loop
+
+       add     sp,sp,#8*9              @ destroy frame
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r12,pc}
+#else
+       ldmia   sp!,{r4-r12,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  sha512_block_data_order,.-sha512_block_data_order
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));  # temps
+
+$code.=<<___ if ($i<16 || $i&1);
+       vshr.u64        $t0,$e,#@Sigma1[0]      @ $i
+#if $i<16
+       vld1.64         {@X[$i%16]},[$inp]!     @ handles unaligned
+#endif
+       vshr.u64        $t1,$e,#@Sigma1[1]
+#if $i>0
+        vadd.i64       $a,$Maj                 @ h+=Maj from the past
+#endif
+       vshr.u64        $t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+       vld1.64         {$K},[$Ktbl,:64]!       @ K[i++]
+       vsli.64         $t0,$e,#`64-@Sigma1[0]`
+       vsli.64         $t1,$e,#`64-@Sigma1[1]`
+       vmov            $Ch,$e
+       vsli.64         $t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+       vrev64.8        @X[$i],@X[$i]
+#endif
+       veor            $t1,$t0
+       vbsl            $Ch,$f,$g               @ Ch(e,f,g)
+       vshr.u64        $t0,$a,#@Sigma0[0]
+       veor            $t2,$t1                 @ Sigma1(e)
+       vadd.i64        $T1,$Ch,$h
+       vshr.u64        $t1,$a,#@Sigma0[1]
+       vsli.64         $t0,$a,#`64-@Sigma0[0]`
+       vadd.i64        $T1,$t2
+       vshr.u64        $t2,$a,#@Sigma0[2]
+       vadd.i64        $K,@X[$i%16]
+       vsli.64         $t1,$a,#`64-@Sigma0[1]`
+       veor            $Maj,$a,$b
+       vsli.64         $t2,$a,#`64-@Sigma0[2]`
+       veor            $h,$t0,$t1
+       vadd.i64        $T1,$K
+       vbsl            $Maj,$c,$b              @ Maj(a,b,c)
+       veor            $h,$t2                  @ Sigma0(a)
+       vadd.i64        $d,$T1
+       vadd.i64        $Maj,$T1
+       @ vadd.i64      $h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1)      { &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7));                       # view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));    # temps
+my ($d0,$d1,$d2) = map("d$_",(24..26));                # temps from NEON_00_15
+my $e=@_[4];                                   # $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+       vshr.u64        $t0,@X[($i+7)%8],#@sigma1[0]
+       vshr.u64        $t1,@X[($i+7)%8],#@sigma1[1]
+        vadd.i64       @_[0],d30                       @ h+=Maj from the past
+       vshr.u64        $s1,@X[($i+7)%8],#@sigma1[2]
+       vsli.64         $t0,@X[($i+7)%8],#`64-@sigma1[0]`
+       vext.8          $s0,@X[$i%8],@X[($i+1)%8],#8    @ X[i+1]
+       vsli.64         $t1,@X[($i+7)%8],#`64-@sigma1[1]`
+       veor            $s1,$t0
+       vshr.u64        $t0,$s0,#@sigma0[0]
+       veor            $s1,$t1                         @ sigma1(X[i+14])
+       vshr.u64        $t1,$s0,#@sigma0[1]
+       vadd.i64        @X[$i%8],$s1
+       vshr.u64        $s1,$s0,#@sigma0[2]
+       vsli.64         $t0,$s0,#`64-@sigma0[0]`
+       vsli.64         $t1,$s0,#`64-@sigma0[1]`
+       vext.8          $s0,@X[($i+4)%8],@X[($i+5)%8],#8        @ X[i+9]
+       veor            $s1,$t0
+       vshr.u64        $d0,$e,#@Sigma1[0]              @ from NEON_00_15
+       vadd.i64        @X[$i%8],$s0
+       vshr.u64        $d1,$e,#@Sigma1[1]              @ from NEON_00_15
+       veor            $s1,$t1                         @ sigma0(X[i+1])
+       vshr.u64        $d2,$e,#@Sigma1[2]              @ from NEON_00_15
+       vadd.i64        @X[$i%8],$s1
+___
+       &NEON_00_15(2*$i,@_);
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch  armv7-a
+.fpu   neon
+
+.global        sha512_block_data_order_neon
+.type  sha512_block_data_order_neon,%function
+.align 4
+sha512_block_data_order_neon:
+.LNEON:
+       dmb                             @ errata #451034 on early Cortex A8
+       add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
+       VFP_ABI_PUSH
+       adr     $Ktbl,.Lsha512_block_data_order
+       sub     $Ktbl,$Ktbl,.Lsha512_block_data_order-K512
+       vldmia  $ctx,{$A-$H}            @ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++)   { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       mov             $cnt,#4
+.L16_79_neon:
+       subs            $cnt,#1
+___
+for(;$i<32;$i++)       { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       bne             .L16_79_neon
+
+        vadd.i64       $A,d30          @ h+=Maj from the past
+       vldmia          $ctx,{d24-d31}  @ load context to temp
+       vadd.i64        q8,q12          @ vectorized accumulate
+       vadd.i64        q9,q13
+       vadd.i64        q10,q14
+       vadd.i64        q11,q15
+       vstmia          $ctx,{$A-$H}    @ save context
+       teq             $inp,$len
+       sub             $Ktbl,#640      @ rewind K512
+       bne             .Loop_neon
+
+       VFP_ABI_POP
+       ret                             @ bx lr
+.size  sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm  OPENSSL_armcap_P,4,4
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx  lr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+       next if (/^#!/);
+       last if (!s/^#/@/ and !/^$/);
+       print;
+}
+close SELF;
+
+print $code;
+close STDOUT; # enforce flush
diff --git a/lib/crypto/arm/sha512.h b/lib/crypto/arm/sha512.h
new file mode 100644
index 0000000..f147b64
--- /dev/null
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * arm32-optimized SHA-512 block function
+ *
+ * Copyright 2025 Google LLC
+ */
+
+#include <asm/neon.h>
+#include <crypto/internal/simd.h>
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+asmlinkage void sha512_block_data_order(struct sha512_block_state *state,
+                                       const u8 *data, size_t nblocks);
+asmlinkage void sha512_block_data_order_neon(struct sha512_block_state *state,
+                                            const u8 *data, size_t nblocks);
+
+static void sha512_blocks(struct sha512_block_state *state,
+                         const u8 *data, size_t nblocks)
+{
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+           static_branch_likely(&have_neon) && likely(crypto_simd_usable())) {
+               kernel_neon_begin();
+               sha512_block_data_order_neon(state, data, nblocks);
+               kernel_neon_end();
+       } else {
+               sha512_block_data_order(state, data, nblocks);
+       }
+}
+
+#ifdef CONFIG_KERNEL_MODE_NEON
+#define sha512_mod_init_arch sha512_mod_init_arch
+static inline void sha512_mod_init_arch(void)
+{
+       if (cpu_has_neon())
+               static_branch_enable(&have_neon);
+}
+#endif /* CONFIG_KERNEL_MODE_NEON */
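
For reference, the Makefile hunk above ("CFLAGS_sha512.o +=
-I$(src)/$(SRCARCH)") is what wires this header into the generic code:
it puts lib/crypto/arm/ on the include path so that lib/crypto/sha512.c
can pull in the arch override.  A simplified sketch of that pattern
follows; the generic side is not part of this diff, so treat the exact
ifdef structure as illustrative.

	/*
	 * Illustrative sketch of the generic side (lib/crypto/sha512.c);
	 * the real file may differ in detail.
	 */
	#ifdef CONFIG_CRYPTO_LIB_SHA512_ARCH
	# include "sha512.h"	/* resolves to lib/crypto/$(SRCARCH)/sha512.h */
	#else
	# define sha512_blocks sha512_blocks_generic
	#endif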