crypto: arm/sha256 - implement library instead of shash
author	Eric Biggers <ebiggers@google.com>
Mon, 28 Apr 2025 17:00:27 +0000 (10:00 -0700)
committer	Herbert Xu <herbert@gondor.apana.org.au>
Mon, 5 May 2025 10:20:43 +0000 (18:20 +0800)
Instead of providing crypto_shash algorithms for the arch-optimized
SHA-256 code, implement the SHA-256 library.  This is much simpler, it
makes the SHA-256 library functions arch-optimized, and it fixes the
longstanding issue where the arch-optimized SHA-256 was disabled by
default.  SHA-256 still remains available through crypto_shash, but
individual architectures no longer need to handle it.
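
For context, the library entry point that replaces the per-arch shash
glue is sha256_blocks_arch().  Below is a minimal sketch of how a
combined scalar/NEON/CE dispatcher for it can look; the asm entry-point
names follow the old glue code (with sha2_ce_transform presumably
renamed to match the sha256 naming), while the headers, static-key
names, and the exact state-array type are assumptions for the example,
not quotes from the new sha256.c:

	/* Sketch only: SHA-256 library dispatch for ARM. */
	#include <asm/neon.h>
	#include <crypto/internal/simd.h>
	#include <crypto/sha2.h>
	#include <linux/kernel.h>
	#include <linux/module.h>

	asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS],
						const u8 *data, size_t nblocks);
	asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
						     const u8 *data, size_t nblocks);
	asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
					    const u8 *data, size_t nblocks);

	static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
	static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);

	void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
				const u8 *data, size_t nblocks)
	{
		/* Use NEON/CE only when SIMD is usable in this context. */
		if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
		    static_branch_likely(&have_neon) && crypto_simd_usable()) {
			kernel_neon_begin();
			if (static_branch_likely(&have_ce))
				sha256_ce_transform(state, data, nblocks);
			else
				sha256_block_data_order_neon(state, data, nblocks);
			kernel_neon_end();
		} else {
			sha256_block_data_order(state, data, nblocks);
		}
	}
	EXPORT_SYMBOL_GPL(sha256_blocks_arch);

The static keys would be flipped at module init based on cpu_has_neon()
and the CPU's SHA-2 capability, which is what lets a single module cover
all three code paths at run time.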

To merge the scalar, NEON, and CE code all into one module cleanly, add
!CPU_V7M as a direct dependency of the CE code.  Previously, !CPU_V7M
was only a direct dependency of the scalar and NEON code.  The result is
still the same because CPU_V7M implies !KERNEL_MODE_NEON, so !CPU_V7M
was already an indirect dependency of the CE code.
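
With the SHA2_ARM_CE and SHA256_ARM shash options gone, this collapses
into a single hidden tristate and one module built from three objects,
as the Kconfig and Makefile hunks further down show:

	config CRYPTO_SHA256_ARM
	       tristate
	       depends on !CPU_V7M
	       default CRYPTO_LIB_SHA256
	       select CRYPTO_ARCH_HAVE_LIB_SHA256

	obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
	sha256-arm-y := sha256.o sha256-core.o
	sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o

The CE object is still built only when KERNEL_MODE_NEON is enabled; the
!CPU_V7M dependency simply moves up to the option that now covers all
three implementations.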

To match sha256_blocks_arch(), change the type of the nblocks parameter
of the assembly functions from int to size_t.  The assembly functions
actually already treated it as size_t.
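
Concretely, the prototype seen by C code changes along these lines (the
old declaration is verbatim from the deleted sha256_glue.c; the new one
assumes the library's u32-array state type, which this message does not
spell out):

	/* Old shash glue: */
	asmlinkage void sha256_block_data_order(struct crypto_sha256_state *state,
						const u8 *data, int num_blks);

	/* New library glue, matching sha256_blocks_arch(): */
	asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS],
						const u8 *data, size_t nblocks);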

While renaming the assembly files, also fix the naming quirk where
"sha2" meant sha256.  (SHA-512 is also part of SHA-2.)

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
19 files changed:
arch/arm/configs/exynos_defconfig
arch/arm/configs/milbeaut_m10v_defconfig
arch/arm/configs/multi_v7_defconfig
arch/arm/configs/omap2plus_defconfig
arch/arm/configs/pxa_defconfig
arch/arm/crypto/Kconfig
arch/arm/crypto/Makefile
arch/arm/crypto/sha2-ce-core.S [deleted file]
arch/arm/crypto/sha2-ce-glue.c [deleted file]
arch/arm/crypto/sha256-armv4.pl [deleted file]
arch/arm/crypto/sha256_glue.c [deleted file]
arch/arm/crypto/sha256_glue.h [deleted file]
arch/arm/crypto/sha256_neon_glue.c [deleted file]
arch/arm/lib/crypto/.gitignore
arch/arm/lib/crypto/Kconfig
arch/arm/lib/crypto/Makefile
arch/arm/lib/crypto/sha256-armv4.pl [new file with mode: 0644]
arch/arm/lib/crypto/sha256-ce.S [new file with mode: 0644]
arch/arm/lib/crypto/sha256.c [new file with mode: 0644]

index e81a5d6c1c20856acc45cd1b34da92de095844ec..c6792c0256a679d6ac442b61544008ab1d035632 100644 (file)
@@ -364,7 +364,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA256_ARM=m
 CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
 CONFIG_CRYPTO_CHACHA20_NEON=m
index 275ddf7a3a14df3001b56b9b4a86bf485610ffb5..4ec21f477c633a0b6c4168012b50db7d7d7f36e6 100644 (file)
@@ -101,7 +101,6 @@ CONFIG_CRYPTO_SEQIV=m
 CONFIG_CRYPTO_GHASH_ARM_CE=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
 CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA2_ARM_CE=m
 CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
index ad037c175fdb0ec8601c9b3607aca0c0e5f3c145..96178acedad0b18e12f7c81e96681296624b7bff 100644 (file)
@@ -1301,7 +1301,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m
 CONFIG_CRYPTO_GHASH_ARM_CE=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
 CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA2_ARM_CE=m
 CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
index 75b326bc7830ce3315a89c3979fbdf36401ff5b0..317f977e509e6e9423d60fd284d15679717e2a53 100644 (file)
@@ -697,7 +697,6 @@ CONFIG_SECURITY=y
 CONFIG_CRYPTO_MICHAEL_MIC=y
 CONFIG_CRYPTO_GHASH_ARM_CE=m
 CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA256_ARM=m
 CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_CRYPTO_AES_ARM_BS=m
index 24fca8608554cd74088bb2107eeb5c9e6ee2b7af..56be857529095700951f5d88eb8a8eda4b0a76dc 100644 (file)
@@ -660,7 +660,6 @@ CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_DEFLATE=y
 CONFIG_CRYPTO_LZO=y
 CONFIG_CRYPTO_SHA1_ARM=m
-CONFIG_CRYPTO_SHA256_ARM=m
 CONFIG_CRYPTO_SHA512_ARM=m
 CONFIG_CRYPTO_AES_ARM=m
 CONFIG_FONTS=y
index 1f889d6bab77debc386cb6023157437b4247a8b4..7efb9a8596e4e58302144c4b16cfa90f9afac465 100644 (file)
@@ -93,27 +93,6 @@ config CRYPTO_SHA1_ARM_CE
 
          Architecture: arm using ARMv8 Crypto Extensions
 
-config CRYPTO_SHA2_ARM_CE
-       tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)"
-       depends on KERNEL_MODE_NEON
-       select CRYPTO_SHA256_ARM
-       select CRYPTO_HASH
-       help
-         SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-         Architecture: arm using
-         - ARMv8 Crypto Extensions
-
-config CRYPTO_SHA256_ARM
-       tristate "Hash functions: SHA-224 and SHA-256 (NEON)"
-       select CRYPTO_HASH
-       depends on !CPU_V7M
-       help
-         SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
-         Architecture: arm using
-         - NEON (Advanced SIMD) extensions
-
 config CRYPTO_SHA512_ARM
        tristate "Hash functions: SHA-384 and SHA-512 (NEON)"
        select CRYPTO_HASH
index ecabe6603e08046071facffb7e887fb27623d67c..8479137c6e80022766342694d4ee9dbca12205f4 100644 (file)
@@ -7,7 +7,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
-obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
@@ -15,20 +14,16 @@ obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
 obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
 
 aes-arm-y      := aes-cipher-core.o aes-cipher-glue.o
 aes-arm-bs-y   := aes-neonbs-core.o aes-neonbs-glue.o
 sha1-arm-y     := sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y        := sha1-armv7-neon.o sha1_neon_glue.o
-sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
-sha256-arm-y   := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
 sha512-arm-y   := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
 blake2b-neon-y  := blake2b-neon-core.o blake2b-neon-glue.o
 sha1-arm-ce-y  := sha1-ce-core.o sha1-ce-glue.o
-sha2-arm-ce-y  := sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y   := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
@@ -40,9 +35,8 @@ quiet_cmd_perl = PERL    $@
 $(obj)/%-core.S: $(src)/%-armv4.pl
        $(call cmd,perl)
 
-clean-files += sha256-core.S sha512-core.S
+clean-files += sha512-core.S
 
 aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
 
-AFLAGS_sha256-core.o += $(aflags-thumb2-y)
 AFLAGS_sha512-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/crypto/sha2-ce-core.S b/arch/arm/crypto/sha2-ce-core.S
deleted file mode 100644 (file)
index b6369d2..0000000
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha2-ce-core.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-       .text
-       .arch           armv8-a
-       .fpu            crypto-neon-fp-armv8
-
-       k0              .req    q7
-       k1              .req    q8
-       rk              .req    r3
-
-       ta0             .req    q9
-       ta1             .req    q10
-       tb0             .req    q10
-       tb1             .req    q9
-
-       dga             .req    q11
-       dgb             .req    q12
-
-       dg0             .req    q13
-       dg1             .req    q14
-       dg2             .req    q15
-
-       .macro          add_only, ev, s0
-       vmov            dg2, dg0
-       .ifnb           \s0
-       vld1.32         {k\ev}, [rk, :128]!
-       .endif
-       sha256h.32      dg0, dg1, tb\ev
-       sha256h2.32     dg1, dg2, tb\ev
-       .ifnb           \s0
-       vadd.u32        ta\ev, q\s0, k\ev
-       .endif
-       .endm
-
-       .macro          add_update, ev, s0, s1, s2, s3
-       sha256su0.32    q\s0, q\s1
-       add_only        \ev, \s1
-       sha256su1.32    q\s0, q\s2, q\s3
-       .endm
-
-       .align          6
-.Lsha256_rcon:
-       .word           0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
-       .word           0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
-       .word           0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
-       .word           0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
-       .word           0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
-       .word           0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
-       .word           0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
-       .word           0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
-       .word           0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
-       .word           0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
-       .word           0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
-       .word           0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
-       .word           0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
-       .word           0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
-       .word           0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
-       .word           0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-
-       /*
-        * void sha2_ce_transform(struct sha256_state *sst, u8 const *src,
-                                 int blocks);
-        */
-ENTRY(sha2_ce_transform)
-       /* load state */
-       vld1.32         {dga-dgb}, [r0]
-
-       /* load input */
-0:     vld1.32         {q0-q1}, [r1]!
-       vld1.32         {q2-q3}, [r1]!
-       subs            r2, r2, #1
-
-#ifndef CONFIG_CPU_BIG_ENDIAN
-       vrev32.8        q0, q0
-       vrev32.8        q1, q1
-       vrev32.8        q2, q2
-       vrev32.8        q3, q3
-#endif
-
-       /* load first round constant */
-       adr             rk, .Lsha256_rcon
-       vld1.32         {k0}, [rk, :128]!
-
-       vadd.u32        ta0, q0, k0
-       vmov            dg0, dga
-       vmov            dg1, dgb
-
-       add_update      1, 0, 1, 2, 3
-       add_update      0, 1, 2, 3, 0
-       add_update      1, 2, 3, 0, 1
-       add_update      0, 3, 0, 1, 2
-       add_update      1, 0, 1, 2, 3
-       add_update      0, 1, 2, 3, 0
-       add_update      1, 2, 3, 0, 1
-       add_update      0, 3, 0, 1, 2
-       add_update      1, 0, 1, 2, 3
-       add_update      0, 1, 2, 3, 0
-       add_update      1, 2, 3, 0, 1
-       add_update      0, 3, 0, 1, 2
-
-       add_only        1, 1
-       add_only        0, 2
-       add_only        1, 3
-       add_only        0
-
-       /* update state */
-       vadd.u32        dga, dga, dg0
-       vadd.u32        dgb, dgb, dg1
-       bne             0b
-
-       /* store new state */
-       vst1.32         {dga-dgb}, [r0]
-       bx              lr
-ENDPROC(sha2_ce_transform)
diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
deleted file mode 100644 (file)
index 1e9d16f..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-asmlinkage void sha2_ce_transform(struct crypto_sha256_state *sst,
-                                 u8 const *src, int blocks);
-
-static int sha2_ce_update(struct shash_desc *desc, const u8 *data,
-                         unsigned int len)
-{
-       int remain;
-
-       kernel_neon_begin();
-       remain = sha256_base_do_update_blocks(desc, data, len,
-                                             sha2_ce_transform);
-       kernel_neon_end();
-       return remain;
-}
-
-static int sha2_ce_finup(struct shash_desc *desc, const u8 *data,
-                        unsigned int len, u8 *out)
-{
-       kernel_neon_begin();
-       sha256_base_do_finup(desc, data, len, sha2_ce_transform);
-       kernel_neon_end();
-       return sha256_base_finish(desc, out);
-}
-
-static struct shash_alg algs[] = { {
-       .init                   = sha224_base_init,
-       .update                 = sha2_ce_update,
-       .finup                  = sha2_ce_finup,
-       .descsize               = sizeof(struct crypto_sha256_state),
-       .digestsize             = SHA224_DIGEST_SIZE,
-       .base                   = {
-               .cra_name               = "sha224",
-               .cra_driver_name        = "sha224-ce",
-               .cra_priority           = 300,
-               .cra_flags              = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-                                         CRYPTO_AHASH_ALG_FINUP_MAX,
-               .cra_blocksize          = SHA256_BLOCK_SIZE,
-               .cra_module             = THIS_MODULE,
-       }
-}, {
-       .init                   = sha256_base_init,
-       .update                 = sha2_ce_update,
-       .finup                  = sha2_ce_finup,
-       .descsize               = sizeof(struct crypto_sha256_state),
-       .digestsize             = SHA256_DIGEST_SIZE,
-       .base                   = {
-               .cra_name               = "sha256",
-               .cra_driver_name        = "sha256-ce",
-               .cra_priority           = 300,
-               .cra_flags              = CRYPTO_AHASH_ALG_BLOCK_ONLY |
-                                         CRYPTO_AHASH_ALG_FINUP_MAX,
-               .cra_blocksize          = SHA256_BLOCK_SIZE,
-               .cra_module             = THIS_MODULE,
-       }
-} };
-
-static int __init sha2_ce_mod_init(void)
-{
-       return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha2_ce_mod_fini(void)
-{
-       crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_cpu_feature_match(SHA2, sha2_ce_mod_init);
-module_exit(sha2_ce_mod_fini);
diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
deleted file mode 100644 (file)
index f3a2b54..0000000
+++ /dev/null
@@ -1,724 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# SHA256 block procedure for ARMv4. May 2007.
-
-# Performance is ~2x better than gcc 3.4 generated code and in "abso-
-# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte [on single-issue Xscale PXA250 core].
-
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 22% improvement on
-# Cortex A8 core and ~20 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-# September 2013.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process one
-# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-# code (meaning that latter performs sub-optimally, nothing was done
-# about it).
-
-# May 2014.
-#
-# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$ctx="r0";     $t0="r0";
-$inp="r1";     $t4="r1";
-$len="r2";     $t1="r2";
-$T1="r3";      $t3="r3";
-$A="r4";
-$B="r5";
-$C="r6";
-$D="r7";
-$E="r8";
-$F="r9";
-$G="r10";
-$H="r11";
-@V=($A,$B,$C,$D,$E,$F,$G,$H);
-$t2="r12";
-$Ktbl="r14";
-
-@Sigma0=( 2,13,22);
-@Sigma1=( 6,11,25);
-@sigma0=( 7,18, 3);
-@sigma1=(17,19,10);
-
-sub BODY_00_15 {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___ if ($i<16);
-#if __ARM_ARCH__>=7
-       @ ldr   $t1,[$inp],#4                   @ $i
-# if $i==15
-       str     $inp,[sp,#17*4]                 @ make room for $t4
-# endif
-       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
-       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
-       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
-# ifndef __ARMEB__
-       rev     $t1,$t1
-# endif
-#else
-       @ ldrb  $t1,[$inp,#3]                   @ $i
-       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
-       ldrb    $t2,[$inp,#2]
-       ldrb    $t0,[$inp,#1]
-       orr     $t1,$t1,$t2,lsl#8
-       ldrb    $t2,[$inp],#4
-       orr     $t1,$t1,$t0,lsl#16
-# if $i==15
-       str     $inp,[sp,#17*4]                 @ make room for $t4
-# endif
-       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
-       orr     $t1,$t1,$t2,lsl#24
-       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
-#endif
-___
-$code.=<<___;
-       ldr     $t2,[$Ktbl],#4                  @ *K256++
-       add     $h,$h,$t1                       @ h+=X[i]
-       str     $t1,[sp,#`$i%16`*4]
-       eor     $t1,$f,$g
-       add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
-       and     $t1,$t1,$e
-       add     $h,$h,$t2                       @ h+=K256[i]
-       eor     $t1,$t1,$g                      @ Ch(e,f,g)
-       eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
-       add     $h,$h,$t1                       @ h+=Ch(e,f,g)
-#if $i==31
-       and     $t2,$t2,#0xff
-       cmp     $t2,#0xf2                       @ done?
-#endif
-#if $i<15
-# if __ARM_ARCH__>=7
-       ldr     $t1,[$inp],#4                   @ prefetch
-# else
-       ldrb    $t1,[$inp,#3]
-# endif
-       eor     $t2,$a,$b                       @ a^b, b^c in next round
-#else
-       ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
-       eor     $t2,$a,$b                       @ a^b, b^c in next round
-       ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
-#endif
-       eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
-       and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
-       add     $d,$d,$h                        @ d+=h
-       eor     $t3,$t3,$b                      @ Maj(a,b,c)
-       add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
-       @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
-___
-       ($t2,$t3)=($t3,$t2);
-}
-
-sub BODY_16_XX {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___;
-       @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
-       @ ldr   $t4,[sp,#`($i+14)%16`*4]
-       mov     $t0,$t1,ror#$sigma0[0]
-       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
-       mov     $t2,$t4,ror#$sigma1[0]
-       eor     $t0,$t0,$t1,ror#$sigma0[1]
-       eor     $t2,$t2,$t4,ror#$sigma1[1]
-       eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
-       ldr     $t1,[sp,#`($i+0)%16`*4]
-       eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
-       ldr     $t4,[sp,#`($i+9)%16`*4]
-
-       add     $t2,$t2,$t0
-       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
-       add     $t1,$t1,$t2
-       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
-       add     $t1,$t1,$t4                     @ X[i]
-___
-       &BODY_00_15(@_);
-}
-
-$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code  32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
-.code   32
-# endif
-#endif
-
-.type  K256,%object
-.align 5
-K256:
-.word  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.size  K256,.-K256
-.word  0                               @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word  OPENSSL_armcap_P-sha256_block_data_order
-#endif
-.align 5
-
-.global        sha256_block_data_order
-.type  sha256_block_data_order,%function
-sha256_block_data_order:
-.Lsha256_block_data_order:
-#if __ARM_ARCH__<7
-       sub     r3,pc,#8                @ sha256_block_data_order
-#else
-       adr     r3,.Lsha256_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-       ldr     r12,.LOPENSSL_armcap
-       ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
-       tst     r12,#ARMV8_SHA256
-       bne     .LARMv8
-       tst     r12,#ARMV7_NEON
-       bne     .LNEON
-#endif
-       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
-       stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
-       ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
-       sub     $Ktbl,r3,#256+32        @ K256
-       sub     sp,sp,#16*4             @ alloca(X[16])
-.Loop:
-# if __ARM_ARCH__>=7
-       ldr     $t1,[$inp],#4
-# else
-       ldrb    $t1,[$inp,#3]
-# endif
-       eor     $t3,$B,$C               @ magic
-       eor     $t2,$t2,$t2
-___
-for($i=0;$i<16;$i++)   { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
-$code.=".Lrounds_16_xx:\n";
-for (;$i<32;$i++)      { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
-#if __ARM_ARCH__>=7
-       ite     eq                      @ Thumb2 thing, sanity check in ARM
-#endif
-       ldreq   $t3,[sp,#16*4]          @ pull ctx
-       bne     .Lrounds_16_xx
-
-       add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
-       ldr     $t0,[$t3,#0]
-       ldr     $t1,[$t3,#4]
-       ldr     $t2,[$t3,#8]
-       add     $A,$A,$t0
-       ldr     $t0,[$t3,#12]
-       add     $B,$B,$t1
-       ldr     $t1,[$t3,#16]
-       add     $C,$C,$t2
-       ldr     $t2,[$t3,#20]
-       add     $D,$D,$t0
-       ldr     $t0,[$t3,#24]
-       add     $E,$E,$t1
-       ldr     $t1,[$t3,#28]
-       add     $F,$F,$t2
-       ldr     $inp,[sp,#17*4]         @ pull inp
-       ldr     $t2,[sp,#18*4]          @ pull inp+len
-       add     $G,$G,$t0
-       add     $H,$H,$t1
-       stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
-       cmp     $inp,$t2
-       sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
-       bne     .Loop
-
-       add     sp,sp,#`16+3`*4 @ destroy frame
-#if __ARM_ARCH__>=5
-       ldmia   sp!,{r4-r11,pc}
-#else
-       ldmia   sp!,{r4-r11,lr}
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       bx      lr                      @ interoperable with Thumb ISA:-)
-#endif
-.size  sha256_block_data_order,.-sha256_block_data_order
-___
-######################################################################
-# NEON stuff
-#
-{{{
-my @X=map("q$_",(0..3));
-my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
-my $Xfer=$t4;
-my $j=0;
-
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-
-sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
-  my $arg = pop;
-    $arg = "#$arg" if ($arg*1 eq $arg);
-    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-sub Xupdate()
-{ use integer;
-  my $body = shift;
-  my @insns = (&$body,&$body,&$body,&$body);
-  my ($a,$b,$c,$d,$e,$f,$g,$h);
-
-       &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vshr_u32       ($T2,$T0,$sigma0[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vshr_u32       ($T1,$T0,$sigma0[2]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vsli_32        ($T2,$T0,32-$sigma0[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vshr_u32       ($T3,$T0,$sigma0[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &veor           ($T1,$T1,$T2);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vsli_32        ($T3,$T0,32-$sigma0[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &veor         ($T5,$T5,$T4);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &veor         ($T5,$T5,$T4);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       ($T0,$T0,@X[0]);
-        while($#insns>=2) { eval(shift(@insns)); }
-       &vst1_32        ("{$T0}","[$Xfer,:128]!");
-        eval(shift(@insns));
-        eval(shift(@insns));
-
-       push(@X,shift(@X));             # "rotate" X[]
-}
-
-sub Xpreload()
-{ use integer;
-  my $body = shift;
-  my @insns = (&$body,&$body,&$body,&$body);
-  my ($a,$b,$c,$d,$e,$f,$g,$h);
-
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vrev32_8       (@X[0],@X[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       ($T0,$T0,@X[0]);
-        foreach (@insns) { eval; }     # remaining instructions
-       &vst1_32        ("{$T0}","[$Xfer,:128]!");
-
-       push(@X,shift(@X));             # "rotate" X[]
-}
-
-sub body_00_15 () {
-       (
-       '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
-       '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
-       '&eor   ($t1,$f,$g)',
-       '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
-       '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
-       '&and   ($t1,$t1,$e)',
-       '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
-       '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
-       '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
-       '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
-       '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
-       '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
-       '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
-       '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
-       '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
-       '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
-       '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
-       '&add   ($d,$d,$h)',                    # d+=h
-       '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
-       '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
-       '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
-       )
-}
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.arch  armv7-a
-.fpu   neon
-
-.global        sha256_block_data_order_neon
-.type  sha256_block_data_order_neon,%function
-.align 4
-sha256_block_data_order_neon:
-.LNEON:
-       stmdb   sp!,{r4-r12,lr}
-
-       sub     $H,sp,#16*4+16
-       adr     $Ktbl,.Lsha256_block_data_order
-       sub     $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
-       bic     $H,$H,#15               @ align for 128-bit stores
-       mov     $t2,sp
-       mov     sp,$H                   @ alloca
-       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
-
-       vld1.8          {@X[0]},[$inp]!
-       vld1.8          {@X[1]},[$inp]!
-       vld1.8          {@X[2]},[$inp]!
-       vld1.8          {@X[3]},[$inp]!
-       vld1.32         {$T0},[$Ktbl,:128]!
-       vld1.32         {$T1},[$Ktbl,:128]!
-       vld1.32         {$T2},[$Ktbl,:128]!
-       vld1.32         {$T3},[$Ktbl,:128]!
-       vrev32.8        @X[0],@X[0]             @ yes, even on
-       str             $ctx,[sp,#64]
-       vrev32.8        @X[1],@X[1]             @ big-endian
-       str             $inp,[sp,#68]
-       mov             $Xfer,sp
-       vrev32.8        @X[2],@X[2]
-       str             $len,[sp,#72]
-       vrev32.8        @X[3],@X[3]
-       str             $t2,[sp,#76]            @ save original sp
-       vadd.i32        $T0,$T0,@X[0]
-       vadd.i32        $T1,$T1,@X[1]
-       vst1.32         {$T0},[$Xfer,:128]!
-       vadd.i32        $T2,$T2,@X[2]
-       vst1.32         {$T1},[$Xfer,:128]!
-       vadd.i32        $T3,$T3,@X[3]
-       vst1.32         {$T2},[$Xfer,:128]!
-       vst1.32         {$T3},[$Xfer,:128]!
-
-       ldmia           $ctx,{$A-$H}
-       sub             $Xfer,$Xfer,#64
-       ldr             $t1,[sp,#0]
-       eor             $t2,$t2,$t2
-       eor             $t3,$B,$C
-       b               .L_00_48
-
-.align 4
-.L_00_48:
-___
-       &Xupdate(\&body_00_15);
-       &Xupdate(\&body_00_15);
-       &Xupdate(\&body_00_15);
-       &Xupdate(\&body_00_15);
-$code.=<<___;
-       teq     $t1,#0                          @ check for K256 terminator
-       ldr     $t1,[sp,#0]
-       sub     $Xfer,$Xfer,#64
-       bne     .L_00_48
-
-       ldr             $inp,[sp,#68]
-       ldr             $t0,[sp,#72]
-       sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
-       teq             $inp,$t0
-       it              eq
-       subeq           $inp,$inp,#64           @ avoid SEGV
-       vld1.8          {@X[0]},[$inp]!         @ load next input block
-       vld1.8          {@X[1]},[$inp]!
-       vld1.8          {@X[2]},[$inp]!
-       vld1.8          {@X[3]},[$inp]!
-       it              ne
-       strne           $inp,[sp,#68]
-       mov             $Xfer,sp
-___
-       &Xpreload(\&body_00_15);
-       &Xpreload(\&body_00_15);
-       &Xpreload(\&body_00_15);
-       &Xpreload(\&body_00_15);
-$code.=<<___;
-       ldr     $t0,[$t1,#0]
-       add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
-       ldr     $t2,[$t1,#4]
-       ldr     $t3,[$t1,#8]
-       ldr     $t4,[$t1,#12]
-       add     $A,$A,$t0                       @ accumulate
-       ldr     $t0,[$t1,#16]
-       add     $B,$B,$t2
-       ldr     $t2,[$t1,#20]
-       add     $C,$C,$t3
-       ldr     $t3,[$t1,#24]
-       add     $D,$D,$t4
-       ldr     $t4,[$t1,#28]
-       add     $E,$E,$t0
-       str     $A,[$t1],#4
-       add     $F,$F,$t2
-       str     $B,[$t1],#4
-       add     $G,$G,$t3
-       str     $C,[$t1],#4
-       add     $H,$H,$t4
-       str     $D,[$t1],#4
-       stmia   $t1,{$E-$H}
-
-       ittte   ne
-       movne   $Xfer,sp
-       ldrne   $t1,[sp,#0]
-       eorne   $t2,$t2,$t2
-       ldreq   sp,[sp,#76]                     @ restore original sp
-       itt     ne
-       eorne   $t3,$B,$C
-       bne     .L_00_48
-
-       ldmia   sp!,{r4-r12,pc}
-.size  sha256_block_data_order_neon,.-sha256_block_data_order_neon
-#endif
-___
-}}}
-######################################################################
-# ARMv8 stuff
-#
-{{{
-my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
-my @MSG=map("q$_",(8..11));
-my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
-my $Ktbl="r3";
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-
-# ifdef __thumb2__
-#  define INST(a,b,c,d)        .byte   c,d|0xc,a,b
-# else
-#  define INST(a,b,c,d)        .byte   a,b,c,d
-# endif
-
-.type  sha256_block_data_order_armv8,%function
-.align 5
-sha256_block_data_order_armv8:
-.LARMv8:
-       vld1.32 {$ABCD,$EFGH},[$ctx]
-# ifdef __thumb2__
-       adr     $Ktbl,.LARMv8
-       sub     $Ktbl,$Ktbl,#.LARMv8-K256
-# else
-       adrl    $Ktbl,K256
-# endif
-       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
-
-.Loop_v8:
-       vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
-       vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
-       vld1.32         {$W0},[$Ktbl]!
-       vrev32.8        @MSG[0],@MSG[0]
-       vrev32.8        @MSG[1],@MSG[1]
-       vrev32.8        @MSG[2],@MSG[2]
-       vrev32.8        @MSG[3],@MSG[3]
-       vmov            $ABCD_SAVE,$ABCD        @ offload
-       vmov            $EFGH_SAVE,$EFGH
-       teq             $inp,$len
-___
-for($i=0;$i<12;$i++) {
-$code.=<<___;
-       vld1.32         {$W1},[$Ktbl]!
-       vadd.i32        $W0,$W0,@MSG[0]
-       sha256su0       @MSG[0],@MSG[1]
-       vmov            $abcd,$ABCD
-       sha256h         $ABCD,$EFGH,$W0
-       sha256h2        $EFGH,$abcd,$W0
-       sha256su1       @MSG[0],@MSG[2],@MSG[3]
-___
-       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
-}
-$code.=<<___;
-       vld1.32         {$W1},[$Ktbl]!
-       vadd.i32        $W0,$W0,@MSG[0]
-       vmov            $abcd,$ABCD
-       sha256h         $ABCD,$EFGH,$W0
-       sha256h2        $EFGH,$abcd,$W0
-
-       vld1.32         {$W0},[$Ktbl]!
-       vadd.i32        $W1,$W1,@MSG[1]
-       vmov            $abcd,$ABCD
-       sha256h         $ABCD,$EFGH,$W1
-       sha256h2        $EFGH,$abcd,$W1
-
-       vld1.32         {$W1},[$Ktbl]
-       vadd.i32        $W0,$W0,@MSG[2]
-       sub             $Ktbl,$Ktbl,#256-16     @ rewind
-       vmov            $abcd,$ABCD
-       sha256h         $ABCD,$EFGH,$W0
-       sha256h2        $EFGH,$abcd,$W0
-
-       vadd.i32        $W1,$W1,@MSG[3]
-       vmov            $abcd,$ABCD
-       sha256h         $ABCD,$EFGH,$W1
-       sha256h2        $EFGH,$abcd,$W1
-
-       vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
-       vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
-       it              ne
-       bne             .Loop_v8
-
-       vst1.32         {$ABCD,$EFGH},[$ctx]
-
-       ret             @ bx lr
-.size  sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
-#endif
-___
-}}}
-$code.=<<___;
-.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm   OPENSSL_armcap_P,4,4
-#endif
-___
-
-open SELF,$0;
-while(<SELF>) {
-       next if (/^#!/);
-       last if (!s/^#/@/ and !/^$/);
-       print;
-}
-close SELF;
-
-{   my  %opcode = (
-       "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
-       "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );
-
-    sub unsha256 {
-       my ($mnemonic,$arg)=@_;
-
-       if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
-           my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
-                                        |(($2&7)<<17)|(($2&8)<<4)
-                                        |(($3&7)<<1) |(($3&8)<<2);
-           # since ARMv7 instructions are always encoded little-endian.
-           # correct solution is to use .inst directive, but older
-           # assemblers don't implement it:-(
-           sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
-                       $word&0xff,($word>>8)&0xff,
-                       ($word>>16)&0xff,($word>>24)&0xff,
-                       $mnemonic,$arg;
-       }
-    }
-}
-
-foreach (split($/,$code)) {
-
-       s/\`([^\`]*)\`/eval $1/geo;
-
-       s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
-
-       s/\bret\b/bx    lr/go           or
-       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
-
-       print $_,"\n";
-}
-
-close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c
deleted file mode 100644 (file)
index d04c4e6..0000000
+++ /dev/null
@@ -1,107 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
- * using optimized ARM assembler and NEON instructions.
- *
- * Copyright © 2015 Google Inc.
- *
- * This file is based on sha256_ssse3_glue.c:
- *   Copyright (C) 2013 Intel Corporation
- *   Author: Tim Chen <tim.c.chen@linux.intel.com>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha256_glue.h"
-
-asmlinkage void sha256_block_data_order(struct crypto_sha256_state *state,
-                                       const u8 *data, int num_blks);
-
-static int crypto_sha256_arm_update(struct shash_desc *desc, const u8 *data,
-                                   unsigned int len)
-{
-       /* make sure casting to sha256_block_fn() is safe */
-       BUILD_BUG_ON(offsetof(struct crypto_sha256_state, state) != 0);
-
-       return sha256_base_do_update_blocks(desc, data, len,
-                                           sha256_block_data_order);
-}
-
-static int crypto_sha256_arm_finup(struct shash_desc *desc, const u8 *data,
-                                  unsigned int len, u8 *out)
-{
-       sha256_base_do_finup(desc, data, len, sha256_block_data_order);
-       return sha256_base_finish(desc, out);
-}
-
-static struct shash_alg algs[] = { {
-       .digestsize     =       SHA256_DIGEST_SIZE,
-       .init           =       sha256_base_init,
-       .update         =       crypto_sha256_arm_update,
-       .finup          =       crypto_sha256_arm_finup,
-       .descsize       =       sizeof(struct crypto_sha256_state),
-       .base           =       {
-               .cra_name       =       "sha256",
-               .cra_driver_name =      "sha256-asm",
-               .cra_priority   =       150,
-               .cra_flags      =       CRYPTO_AHASH_ALG_BLOCK_ONLY |
-                                       CRYPTO_AHASH_ALG_FINUP_MAX,
-               .cra_blocksize  =       SHA256_BLOCK_SIZE,
-               .cra_module     =       THIS_MODULE,
-       }
-}, {
-       .digestsize     =       SHA224_DIGEST_SIZE,
-       .init           =       sha224_base_init,
-       .update         =       crypto_sha256_arm_update,
-       .finup          =       crypto_sha256_arm_finup,
-       .descsize       =       sizeof(struct crypto_sha256_state),
-       .base           =       {
-               .cra_name       =       "sha224",
-               .cra_driver_name =      "sha224-asm",
-               .cra_priority   =       150,
-               .cra_flags      =       CRYPTO_AHASH_ALG_BLOCK_ONLY |
-                                       CRYPTO_AHASH_ALG_FINUP_MAX,
-               .cra_blocksize  =       SHA224_BLOCK_SIZE,
-               .cra_module     =       THIS_MODULE,
-       }
-} };
-
-static int __init sha256_mod_init(void)
-{
-       int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
-
-       if (res < 0)
-               return res;
-
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
-               res = crypto_register_shashes(sha256_neon_algs,
-                                             ARRAY_SIZE(sha256_neon_algs));
-
-               if (res < 0)
-                       crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-       }
-
-       return res;
-}
-
-static void __exit sha256_mod_fini(void)
-{
-       crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
-               crypto_unregister_shashes(sha256_neon_algs,
-                                         ARRAY_SIZE(sha256_neon_algs));
-}
-
-module_init(sha256_mod_init);
-module_exit(sha256_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
-
-MODULE_ALIAS_CRYPTO("sha256");
diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h
deleted file mode 100644 (file)
index 9881c9a..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _CRYPTO_SHA256_GLUE_H
-#define _CRYPTO_SHA256_GLUE_H
-
-#include <crypto/hash.h>
-
-extern struct shash_alg sha256_neon_algs[2];
-
-#endif /* _CRYPTO_SHA256_GLUE_H */
diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c
deleted file mode 100644 (file)
index 76eb3cd..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
- * using NEON instructions.
- *
- * Copyright © 2015 Google Inc.
- *
- * This file is based on sha512_neon_glue.c:
- *   Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha256_glue.h"
-
-asmlinkage void sha256_block_data_order_neon(
-       struct crypto_sha256_state *digest, const u8 *data, int num_blks);
-
-static int crypto_sha256_neon_update(struct shash_desc *desc, const u8 *data,
-                                    unsigned int len)
-{
-       int remain;
-
-       kernel_neon_begin();
-       remain = sha256_base_do_update_blocks(desc, data, len,
-                                             sha256_block_data_order_neon);
-       kernel_neon_end();
-       return remain;
-}
-
-static int crypto_sha256_neon_finup(struct shash_desc *desc, const u8 *data,
-                                   unsigned int len, u8 *out)
-{
-       kernel_neon_begin();
-       sha256_base_do_finup(desc, data, len, sha256_block_data_order_neon);
-       kernel_neon_end();
-       return sha256_base_finish(desc, out);
-}
-
-struct shash_alg sha256_neon_algs[] = { {
-       .digestsize     =       SHA256_DIGEST_SIZE,
-       .init           =       sha256_base_init,
-       .update         =       crypto_sha256_neon_update,
-       .finup          =       crypto_sha256_neon_finup,
-       .descsize       =       sizeof(struct crypto_sha256_state),
-       .base           =       {
-               .cra_name       =       "sha256",
-               .cra_driver_name =      "sha256-neon",
-               .cra_priority   =       250,
-               .cra_flags      =       CRYPTO_AHASH_ALG_BLOCK_ONLY |
-                                       CRYPTO_AHASH_ALG_FINUP_MAX,
-               .cra_blocksize  =       SHA256_BLOCK_SIZE,
-               .cra_module     =       THIS_MODULE,
-       }
-}, {
-       .digestsize     =       SHA224_DIGEST_SIZE,
-       .init           =       sha224_base_init,
-       .update         =       crypto_sha256_neon_update,
-       .finup          =       crypto_sha256_neon_finup,
-       .descsize       =       sizeof(struct crypto_sha256_state),
-       .base           =       {
-               .cra_name       =       "sha224",
-               .cra_driver_name =      "sha224-neon",
-               .cra_priority   =       250,
-               .cra_flags      =       CRYPTO_AHASH_ALG_BLOCK_ONLY |
-                                       CRYPTO_AHASH_ALG_FINUP_MAX,
-               .cra_blocksize  =       SHA224_BLOCK_SIZE,
-               .cra_module     =       THIS_MODULE,
-       }
-} };
index 0d47d4f21c6de9593c262189c3d4c3c16564009e..12d74d8b03d0aa791e0d6ece8cd0d438eea9bc67 100644 (file)
@@ -1,2 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 poly1305-core.S
+sha256-core.S
index e8444fd0aae3036e859ecba0a3bcc3eb5fd7c511..9f3ff30f4032868d0c6327396026477f6a53a0f2 100644 (file)
@@ -22,3 +22,9 @@ config CRYPTO_POLY1305_ARM
        tristate
        default CRYPTO_LIB_POLY1305
        select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
+config CRYPTO_SHA256_ARM
+       tristate
+       depends on !CPU_V7M
+       default CRYPTO_LIB_SHA256
+       select CRYPTO_ARCH_HAVE_LIB_SHA256
index 4c042a4c77ed6e8d6ef4f026b86edc44e68c4e6b..431f77c3ff6fd5ff8b57683c2fc1823f65868cd5 100644 (file)
@@ -10,13 +10,17 @@ chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
 obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 poly1305-arm-y := poly1305-core.o poly1305-glue.o
 
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
+sha256-arm-y := sha256.o sha256-core.o
+sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
+
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
 
 $(obj)/%-core.S: $(src)/%-armv4.pl
        $(call cmd,perl)
 
-clean-files += poly1305-core.S
+clean-files += poly1305-core.S sha256-core.S
 
 aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
 
@@ -24,3 +28,5 @@ aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
 poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
 poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
 AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
+
+AFLAGS_sha256-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/lib/crypto/sha256-armv4.pl b/arch/arm/lib/crypto/sha256-armv4.pl
new file mode 100644 (file)
index 0000000..f3a2b54
--- /dev/null
@@ -0,0 +1,724 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";     $t0="r0";
+$inp="r1";     $t4="r1";
+$len="r2";     $t1="r2";
+$T1="r3";      $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+       @ ldr   $t1,[$inp],#4                   @ $i
+# if $i==15
+       str     $inp,[sp,#17*4]                 @ make room for $t4
+# endif
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     $t1,$t1
+# endif
+#else
+       @ ldrb  $t1,[$inp,#3]                   @ $i
+       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
+       ldrb    $t2,[$inp,#2]
+       ldrb    $t0,[$inp,#1]
+       orr     $t1,$t1,$t2,lsl#8
+       ldrb    $t2,[$inp],#4
+       orr     $t1,$t1,$t0,lsl#16
+# if $i==15
+       str     $inp,[sp,#17*4]                 @ make room for $t4
+# endif
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+       orr     $t1,$t1,$t2,lsl#24
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+       ldr     $t2,[$Ktbl],#4                  @ *K256++
+       add     $h,$h,$t1                       @ h+=X[i]
+       str     $t1,[sp,#`$i%16`*4]
+       eor     $t1,$f,$g
+       add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
+       and     $t1,$t1,$e
+       add     $h,$h,$t2                       @ h+=K256[i]
+       eor     $t1,$t1,$g                      @ Ch(e,f,g)
+       eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+       add     $h,$h,$t1                       @ h+=Ch(e,f,g)
+#if $i==31
+       and     $t2,$t2,#0xff
+       cmp     $t2,#0xf2                       @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+       ldr     $t1,[$inp],#4                   @ prefetch
+# else
+       ldrb    $t1,[$inp,#3]
+# endif
+       eor     $t2,$a,$b                       @ a^b, b^c in next round
+#else
+       ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
+       eor     $t2,$a,$b                       @ a^b, b^c in next round
+       ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
+#endif
+       eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
+       and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
+       add     $d,$d,$h                        @ d+=h
+       eor     $t3,$t3,$b                      @ Maj(a,b,c)
+       add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
+       @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
+___
+       ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+       @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
+       @ ldr   $t4,[sp,#`($i+14)%16`*4]
+       mov     $t0,$t1,ror#$sigma0[0]
+       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
+       mov     $t2,$t4,ror#$sigma1[0]
+       eor     $t0,$t0,$t1,ror#$sigma0[1]
+       eor     $t2,$t2,$t4,ror#$sigma1[1]
+       eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
+       ldr     $t1,[sp,#`($i+0)%16`*4]
+       eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
+       ldr     $t4,[sp,#`($i+9)%16`*4]
+
+       add     $t2,$t2,$t0
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
+       add     $t1,$t1,$t2
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
+       add     $t1,$t1,$t4                     @ X[i]
+___
+       &BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code  32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type  K256,%object
+.align 5
+K256:
+.word  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size  K256,.-K256
+.word  0                               @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word  OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global        sha256_block_data_order
+.type  sha256_block_data_order,%function
+sha256_block_data_order:
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7
+       sub     r3,pc,#8                @ sha256_block_data_order
+#else
+       adr     r3,.Lsha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+       ldr     r12,.LOPENSSL_armcap
+       ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
+       tst     r12,#ARMV8_SHA256
+       bne     .LARMv8
+       tst     r12,#ARMV7_NEON
+       bne     .LNEON
+#endif
+       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
+       stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
+       ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+       sub     $Ktbl,r3,#256+32        @ K256
+       sub     sp,sp,#16*4             @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+       ldr     $t1,[$inp],#4
+# else
+       ldrb    $t1,[$inp,#3]
+# endif
+       eor     $t3,$B,$C               @ magic
+       eor     $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++)   { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++)      { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+       ite     eq                      @ Thumb2 thing, sanity check in ARM
+#endif
+       ldreq   $t3,[sp,#16*4]          @ pull ctx
+       bne     .Lrounds_16_xx
+
+       add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
+       ldr     $t0,[$t3,#0]
+       ldr     $t1,[$t3,#4]
+       ldr     $t2,[$t3,#8]
+       add     $A,$A,$t0
+       ldr     $t0,[$t3,#12]
+       add     $B,$B,$t1
+       ldr     $t1,[$t3,#16]
+       add     $C,$C,$t2
+       ldr     $t2,[$t3,#20]
+       add     $D,$D,$t0
+       ldr     $t0,[$t3,#24]
+       add     $E,$E,$t1
+       ldr     $t1,[$t3,#28]
+       add     $F,$F,$t2
+       ldr     $inp,[sp,#17*4]         @ pull inp
+       ldr     $t2,[sp,#18*4]          @ pull inp+len
+       add     $G,$G,$t0
+       add     $H,$H,$t1
+       stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+       cmp     $inp,$t2
+       sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
+       bne     .Loop
+
+       add     sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r11,pc}
+#else
+       ldmia   sp!,{r4-r11,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+       &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T2,$T0,$sigma0[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T1,$T0,$sigma0[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vsli_32        ($T2,$T0,32-$sigma0[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T3,$T0,$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &veor           ($T1,$T1,$T2);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vsli_32        ($T3,$T0,32-$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       ($T0,$T0,@X[0]);
+        while($#insns>=2) { eval(shift(@insns)); }
+       &vst1_32        ("{$T0}","[$Xfer,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vrev32_8       (@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       ($T0,$T0,@X[0]);
+        foreach (@insns) { eval; }     # remaining instructions
+       &vst1_32        ("{$T0}","[$Xfer,:128]!");
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub body_00_15 () {
+       (
+       '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+       '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
+       '&eor   ($t1,$f,$g)',
+       '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+       '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
+       '&and   ($t1,$t1,$e)',
+       '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
+       '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+       '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
+       '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
+       '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
+       '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
+       '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
+       '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
+       '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
+       '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
+       '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
+       '&add   ($d,$d,$h)',                    # d+=h
+       '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+       '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
+       '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+       )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch  armv7-a
+.fpu   neon
+
+.global        sha256_block_data_order_neon
+.type  sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+       stmdb   sp!,{r4-r12,lr}
+
+       sub     $H,sp,#16*4+16
+       adr     $Ktbl,.Lsha256_block_data_order
+       sub     $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
+       bic     $H,$H,#15               @ align for 128-bit stores
+       mov     $t2,sp
+       mov     sp,$H                   @ alloca
+       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
+
+       vld1.8          {@X[0]},[$inp]!
+       vld1.8          {@X[1]},[$inp]!
+       vld1.8          {@X[2]},[$inp]!
+       vld1.8          {@X[3]},[$inp]!
+       vld1.32         {$T0},[$Ktbl,:128]!
+       vld1.32         {$T1},[$Ktbl,:128]!
+       vld1.32         {$T2},[$Ktbl,:128]!
+       vld1.32         {$T3},[$Ktbl,:128]!
+       vrev32.8        @X[0],@X[0]             @ yes, even on
+       str             $ctx,[sp,#64]
+       vrev32.8        @X[1],@X[1]             @ big-endian
+       str             $inp,[sp,#68]
+       mov             $Xfer,sp
+       vrev32.8        @X[2],@X[2]
+       str             $len,[sp,#72]
+       vrev32.8        @X[3],@X[3]
+       str             $t2,[sp,#76]            @ save original sp
+       vadd.i32        $T0,$T0,@X[0]
+       vadd.i32        $T1,$T1,@X[1]
+       vst1.32         {$T0},[$Xfer,:128]!
+       vadd.i32        $T2,$T2,@X[2]
+       vst1.32         {$T1},[$Xfer,:128]!
+       vadd.i32        $T3,$T3,@X[3]
+       vst1.32         {$T2},[$Xfer,:128]!
+       vst1.32         {$T3},[$Xfer,:128]!
+
+       ldmia           $ctx,{$A-$H}
+       sub             $Xfer,$Xfer,#64
+       ldr             $t1,[sp,#0]
+       eor             $t2,$t2,$t2
+       eor             $t3,$B,$C
+       b               .L_00_48
+
+.align 4
+.L_00_48:
+___
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+$code.=<<___;
+       teq     $t1,#0                          @ check for K256 terminator
+       ldr     $t1,[sp,#0]
+       sub     $Xfer,$Xfer,#64
+       bne     .L_00_48
+
+       ldr             $inp,[sp,#68]
+       ldr             $t0,[sp,#72]
+       sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
+       teq             $inp,$t0
+       it              eq
+       subeq           $inp,$inp,#64           @ avoid SEGV
+       vld1.8          {@X[0]},[$inp]!         @ load next input block
+       vld1.8          {@X[1]},[$inp]!
+       vld1.8          {@X[2]},[$inp]!
+       vld1.8          {@X[3]},[$inp]!
+       it              ne
+       strne           $inp,[sp,#68]
+       mov             $Xfer,sp
+___
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+$code.=<<___;
+       ldr     $t0,[$t1,#0]
+       add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
+       ldr     $t2,[$t1,#4]
+       ldr     $t3,[$t1,#8]
+       ldr     $t4,[$t1,#12]
+       add     $A,$A,$t0                       @ accumulate
+       ldr     $t0,[$t1,#16]
+       add     $B,$B,$t2
+       ldr     $t2,[$t1,#20]
+       add     $C,$C,$t3
+       ldr     $t3,[$t1,#24]
+       add     $D,$D,$t4
+       ldr     $t4,[$t1,#28]
+       add     $E,$E,$t0
+       str     $A,[$t1],#4
+       add     $F,$F,$t2
+       str     $B,[$t1],#4
+       add     $G,$G,$t3
+       str     $C,[$t1],#4
+       add     $H,$H,$t4
+       str     $D,[$t1],#4
+       stmia   $t1,{$E-$H}
+
+       ittte   ne
+       movne   $Xfer,sp
+       ldrne   $t1,[sp,#0]
+       eorne   $t2,$t2,$t2
+       ldreq   sp,[sp,#76]                     @ restore original sp
+       itt     ne
+       eorne   $t3,$B,$C
+       bne     .L_00_48
+
+       ldmia   sp!,{r4-r12,pc}
+.size  sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)        .byte   c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)        .byte   a,b,c,d
+# endif
+
+.type  sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+       vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+       adr     $Ktbl,.LARMv8
+       sub     $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+       adrl    $Ktbl,K256
+# endif
+       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
+
+.Loop_v8:
+       vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
+       vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
+       vld1.32         {$W0},[$Ktbl]!
+       vrev32.8        @MSG[0],@MSG[0]
+       vrev32.8        @MSG[1],@MSG[1]
+       vrev32.8        @MSG[2],@MSG[2]
+       vrev32.8        @MSG[3],@MSG[3]
+       vmov            $ABCD_SAVE,$ABCD        @ offload
+       vmov            $EFGH_SAVE,$EFGH
+       teq             $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+       vld1.32         {$W1},[$Ktbl]!
+       vadd.i32        $W0,$W0,@MSG[0]
+       sha256su0       @MSG[0],@MSG[1]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+       sha256su1       @MSG[0],@MSG[2],@MSG[3]
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+       vld1.32         {$W1},[$Ktbl]!
+       vadd.i32        $W0,$W0,@MSG[0]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+
+       vld1.32         {$W0},[$Ktbl]!
+       vadd.i32        $W1,$W1,@MSG[1]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W1
+       sha256h2        $EFGH,$abcd,$W1
+
+       vld1.32         {$W1},[$Ktbl]
+       vadd.i32        $W0,$W0,@MSG[2]
+       sub             $Ktbl,$Ktbl,#256-16     @ rewind
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+
+       vadd.i32        $W1,$W1,@MSG[3]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W1
+       sha256h2        $EFGH,$abcd,$W1
+
+       vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
+       vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
+       it              ne
+       bne             .Loop_v8
+
+       vst1.32         {$ABCD,$EFGH},[$ctx]
+
+       ret             @ bx lr
+.size  sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+       next if (/^#!/);
+       last if (!s/^#/@/ and !/^$/);
+       print;
+}
+close SELF;
+
+{   my  %opcode = (
+       "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
+       "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );
+
+    sub unsha256 {
+       my ($mnemonic,$arg)=@_;
+
+       if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+           my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+                                        |(($2&7)<<17)|(($2&8)<<4)
+                                        |(($3&7)<<1) |(($3&8)<<2);
+           # since ARMv7 instructions are always encoded little-endian.
+           # correct solution is to use .inst directive, but older
+           # assemblers don't implement it:-(
+           sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+                       $word&0xff,($word>>8)&0xff,
+                       ($word>>16)&0xff,($word>>24)&0xff,
+                       $mnemonic,$arg;
+       }
+    }
+}
+
+foreach (split($/,$code)) {
+
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+       s/\bret\b/bx    lr/go           or
+       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
+
+       print $_,"\n";
+}
+
+close STDOUT; # enforce flush
diff --git a/arch/arm/lib/crypto/sha256-ce.S b/arch/arm/lib/crypto/sha256-ce.S
new file mode 100644 (file)
index 0000000..ac2c9b0
--- /dev/null
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       .text
+       .arch           armv8-a
+       .fpu            crypto-neon-fp-armv8
+
+       k0              .req    q7
+       k1              .req    q8
+       rk              .req    r3
+
+       ta0             .req    q9
+       ta1             .req    q10
+       tb0             .req    q10
+       tb1             .req    q9
+
+       dga             .req    q11
+       dgb             .req    q12
+
+       dg0             .req    q13
+       dg1             .req    q14
+       dg2             .req    q15
+
+       .macro          add_only, ev, s0
+       vmov            dg2, dg0
+       .ifnb           \s0
+       vld1.32         {k\ev}, [rk, :128]!
+       .endif
+       sha256h.32      dg0, dg1, tb\ev
+       sha256h2.32     dg1, dg2, tb\ev
+       .ifnb           \s0
+       vadd.u32        ta\ev, q\s0, k\ev
+       .endif
+       .endm
+
+       .macro          add_update, ev, s0, s1, s2, s3
+       sha256su0.32    q\s0, q\s1
+       add_only        \ev, \s1
+       sha256su1.32    q\s0, q\s2, q\s3
+       .endm
+
+       .align          6
+.Lsha256_rcon:
+       .word           0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+       .word           0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+       .word           0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+       .word           0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+       .word           0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+       .word           0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+       .word           0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+       .word           0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+       .word           0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+       .word           0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+       .word           0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+       .word           0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+       .word           0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+       .word           0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+       .word           0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+       .word           0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+       /*
+        * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+        *                          const u8 *data, size_t nblocks);
+        */
+ENTRY(sha256_ce_transform)
+       /* load state */
+       vld1.32         {dga-dgb}, [r0]
+
+       /* load input */
+0:     vld1.32         {q0-q1}, [r1]!
+       vld1.32         {q2-q3}, [r1]!
+       subs            r2, r2, #1
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+       vrev32.8        q0, q0
+       vrev32.8        q1, q1
+       vrev32.8        q2, q2
+       vrev32.8        q3, q3
+#endif
+
+       /* load first round constant */
+       adr             rk, .Lsha256_rcon
+       vld1.32         {k0}, [rk, :128]!
+
+       vadd.u32        ta0, q0, k0
+       vmov            dg0, dga
+       vmov            dg1, dgb
+
+       add_update      1, 0, 1, 2, 3
+       add_update      0, 1, 2, 3, 0
+       add_update      1, 2, 3, 0, 1
+       add_update      0, 3, 0, 1, 2
+       add_update      1, 0, 1, 2, 3
+       add_update      0, 1, 2, 3, 0
+       add_update      1, 2, 3, 0, 1
+       add_update      0, 3, 0, 1, 2
+       add_update      1, 0, 1, 2, 3
+       add_update      0, 1, 2, 3, 0
+       add_update      1, 2, 3, 0, 1
+       add_update      0, 3, 0, 1, 2
+
+       add_only        1, 1
+       add_only        0, 2
+       add_only        1, 3
+       add_only        0
+
+       /* update state */
+       vadd.u32        dga, dga, dg0
+       vadd.u32        dgb, dgb, dg1
+       bne             0b
+
+       /* store new state */
+       vst1.32         {dga-dgb}, [r0]
+       bx              lr
+ENDPROC(sha256_ce_transform)
diff --git a/arch/arm/lib/crypto/sha256.c b/arch/arm/lib/crypto/sha256.c
new file mode 100644 (file)
index 0000000..3a8dfc3
--- /dev/null
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <crypto/internal/sha2.h>
+#include <crypto/internal/simd.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS],
+                                       const u8 *data, size_t nblocks);
+asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
+                                            const u8 *data, size_t nblocks);
+asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+                                   const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+                       const u8 *data, size_t nblocks)
+{
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+           static_branch_likely(&have_neon) && crypto_simd_usable()) {
+               kernel_neon_begin();
+               if (static_branch_likely(&have_ce))
+                       sha256_ce_transform(state, data, nblocks);
+               else
+                       sha256_block_data_order_neon(state, data, nblocks);
+               kernel_neon_end();
+       } else {
+               sha256_block_data_order(state, data, nblocks);
+       }
+}
+EXPORT_SYMBOL(sha256_blocks_arch);
+
+bool sha256_is_arch_optimized(void)
+{
+       /* We can always use at least the ARM scalar implementation. */
+       return true;
+}
+EXPORT_SYMBOL(sha256_is_arch_optimized);
+
+static int __init sha256_arm_mod_init(void)
+{
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+               static_branch_enable(&have_neon);
+               if (elf_hwcap2 & HWCAP2_SHA2)
+                       static_branch_enable(&have_ce);
+       }
+       return 0;
+}
+arch_initcall(sha256_arm_mod_init);
+
+static void __exit sha256_arm_mod_exit(void)
+{
+}
+module_exit(sha256_arm_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 optimized for ARM");
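
For illustration, a minimal sketch of a caller that reaches the new code path purely through the SHA-256 library: sha256() and SHA256_DIGEST_SIZE are the existing <crypto/sha2.h> interface, while the demo module itself is hypothetical and not part of this patch. Block processing inside the library lands in sha256_blocks_arch() above, which picks the scalar, NEON, or CE implementation at runtime.

#include <crypto/sha2.h>
#include <linux/init.h>
#include <linux/module.h>

static int __init sha256_lib_demo_init(void)
{
        static const u8 msg[] = "abc";
        u8 digest[SHA256_DIGEST_SIZE];

        /* One-shot library helper; no crypto_shash TFM is needed. */
        sha256(msg, sizeof(msg) - 1, digest);
        return 0;
}
module_init(sha256_lib_demo_init);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Hypothetical demo of the SHA-256 library API");

The incremental interface (sha256_init()/sha256_update()/sha256_final()) goes through the same sha256_blocks_arch() path, so it is arch-optimized as well.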