CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA256_ARM=m
CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_CHACHA20_NEON=m
CONFIG_CRYPTO_GHASH_ARM_CE=m
CONFIG_CRYPTO_SHA1_ARM_NEON=m
CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA2_ARM_CE=m
CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_GHASH_ARM_CE=m
CONFIG_CRYPTO_SHA1_ARM_NEON=m
CONFIG_CRYPTO_SHA1_ARM_CE=m
-CONFIG_CRYPTO_SHA2_ARM_CE=m
CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_MICHAEL_MIC=y
CONFIG_CRYPTO_GHASH_ARM_CE=m
CONFIG_CRYPTO_SHA1_ARM_NEON=m
-CONFIG_CRYPTO_SHA256_ARM=m
CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_CRYPTO_AES_ARM_BS=m
CONFIG_CRYPTO_DEFLATE=y
CONFIG_CRYPTO_LZO=y
CONFIG_CRYPTO_SHA1_ARM=m
-CONFIG_CRYPTO_SHA256_ARM=m
CONFIG_CRYPTO_SHA512_ARM=m
CONFIG_CRYPTO_AES_ARM=m
CONFIG_FONTS=y
Architecture: arm using ARMv8 Crypto Extensions
-config CRYPTO_SHA2_ARM_CE
- tristate "Hash functions: SHA-224 and SHA-256 (ARMv8 Crypto Extensions)"
- depends on KERNEL_MODE_NEON
- select CRYPTO_SHA256_ARM
- select CRYPTO_HASH
- help
- SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
- Architecture: arm using
- - ARMv8 Crypto Extensions
-
-config CRYPTO_SHA256_ARM
- tristate "Hash functions: SHA-224 and SHA-256 (NEON)"
- select CRYPTO_HASH
- depends on !CPU_V7M
- help
- SHA-224 and SHA-256 secure hash algorithms (FIPS 180)
-
- Architecture: arm using
- - NEON (Advanced SIMD) extensions
-
config CRYPTO_SHA512_ARM
tristate "Hash functions: SHA-384 and SHA-512 (NEON)"
select CRYPTO_HASH
obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
-obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o
aes-arm-y := aes-cipher-core.o aes-cipher-glue.o
aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
sha1-arm-y := sha1-armv4-large.o sha1_glue.o
sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
-sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
-sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
-sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
$(obj)/%-core.S: $(src)/%-armv4.pl
$(call cmd,perl)
-clean-files += sha256-core.S sha512-core.S
+clean-files += sha512-core.S
aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
-AFLAGS_sha256-core.o += $(aflags-thumb2-y)
AFLAGS_sha512-core.o += $(aflags-thumb2-y)
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha2-ce-core.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
- .text
- .arch armv8-a
- .fpu crypto-neon-fp-armv8
-
- k0 .req q7
- k1 .req q8
- rk .req r3
-
- ta0 .req q9
- ta1 .req q10
- tb0 .req q10
- tb1 .req q9
-
- dga .req q11
- dgb .req q12
-
- dg0 .req q13
- dg1 .req q14
- dg2 .req q15
-
- .macro add_only, ev, s0
- vmov dg2, dg0
- .ifnb \s0
- vld1.32 {k\ev}, [rk, :128]!
- .endif
- sha256h.32 dg0, dg1, tb\ev
- sha256h2.32 dg1, dg2, tb\ev
- .ifnb \s0
- vadd.u32 ta\ev, q\s0, k\ev
- .endif
- .endm
-
- .macro add_update, ev, s0, s1, s2, s3
- sha256su0.32 q\s0, q\s1
- add_only \ev, \s1
- sha256su1.32 q\s0, q\s2, q\s3
- .endm
-
- .align 6
-.Lsha256_rcon:
- .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
- .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
- .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
- .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
- .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
- .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
- .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
- .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
- .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
- .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
- .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
- .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
- .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
- .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
- .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
- .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-
- /*
- * void sha2_ce_transform(struct sha256_state *sst, u8 const *src,
- int blocks);
- */
-ENTRY(sha2_ce_transform)
- /* load state */
- vld1.32 {dga-dgb}, [r0]
-
- /* load input */
-0: vld1.32 {q0-q1}, [r1]!
- vld1.32 {q2-q3}, [r1]!
- subs r2, r2, #1
-
-#ifndef CONFIG_CPU_BIG_ENDIAN
- vrev32.8 q0, q0
- vrev32.8 q1, q1
- vrev32.8 q2, q2
- vrev32.8 q3, q3
-#endif
-
- /* load first round constant */
- adr rk, .Lsha256_rcon
- vld1.32 {k0}, [rk, :128]!
-
- vadd.u32 ta0, q0, k0
- vmov dg0, dga
- vmov dg1, dgb
-
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
- add_update 1, 0, 1, 2, 3
- add_update 0, 1, 2, 3, 0
- add_update 1, 2, 3, 0, 1
- add_update 0, 3, 0, 1, 2
-
- add_only 1, 1
- add_only 0, 2
- add_only 1, 3
- add_only 0
-
- /* update state */
- vadd.u32 dga, dga, dg0
- vadd.u32 dgb, dgb, dg1
- bne 0b
-
- /* store new state */
- vst1.32 {dga-dgb}, [r0]
- bx lr
-ENDPROC(sha2_ce_transform)
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/cpufeature.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
-
-asmlinkage void sha2_ce_transform(struct crypto_sha256_state *sst,
- u8 const *src, int blocks);
-
-static int sha2_ce_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha256_base_do_update_blocks(desc, data, len,
- sha2_ce_transform);
- kernel_neon_end();
- return remain;
-}
-
-static int sha2_ce_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha256_base_do_finup(desc, data, len, sha2_ce_transform);
- kernel_neon_end();
- return sha256_base_finish(desc, out);
-}
-
-static struct shash_alg algs[] = { {
- .init = sha224_base_init,
- .update = sha2_ce_update,
- .finup = sha2_ce_finup,
- .descsize = sizeof(struct crypto_sha256_state),
- .digestsize = SHA224_DIGEST_SIZE,
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-ce",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .init = sha256_base_init,
- .update = sha2_ce_update,
- .finup = sha2_ce_finup,
- .descsize = sizeof(struct crypto_sha256_state),
- .digestsize = SHA256_DIGEST_SIZE,
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-ce",
- .cra_priority = 300,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int __init sha2_ce_mod_init(void)
-{
- return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha2_ce_mod_fini(void)
-{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_cpu_feature_match(SHA2, sha2_ce_mod_init);
-module_exit(sha2_ce_mod_fini);
+++ /dev/null
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# SHA256 block procedure for ARMv4. May 2007.
-
-# Performance is ~2x better than gcc 3.4 generated code and in "abso-
-# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
-# byte [on single-issue Xscale PXA250 core].
-
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 22% improvement on
-# Cortex A8 core and ~20 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-# September 2013.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process one
-# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-# code (meaning that latter performs sub-optimally, nothing was done
-# about it).
-
-# May 2014.
-#
-# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$ctx="r0"; $t0="r0";
-$inp="r1"; $t4="r1";
-$len="r2"; $t1="r2";
-$T1="r3"; $t3="r3";
-$A="r4";
-$B="r5";
-$C="r6";
-$D="r7";
-$E="r8";
-$F="r9";
-$G="r10";
-$H="r11";
-@V=($A,$B,$C,$D,$E,$F,$G,$H);
-$t2="r12";
-$Ktbl="r14";
-
-@Sigma0=( 2,13,22);
-@Sigma1=( 6,11,25);
-@sigma0=( 7,18, 3);
-@sigma1=(17,19,10);
-
-sub BODY_00_15 {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___ if ($i<16);
-#if __ARM_ARCH__>=7
- @ ldr $t1,[$inp],#4 @ $i
-# if $i==15
- str $inp,[sp,#17*4] @ make room for $t4
-# endif
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
-# ifndef __ARMEB__
- rev $t1,$t1
-# endif
-#else
- @ ldrb $t1,[$inp,#3] @ $i
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- ldrb $t2,[$inp,#2]
- ldrb $t0,[$inp,#1]
- orr $t1,$t1,$t2,lsl#8
- ldrb $t2,[$inp],#4
- orr $t1,$t1,$t0,lsl#16
-# if $i==15
- str $inp,[sp,#17*4] @ make room for $t4
-# endif
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
- orr $t1,$t1,$t2,lsl#24
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
-#endif
-___
-$code.=<<___;
- ldr $t2,[$Ktbl],#4 @ *K256++
- add $h,$h,$t1 @ h+=X[i]
- str $t1,[sp,#`$i%16`*4]
- eor $t1,$f,$g
- add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
- and $t1,$t1,$e
- add $h,$h,$t2 @ h+=K256[i]
- eor $t1,$t1,$g @ Ch(e,f,g)
- eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
- add $h,$h,$t1 @ h+=Ch(e,f,g)
-#if $i==31
- and $t2,$t2,#0xff
- cmp $t2,#0xf2 @ done?
-#endif
-#if $i<15
-# if __ARM_ARCH__>=7
- ldr $t1,[$inp],#4 @ prefetch
-# else
- ldrb $t1,[$inp,#3]
-# endif
- eor $t2,$a,$b @ a^b, b^c in next round
-#else
- ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
- eor $t2,$a,$b @ a^b, b^c in next round
- ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
-#endif
- eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
- and $t3,$t3,$t2 @ (b^c)&=(a^b)
- add $d,$d,$h @ d+=h
- eor $t3,$t3,$b @ Maj(a,b,c)
- add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
- @ add $h,$h,$t3 @ h+=Maj(a,b,c)
-___
- ($t2,$t3)=($t3,$t2);
-}
-
-sub BODY_16_XX {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___;
- @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
- @ ldr $t4,[sp,#`($i+14)%16`*4]
- mov $t0,$t1,ror#$sigma0[0]
- add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
- mov $t2,$t4,ror#$sigma1[0]
- eor $t0,$t0,$t1,ror#$sigma0[1]
- eor $t2,$t2,$t4,ror#$sigma1[1]
- eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
- ldr $t1,[sp,#`($i+0)%16`*4]
- eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
- ldr $t4,[sp,#`($i+9)%16`*4]
-
- add $t2,$t2,$t0
- eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
- add $t1,$t1,$t2
- eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
- add $t1,$t1,$t4 @ X[i]
-___
- &BODY_00_15(@_);
-}
-
-$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code 32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
-.code 32
-# endif
-#endif
-
-.type K256,%object
-.align 5
-K256:
-.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.size K256,.-K256
-.word 0 @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-sha256_block_data_order
-#endif
-.align 5
-
-.global sha256_block_data_order
-.type sha256_block_data_order,%function
-sha256_block_data_order:
-.Lsha256_block_data_order:
-#if __ARM_ARCH__<7
- sub r3,pc,#8 @ sha256_block_data_order
-#else
- adr r3,.Lsha256_block_data_order
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
- ldr r12,.LOPENSSL_armcap
- ldr r12,[r3,r12] @ OPENSSL_armcap_P
- tst r12,#ARMV8_SHA256
- bne .LARMv8
- tst r12,#ARMV7_NEON
- bne .LNEON
-#endif
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
- stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
- ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
- sub $Ktbl,r3,#256+32 @ K256
- sub sp,sp,#16*4 @ alloca(X[16])
-.Loop:
-# if __ARM_ARCH__>=7
- ldr $t1,[$inp],#4
-# else
- ldrb $t1,[$inp,#3]
-# endif
- eor $t3,$B,$C @ magic
- eor $t2,$t2,$t2
-___
-for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
-$code.=".Lrounds_16_xx:\n";
-for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
-#if __ARM_ARCH__>=7
- ite eq @ Thumb2 thing, sanity check in ARM
-#endif
- ldreq $t3,[sp,#16*4] @ pull ctx
- bne .Lrounds_16_xx
-
- add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
- ldr $t0,[$t3,#0]
- ldr $t1,[$t3,#4]
- ldr $t2,[$t3,#8]
- add $A,$A,$t0
- ldr $t0,[$t3,#12]
- add $B,$B,$t1
- ldr $t1,[$t3,#16]
- add $C,$C,$t2
- ldr $t2,[$t3,#20]
- add $D,$D,$t0
- ldr $t0,[$t3,#24]
- add $E,$E,$t1
- ldr $t1,[$t3,#28]
- add $F,$F,$t2
- ldr $inp,[sp,#17*4] @ pull inp
- ldr $t2,[sp,#18*4] @ pull inp+len
- add $G,$G,$t0
- add $H,$H,$t1
- stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
- cmp $inp,$t2
- sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
- bne .Loop
-
- add sp,sp,#`16+3`*4 @ destroy frame
-#if __ARM_ARCH__>=5
- ldmia sp!,{r4-r11,pc}
-#else
- ldmia sp!,{r4-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size sha256_block_data_order,.-sha256_block_data_order
-___
-######################################################################
-# NEON stuff
-#
-{{{
-my @X=map("q$_",(0..3));
-my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
-my $Xfer=$t4;
-my $j=0;
-
-sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
-sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
- my $arg = pop;
- $arg = "#$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-sub Xupdate()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T2,$T0,$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T1,$T0,$sigma0[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T2,$T0,32-$sigma0[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T3,$T0,$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T1,$T1,$T2);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T3,$T0,32-$sigma0[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T1,$T1,$T3); # sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4); # sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4);
- eval(shift(@insns));
- eval(shift(@insns));
- &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &vld1_32 ("{$T0}","[$Ktbl,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
- &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
- eval(shift(@insns));
- eval(shift(@insns));
- &veor ($T5,$T5,$T4); # sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 ($T0,$T0,@X[0]);
- while($#insns>=2) { eval(shift(@insns)); }
- &vst1_32 ("{$T0}","[$Xfer,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub Xpreload()
-{ use integer;
- my $body = shift;
- my @insns = (&$body,&$body,&$body,&$body);
- my ($a,$b,$c,$d,$e,$f,$g,$h);
-
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vld1_32 ("{$T0}","[$Ktbl,:128]!");
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vrev32_8 (@X[0],@X[0]);
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- eval(shift(@insns));
- &vadd_i32 ($T0,$T0,@X[0]);
- foreach (@insns) { eval; } # remaining instructions
- &vst1_32 ("{$T0}","[$Xfer,:128]!");
-
- push(@X,shift(@X)); # "rotate" X[]
-}
-
-sub body_00_15 () {
- (
- '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
- '&add ($h,$h,$t1)', # h+=X[i]+K[i]
- '&eor ($t1,$f,$g)',
- '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
- '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
- '&and ($t1,$t1,$e)',
- '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
- '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
- '&eor ($t1,$t1,$g)', # Ch(e,f,g)
- '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
- '&eor ($t2,$a,$b)', # a^b, b^c in next round
- '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
- '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
- '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
- '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
- '&ldr ($t1,"[sp,#64]") if ($j==31)',
- '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
- '&add ($d,$d,$h)', # d+=h
- '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
- '&eor ($t3,$t3,$b)', # Maj(a,b,c)
- '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
- )
-}
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.arch armv7-a
-.fpu neon
-
-.global sha256_block_data_order_neon
-.type sha256_block_data_order_neon,%function
-.align 4
-sha256_block_data_order_neon:
-.LNEON:
- stmdb sp!,{r4-r12,lr}
-
- sub $H,sp,#16*4+16
- adr $Ktbl,.Lsha256_block_data_order
- sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
- bic $H,$H,#15 @ align for 128-bit stores
- mov $t2,sp
- mov sp,$H @ alloca
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
-
- vld1.8 {@X[0]},[$inp]!
- vld1.8 {@X[1]},[$inp]!
- vld1.8 {@X[2]},[$inp]!
- vld1.8 {@X[3]},[$inp]!
- vld1.32 {$T0},[$Ktbl,:128]!
- vld1.32 {$T1},[$Ktbl,:128]!
- vld1.32 {$T2},[$Ktbl,:128]!
- vld1.32 {$T3},[$Ktbl,:128]!
- vrev32.8 @X[0],@X[0] @ yes, even on
- str $ctx,[sp,#64]
- vrev32.8 @X[1],@X[1] @ big-endian
- str $inp,[sp,#68]
- mov $Xfer,sp
- vrev32.8 @X[2],@X[2]
- str $len,[sp,#72]
- vrev32.8 @X[3],@X[3]
- str $t2,[sp,#76] @ save original sp
- vadd.i32 $T0,$T0,@X[0]
- vadd.i32 $T1,$T1,@X[1]
- vst1.32 {$T0},[$Xfer,:128]!
- vadd.i32 $T2,$T2,@X[2]
- vst1.32 {$T1},[$Xfer,:128]!
- vadd.i32 $T3,$T3,@X[3]
- vst1.32 {$T2},[$Xfer,:128]!
- vst1.32 {$T3},[$Xfer,:128]!
-
- ldmia $ctx,{$A-$H}
- sub $Xfer,$Xfer,#64
- ldr $t1,[sp,#0]
- eor $t2,$t2,$t2
- eor $t3,$B,$C
- b .L_00_48
-
-.align 4
-.L_00_48:
-___
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
- &Xupdate(\&body_00_15);
-$code.=<<___;
- teq $t1,#0 @ check for K256 terminator
- ldr $t1,[sp,#0]
- sub $Xfer,$Xfer,#64
- bne .L_00_48
-
- ldr $inp,[sp,#68]
- ldr $t0,[sp,#72]
- sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
- teq $inp,$t0
- it eq
- subeq $inp,$inp,#64 @ avoid SEGV
- vld1.8 {@X[0]},[$inp]! @ load next input block
- vld1.8 {@X[1]},[$inp]!
- vld1.8 {@X[2]},[$inp]!
- vld1.8 {@X[3]},[$inp]!
- it ne
- strne $inp,[sp,#68]
- mov $Xfer,sp
-___
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
- &Xpreload(\&body_00_15);
-$code.=<<___;
- ldr $t0,[$t1,#0]
- add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
- ldr $t2,[$t1,#4]
- ldr $t3,[$t1,#8]
- ldr $t4,[$t1,#12]
- add $A,$A,$t0 @ accumulate
- ldr $t0,[$t1,#16]
- add $B,$B,$t2
- ldr $t2,[$t1,#20]
- add $C,$C,$t3
- ldr $t3,[$t1,#24]
- add $D,$D,$t4
- ldr $t4,[$t1,#28]
- add $E,$E,$t0
- str $A,[$t1],#4
- add $F,$F,$t2
- str $B,[$t1],#4
- add $G,$G,$t3
- str $C,[$t1],#4
- add $H,$H,$t4
- str $D,[$t1],#4
- stmia $t1,{$E-$H}
-
- ittte ne
- movne $Xfer,sp
- ldrne $t1,[sp,#0]
- eorne $t2,$t2,$t2
- ldreq sp,[sp,#76] @ restore original sp
- itt ne
- eorne $t3,$B,$C
- bne .L_00_48
-
- ldmia sp!,{r4-r12,pc}
-.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
-#endif
-___
-}}}
-######################################################################
-# ARMv8 stuff
-#
-{{{
-my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
-my @MSG=map("q$_",(8..11));
-my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
-my $Ktbl="r3";
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-
-# ifdef __thumb2__
-# define INST(a,b,c,d) .byte c,d|0xc,a,b
-# else
-# define INST(a,b,c,d) .byte a,b,c,d
-# endif
-
-.type sha256_block_data_order_armv8,%function
-.align 5
-sha256_block_data_order_armv8:
-.LARMv8:
- vld1.32 {$ABCD,$EFGH},[$ctx]
-# ifdef __thumb2__
- adr $Ktbl,.LARMv8
- sub $Ktbl,$Ktbl,#.LARMv8-K256
-# else
- adrl $Ktbl,K256
-# endif
- add $len,$inp,$len,lsl#6 @ len to point at the end of inp
-
-.Loop_v8:
- vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
- vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
- vld1.32 {$W0},[$Ktbl]!
- vrev32.8 @MSG[0],@MSG[0]
- vrev32.8 @MSG[1],@MSG[1]
- vrev32.8 @MSG[2],@MSG[2]
- vrev32.8 @MSG[3],@MSG[3]
- vmov $ABCD_SAVE,$ABCD @ offload
- vmov $EFGH_SAVE,$EFGH
- teq $inp,$len
-___
-for($i=0;$i<12;$i++) {
-$code.=<<___;
- vld1.32 {$W1},[$Ktbl]!
- vadd.i32 $W0,$W0,@MSG[0]
- sha256su0 @MSG[0],@MSG[1]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
- sha256su1 @MSG[0],@MSG[2],@MSG[3]
-___
- ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
-}
-$code.=<<___;
- vld1.32 {$W1},[$Ktbl]!
- vadd.i32 $W0,$W0,@MSG[0]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- vld1.32 {$W0},[$Ktbl]!
- vadd.i32 $W1,$W1,@MSG[1]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- vld1.32 {$W1},[$Ktbl]
- vadd.i32 $W0,$W0,@MSG[2]
- sub $Ktbl,$Ktbl,#256-16 @ rewind
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W0
- sha256h2 $EFGH,$abcd,$W0
-
- vadd.i32 $W1,$W1,@MSG[3]
- vmov $abcd,$ABCD
- sha256h $ABCD,$EFGH,$W1
- sha256h2 $EFGH,$abcd,$W1
-
- vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
- vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
- it ne
- bne .Loop_v8
-
- vst1.32 {$ABCD,$EFGH},[$ctx]
-
- ret @ bx lr
-.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
-#endif
-___
-}}}
-$code.=<<___;
-.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-open SELF,$0;
-while(<SELF>) {
- next if (/^#!/);
- last if (!s/^#/@/ and !/^$/);
- print;
-}
-close SELF;
-
-{ my %opcode = (
- "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
- "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
-
- sub unsha256 {
- my ($mnemonic,$arg)=@_;
-
- if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
- my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
- |(($2&7)<<17)|(($2&8)<<4)
- |(($3&7)<<1) |(($3&8)<<2);
- # since ARMv7 instructions are always encoded little-endian.
- # correct solution is to use .inst directive, but older
- # assemblers don't implement it:-(
- sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
- $word&0xff,($word>>8)&0xff,
- ($word>>16)&0xff,($word>>24)&0xff,
- $mnemonic,$arg;
- }
- }
-}
-
-foreach (split($/,$code)) {
-
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
-
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-
-close STDOUT; # enforce flush
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
- * using optimized ARM assembler and NEON instructions.
- *
- * Copyright © 2015 Google Inc.
- *
- * This file is based on sha256_ssse3_glue.c:
- * Copyright (C) 2013 Intel Corporation
- * Author: Tim Chen <tim.c.chen@linux.intel.com>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha256_glue.h"
-
-asmlinkage void sha256_block_data_order(struct crypto_sha256_state *state,
- const u8 *data, int num_blks);
-
-static int crypto_sha256_arm_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- /* make sure casting to sha256_block_fn() is safe */
- BUILD_BUG_ON(offsetof(struct crypto_sha256_state, state) != 0);
-
- return sha256_base_do_update_blocks(desc, data, len,
- sha256_block_data_order);
-}
-
-static int crypto_sha256_arm_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- sha256_base_do_finup(desc, data, len, sha256_block_data_order);
- return sha256_base_finish(desc, out);
-}
-
-static struct shash_alg algs[] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = crypto_sha256_arm_update,
- .finup = crypto_sha256_arm_finup,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-asm",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = crypto_sha256_arm_update,
- .finup = crypto_sha256_arm_finup,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-asm",
- .cra_priority = 150,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
-
-static int __init sha256_mod_init(void)
-{
- int res = crypto_register_shashes(algs, ARRAY_SIZE(algs));
-
- if (res < 0)
- return res;
-
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
- res = crypto_register_shashes(sha256_neon_algs,
- ARRAY_SIZE(sha256_neon_algs));
-
- if (res < 0)
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
- }
-
- return res;
-}
-
-static void __exit sha256_mod_fini(void)
-{
- crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-
- if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
- crypto_unregister_shashes(sha256_neon_algs,
- ARRAY_SIZE(sha256_neon_algs));
-}
-
-module_init(sha256_mod_init);
-module_exit(sha256_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON");
-
-MODULE_ALIAS_CRYPTO("sha256");
+++ /dev/null
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _CRYPTO_SHA256_GLUE_H
-#define _CRYPTO_SHA256_GLUE_H
-
-#include <crypto/hash.h>
-
-extern struct shash_alg sha256_neon_algs[2];
-
-#endif /* _CRYPTO_SHA256_GLUE_H */
+++ /dev/null
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Glue code for the SHA256 Secure Hash Algorithm assembly implementation
- * using NEON instructions.
- *
- * Copyright © 2015 Google Inc.
- *
- * This file is based on sha512_neon_glue.c:
- * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- */
-
-#include <asm/neon.h>
-#include <crypto/internal/hash.h>
-#include <crypto/sha2.h>
-#include <crypto/sha256_base.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include "sha256_glue.h"
-
-asmlinkage void sha256_block_data_order_neon(
- struct crypto_sha256_state *digest, const u8 *data, int num_blks);
-
-static int crypto_sha256_neon_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
-{
- int remain;
-
- kernel_neon_begin();
- remain = sha256_base_do_update_blocks(desc, data, len,
- sha256_block_data_order_neon);
- kernel_neon_end();
- return remain;
-}
-
-static int crypto_sha256_neon_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out)
-{
- kernel_neon_begin();
- sha256_base_do_finup(desc, data, len, sha256_block_data_order_neon);
- kernel_neon_end();
- return sha256_base_finish(desc, out);
-}
-
-struct shash_alg sha256_neon_algs[] = { {
- .digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_base_init,
- .update = crypto_sha256_neon_update,
- .finup = crypto_sha256_neon_finup,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha256",
- .cra_driver_name = "sha256-neon",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA256_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-}, {
- .digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_base_init,
- .update = crypto_sha256_neon_update,
- .finup = crypto_sha256_neon_finup,
- .descsize = sizeof(struct crypto_sha256_state),
- .base = {
- .cra_name = "sha224",
- .cra_driver_name = "sha224-neon",
- .cra_priority = 250,
- .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY |
- CRYPTO_AHASH_ALG_FINUP_MAX,
- .cra_blocksize = SHA224_BLOCK_SIZE,
- .cra_module = THIS_MODULE,
- }
-} };
# SPDX-License-Identifier: GPL-2.0-only
poly1305-core.S
+sha256-core.S
tristate
default CRYPTO_LIB_POLY1305
select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
+config CRYPTO_SHA256_ARM
+ tristate
+ depends on !CPU_V7M
+ default CRYPTO_LIB_SHA256
+ select CRYPTO_ARCH_HAVE_LIB_SHA256
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
poly1305-arm-y := poly1305-core.o poly1305-glue.o
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
+sha256-arm-y := sha256.o sha256-core.o
+sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
+
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
$(obj)/%-core.S: $(src)/%-armv4.pl
$(call cmd,perl)
-clean-files += poly1305-core.S
+clean-files += poly1305-core.S sha256-core.S
aflags-thumb2-$(CONFIG_THUMB2_KERNEL) := -U__thumb2__ -D__thumb2__=1
poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
+
+AFLAGS_sha256-core.o += $(aflags-thumb2-y)
--- /dev/null
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0"; $t0="r0";
+$inp="r1"; $t4="r1";
+$len="r2"; $t1="r2";
+$T1="r3"; $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+ @ ldr $t1,[$inp],#4 @ $i
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+# ifndef __ARMEB__
+ rev $t1,$t1
+# endif
+#else
+ @ ldrb $t1,[$inp,#3] @ $i
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ ldrb $t2,[$inp,#2]
+ ldrb $t0,[$inp,#1]
+ orr $t1,$t1,$t2,lsl#8
+ ldrb $t2,[$inp],#4
+ orr $t1,$t1,$t0,lsl#16
+# if $i==15
+ str $inp,[sp,#17*4] @ make room for $t4
+# endif
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+ orr $t1,$t1,$t2,lsl#24
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+ ldr $t2,[$Ktbl],#4 @ *K256++
+ add $h,$h,$t1 @ h+=X[i]
+ str $t1,[sp,#`$i%16`*4]
+ eor $t1,$f,$g
+ add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
+ and $t1,$t1,$e
+ add $h,$h,$t2 @ h+=K256[i]
+ eor $t1,$t1,$g @ Ch(e,f,g)
+ eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+ add $h,$h,$t1 @ h+=Ch(e,f,g)
+#if $i==31
+ and $t2,$t2,#0xff
+ cmp $t2,#0xf2 @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4 @ prefetch
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t2,$a,$b @ a^b, b^c in next round
+#else
+ ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
+ eor $t2,$a,$b @ a^b, b^c in next round
+ ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
+#endif
+ eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
+ and $t3,$t3,$t2 @ (b^c)&=(a^b)
+ add $d,$d,$h @ d+=h
+ eor $t3,$t3,$b @ Maj(a,b,c)
+ add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
+ @ add $h,$h,$t3 @ h+=Maj(a,b,c)
+___
+ ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+ @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
+ @ ldr $t4,[sp,#`($i+14)%16`*4]
+ mov $t0,$t1,ror#$sigma0[0]
+ add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
+ mov $t2,$t4,ror#$sigma1[0]
+ eor $t0,$t0,$t1,ror#$sigma0[1]
+ eor $t2,$t2,$t4,ror#$sigma1[1]
+ eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
+ ldr $t1,[sp,#`($i+0)%16`*4]
+ eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
+ ldr $t4,[sp,#`($i+9)%16`*4]
+
+ add $t2,$t2,$t0
+ eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
+ add $t1,$t1,$t2
+ eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
+ add $t1,$t1,$t4 @ X[i]
+___
+ &BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code 32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code 32
+# endif
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-sha256_block_data_order
+#endif
+.align 5
+
+.global sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,.Lsha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
+ ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+ sub $Ktbl,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr $t1,[$inp],#4
+# else
+ ldrb $t1,[$inp,#3]
+# endif
+ eor $t3,$B,$C @ magic
+ eor $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq $t3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t0,[$t3,#0]
+ ldr $t1,[$t3,#4]
+ ldr $t2,[$t3,#8]
+ add $A,$A,$t0
+ ldr $t0,[$t3,#12]
+ add $B,$B,$t1
+ ldr $t1,[$t3,#16]
+ add $C,$C,$t2
+ ldr $t2,[$t3,#20]
+ add $D,$D,$t0
+ ldr $t0,[$t3,#24]
+ add $E,$E,$t1
+ ldr $t1,[$t3,#28]
+ add $F,$F,$t2
+ ldr $inp,[sp,#17*4] @ pull inp
+ ldr $t2,[sp,#18*4] @ pull inp+len
+ add $G,$G,$t0
+ add $H,$H,$t1
+ stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+ cmp $inp,$t2
+ sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4-r11,pc}
+#else
+ ldmia sp!,{r4-r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
+
+sub AUTOLOAD() # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+ my $arg = pop;
+ $arg = "#$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T2,$T0,$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T1,$T0,$sigma0[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T2,$T0,32-$sigma0[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T3,$T0,$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T2);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T3,$T0,32-$sigma0[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T1,$T1,$T3); # sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &veor ($T5,$T5,$T4); # sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ while($#insns>=2) { eval(shift(@insns)); }
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+ my $body = shift;
+ my @insns = (&$body,&$body,&$body,&$body);
+ my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vld1_32 ("{$T0}","[$Ktbl,:128]!");
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vrev32_8 (@X[0],@X[0]);
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ eval(shift(@insns));
+ &vadd_i32 ($T0,$T0,@X[0]);
+ foreach (@insns) { eval; } # remaining instructions
+ &vst1_32 ("{$T0}","[$Xfer,:128]!");
+
+ push(@X,shift(@X)); # "rotate" X[]
+}
+
+sub body_00_15 () {
+ (
+ '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+ '&add ($h,$h,$t1)', # h+=X[i]+K[i]
+ '&eor ($t1,$f,$g)',
+ '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+ '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
+ '&and ($t1,$t1,$e)',
+ '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
+ '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+ '&eor ($t1,$t1,$g)', # Ch(e,f,g)
+ '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
+ '&eor ($t2,$a,$b)', # a^b, b^c in next round
+ '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
+ '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
+ '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
+ '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
+ '&ldr ($t1,"[sp,#64]") if ($j==31)',
+ '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
+ '&add ($d,$d,$h)', # d+=h
+ '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+ '&eor ($t3,$t3,$b)', # Maj(a,b,c)
+ '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+ )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.global sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4-r12,lr}
+
+ sub $H,sp,#16*4+16
+ adr $Ktbl,.Lsha256_block_data_order
+ sub $Ktbl,$Ktbl,#.Lsha256_block_data_order-K256
+ bic $H,$H,#15 @ align for 128-bit stores
+ mov $t2,sp
+ mov sp,$H @ alloca
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {@X[0]},[$inp]!
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ vld1.32 {$T0},[$Ktbl,:128]!
+ vld1.32 {$T1},[$Ktbl,:128]!
+ vld1.32 {$T2},[$Ktbl,:128]!
+ vld1.32 {$T3},[$Ktbl,:128]!
+ vrev32.8 @X[0],@X[0] @ yes, even on
+ str $ctx,[sp,#64]
+ vrev32.8 @X[1],@X[1] @ big-endian
+ str $inp,[sp,#68]
+ mov $Xfer,sp
+ vrev32.8 @X[2],@X[2]
+ str $len,[sp,#72]
+ vrev32.8 @X[3],@X[3]
+ str $t2,[sp,#76] @ save original sp
+ vadd.i32 $T0,$T0,@X[0]
+ vadd.i32 $T1,$T1,@X[1]
+ vst1.32 {$T0},[$Xfer,:128]!
+ vadd.i32 $T2,$T2,@X[2]
+ vst1.32 {$T1},[$Xfer,:128]!
+ vadd.i32 $T3,$T3,@X[3]
+ vst1.32 {$T2},[$Xfer,:128]!
+ vst1.32 {$T3},[$Xfer,:128]!
+
+ ldmia $ctx,{$A-$H}
+ sub $Xfer,$Xfer,#64
+ ldr $t1,[sp,#0]
+ eor $t2,$t2,$t2
+ eor $t3,$B,$C
+ b .L_00_48
+
+.align 4
+.L_00_48:
+___
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+ &Xupdate(\&body_00_15);
+$code.=<<___;
+ teq $t1,#0 @ check for K256 terminator
+ ldr $t1,[sp,#0]
+ sub $Xfer,$Xfer,#64
+ bne .L_00_48
+
+ ldr $inp,[sp,#68]
+ ldr $t0,[sp,#72]
+ sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
+ teq $inp,$t0
+ it eq
+ subeq $inp,$inp,#64 @ avoid SEGV
+ vld1.8 {@X[0]},[$inp]! @ load next input block
+ vld1.8 {@X[1]},[$inp]!
+ vld1.8 {@X[2]},[$inp]!
+ vld1.8 {@X[3]},[$inp]!
+ it ne
+ strne $inp,[sp,#68]
+ mov $Xfer,sp
+___
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+ &Xpreload(\&body_00_15);
+$code.=<<___;
+ ldr $t0,[$t1,#0]
+ add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
+ ldr $t2,[$t1,#4]
+ ldr $t3,[$t1,#8]
+ ldr $t4,[$t1,#12]
+ add $A,$A,$t0 @ accumulate
+ ldr $t0,[$t1,#16]
+ add $B,$B,$t2
+ ldr $t2,[$t1,#20]
+ add $C,$C,$t3
+ ldr $t3,[$t1,#24]
+ add $D,$D,$t4
+ ldr $t4,[$t1,#28]
+ add $E,$E,$t0
+ str $A,[$t1],#4
+ add $F,$F,$t2
+ str $B,[$t1],#4
+ add $G,$G,$t3
+ str $C,[$t1],#4
+ add $H,$H,$t4
+ str $D,[$t1],#4
+ stmia $t1,{$E-$H}
+
+ ittte ne
+ movne $Xfer,sp
+ ldrne $t1,[sp,#0]
+ eorne $t2,$t2,$t2
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne $t3,$B,$C
+ bne .L_00_48
+
+ ldmia sp!,{r4-r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+ adr $Ktbl,.LARMv8
+ sub $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+ adrl $Ktbl,K256
+# endif
+ add $len,$inp,$len,lsl#6 @ len to point at the end of inp
+
+.Loop_v8:
+ vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
+ vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
+ vld1.32 {$W0},[$Ktbl]!
+ vrev32.8 @MSG[0],@MSG[0]
+ vrev32.8 @MSG[1],@MSG[1]
+ vrev32.8 @MSG[2],@MSG[2]
+ vrev32.8 @MSG[3],@MSG[3]
+ vmov $ABCD_SAVE,$ABCD @ offload
+ vmov $EFGH_SAVE,$EFGH
+ teq $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ sha256su0 @MSG[0],@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+ sha256su1 @MSG[0],@MSG[2],@MSG[3]
+___
+ ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+ vld1.32 {$W1},[$Ktbl]!
+ vadd.i32 $W0,$W0,@MSG[0]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vld1.32 {$W0},[$Ktbl]!
+ vadd.i32 $W1,$W1,@MSG[1]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vld1.32 {$W1},[$Ktbl]
+ vadd.i32 $W0,$W0,@MSG[2]
+ sub $Ktbl,$Ktbl,#256-16 @ rewind
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W0
+ sha256h2 $EFGH,$abcd,$W0
+
+ vadd.i32 $W1,$W1,@MSG[3]
+ vmov $abcd,$ABCD
+ sha256h $ABCD,$EFGH,$W1
+ sha256h2 $EFGH,$abcd,$W1
+
+ vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
+ vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {$ABCD,$EFGH},[$ctx]
+
+ ret @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+}
+close SELF;
+
+{ my %opcode = (
+ "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
+ "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
+
+ sub unsha256 {
+ my ($mnemonic,$arg)=@_;
+
+ if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<17)|(($2&8)<<4)
+ |(($3&7)<<1) |(($3&8)<<2);
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+ }
+ }
+}
+
+foreach (split($/,$code)) {
+
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+ s/\bret\b/bx lr/go or
+ s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
+
+ print $_,"\n";
+}
+
+close STDOUT; # enforce flush
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .arch armv8-a
+ .fpu crypto-neon-fp-armv8
+
+ k0 .req q7
+ k1 .req q8
+ rk .req r3
+
+ ta0 .req q9
+ ta1 .req q10
+ tb0 .req q10
+ tb1 .req q9
+
+ dga .req q11
+ dgb .req q12
+
+ dg0 .req q13
+ dg1 .req q14
+ dg2 .req q15
+
+ .macro add_only, ev, s0
+ vmov dg2, dg0
+ .ifnb \s0
+ vld1.32 {k\ev}, [rk, :128]!
+ .endif
+ sha256h.32 dg0, dg1, tb\ev
+ sha256h2.32 dg1, dg2, tb\ev
+ .ifnb \s0
+ vadd.u32 ta\ev, q\s0, k\ev
+ .endif
+ .endm
+
+ .macro add_update, ev, s0, s1, s2, s3
+ sha256su0.32 q\s0, q\s1
+ add_only \ev, \s1
+ sha256su1.32 q\s0, q\s2, q\s3
+ .endm
+
+ .align 6
+.Lsha256_rcon:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+ .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+ .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+ .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+ .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+ .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+ .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+ .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+ /*
+ * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+ * const u8 *data, size_t nblocks);
+ */
+ENTRY(sha256_ce_transform)
+ /* load state */
+ vld1.32 {dga-dgb}, [r0]
+
+ /* load input */
+0: vld1.32 {q0-q1}, [r1]!
+ vld1.32 {q2-q3}, [r1]!
+ subs r2, r2, #1
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ vrev32.8 q0, q0
+ vrev32.8 q1, q1
+ vrev32.8 q2, q2
+ vrev32.8 q3, q3
+#endif
+
+ /* load first round constant */
+ adr rk, .Lsha256_rcon
+ vld1.32 {k0}, [rk, :128]!
+
+ vadd.u32 ta0, q0, k0
+ vmov dg0, dga
+ vmov dg1, dgb
+
+ add_update 1, 0, 1, 2, 3
+ add_update 0, 1, 2, 3, 0
+ add_update 1, 2, 3, 0, 1
+ add_update 0, 3, 0, 1, 2
+ add_update 1, 0, 1, 2, 3
+ add_update 0, 1, 2, 3, 0
+ add_update 1, 2, 3, 0, 1
+ add_update 0, 3, 0, 1, 2
+ add_update 1, 0, 1, 2, 3
+ add_update 0, 1, 2, 3, 0
+ add_update 1, 2, 3, 0, 1
+ add_update 0, 3, 0, 1, 2
+
+ add_only 1, 1
+ add_only 0, 2
+ add_only 1, 3
+ add_only 0
+
+ /* update state */
+ vadd.u32 dga, dga, dg0
+ vadd.u32 dgb, dgb, dg1
+ bne 0b
+
+ /* store new state */
+ vst1.32 {dga-dgb}, [r0]
+ bx lr
+ENDPROC(sha256_ce_transform)
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <crypto/internal/sha2.h>
+#include <crypto/internal/simd.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+asmlinkage void sha256_block_data_order(u32 state[SHA256_STATE_WORDS],
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
+ const u8 *data, size_t nblocks);
+asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+ const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+ const u8 *data, size_t nblocks)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+ static_branch_likely(&have_neon) && crypto_simd_usable()) {
+ kernel_neon_begin();
+ if (static_branch_likely(&have_ce))
+ sha256_ce_transform(state, data, nblocks);
+ else
+ sha256_block_data_order_neon(state, data, nblocks);
+ kernel_neon_end();
+ } else {
+ sha256_block_data_order(state, data, nblocks);
+ }
+}
+EXPORT_SYMBOL(sha256_blocks_arch);
+
+bool sha256_is_arch_optimized(void)
+{
+ /* We always can use at least the ARM scalar implementation. */
+ return true;
+}
+EXPORT_SYMBOL(sha256_is_arch_optimized);
+
+static int __init sha256_arm_mod_init(void)
+{
+ if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+ static_branch_enable(&have_neon);
+ if (elf_hwcap2 & HWCAP2_SHA2)
+ static_branch_enable(&have_ce);
+ }
+ return 0;
+}
+arch_initcall(sha256_arm_mod_init);
+
+static void __exit sha256_arm_mod_exit(void)
+{
+}
+module_exit(sha256_arm_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 optimized for ARM");