lib/crypto: arm: Move arch/arm/lib/crypto/ into lib/crypto/
author Eric Biggers <ebiggers@kernel.org>
Thu, 19 Jun 2025 19:19:00 +0000 (12:19 -0700)
committer Eric Biggers <ebiggers@kernel.org>
Mon, 30 Jun 2025 16:26:20 +0000 (09:26 -0700)
Move the contents of arch/arm/lib/crypto/ into lib/crypto/arm/.

The new code organization makes a lot more sense for how this code
actually works and is developed.  In particular, it makes it possible to
build each algorithm as a single module, with better inlining and dead
code elimination.  For a more detailed explanation, see the patchset
which did this for the CRC library code:
https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/.
Also see the patchset which did this for SHA-512:
https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/

This is just a preparatory commit, which does the move to get the files
into their new location but keeps them building the same way as before.
Later commits will make the actual improvements to the way the
arch-optimized code is integrated for each algorithm.

Add a gitignore entry for the removed directory arch/arm/lib/crypto/ so
that people don't accidentally commit leftover generated files.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Link: https://lore.kernel.org/r/20250619191908.134235-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
30 files changed:
arch/arm/lib/.gitignore [new file with mode: 0644]
arch/arm/lib/Makefile
arch/arm/lib/crypto/.gitignore [deleted file]
arch/arm/lib/crypto/Kconfig [deleted file]
arch/arm/lib/crypto/Makefile [deleted file]
arch/arm/lib/crypto/blake2s-core.S [deleted file]
arch/arm/lib/crypto/blake2s-glue.c [deleted file]
arch/arm/lib/crypto/chacha-glue.c [deleted file]
arch/arm/lib/crypto/chacha-neon-core.S [deleted file]
arch/arm/lib/crypto/chacha-scalar-core.S [deleted file]
arch/arm/lib/crypto/poly1305-armv4.pl [deleted file]
arch/arm/lib/crypto/poly1305-glue.c [deleted file]
arch/arm/lib/crypto/sha256-armv4.pl [deleted file]
arch/arm/lib/crypto/sha256-ce.S [deleted file]
arch/arm/lib/crypto/sha256.c [deleted file]
lib/crypto/Kconfig
lib/crypto/Makefile
lib/crypto/arm/.gitignore
lib/crypto/arm/Kconfig [new file with mode: 0644]
lib/crypto/arm/Makefile [new file with mode: 0644]
lib/crypto/arm/blake2s-core.S [new file with mode: 0644]
lib/crypto/arm/blake2s-glue.c [new file with mode: 0644]
lib/crypto/arm/chacha-glue.c [new file with mode: 0644]
lib/crypto/arm/chacha-neon-core.S [new file with mode: 0644]
lib/crypto/arm/chacha-scalar-core.S [new file with mode: 0644]
lib/crypto/arm/poly1305-armv4.pl [new file with mode: 0644]
lib/crypto/arm/poly1305-glue.c [new file with mode: 0644]
lib/crypto/arm/sha256-armv4.pl [new file with mode: 0644]
lib/crypto/arm/sha256-ce.S [new file with mode: 0644]
lib/crypto/arm/sha256.c [new file with mode: 0644]

diff --git a/arch/arm/lib/.gitignore b/arch/arm/lib/.gitignore
new file mode 100644 (file)
index 0000000..647d7a9
--- /dev/null
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 91ea0e29107afc13edbc5ae5a7bdc9834c3fba42..d05dd672bcd9c241676e25aca3293cb410d48c00 100644 (file)
@@ -5,8 +5,6 @@
 # Copyright (C) 1995-2000 Russell King
 #
 
-obj-y += crypto/
-
 lib-y          := changebit.o csumipv6.o csumpartial.o               \
                   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
                   delay.o delay-loop.o findbit.o memchr.o memcpy.o   \
diff --git a/arch/arm/lib/crypto/.gitignore b/arch/arm/lib/crypto/.gitignore
deleted file mode 100644 (file)
index 12d74d8..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-core.S
-sha256-core.S
diff --git a/arch/arm/lib/crypto/Kconfig b/arch/arm/lib/crypto/Kconfig
deleted file mode 100644 (file)
index d1ad664..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_BLAKE2S_ARM
-       bool "Hash functions: BLAKE2s"
-       select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-       help
-         BLAKE2s cryptographic hash function (RFC 7693)
-
-         Architecture: arm
-
-         This is faster than the generic implementations of BLAKE2s and
-         BLAKE2b, but slower than the NEON implementation of BLAKE2b.
-         There is no NEON implementation of BLAKE2s, since NEON doesn't
-         really help with it.
-
-config CRYPTO_CHACHA20_NEON
-       tristate
-       default CRYPTO_LIB_CHACHA
-       select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_ARM
-       tristate
-       default CRYPTO_LIB_POLY1305
-       select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_ARM
-       tristate
-       depends on !CPU_V7M
-       default CRYPTO_LIB_SHA256
-       select CRYPTO_ARCH_HAVE_LIB_SHA256
-       select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/arch/arm/lib/crypto/Makefile b/arch/arm/lib/crypto/Makefile
deleted file mode 100644 (file)
index 431f77c..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
-libblake2s-arm-y := blake2s-core.o blake2s-glue.o
-
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-scalar-core.o chacha-glue.o
-chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
-poly1305-arm-y := poly1305-core.o poly1305-glue.o
-
-obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
-sha256-arm-y := sha256.o sha256-core.o
-sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
-
-quiet_cmd_perl = PERL    $@
-      cmd_perl = $(PERL) $(<) > $(@)
-
-$(obj)/%-core.S: $(src)/%-armv4.pl
-       $(call cmd,perl)
-
-clean-files += poly1305-core.S sha256-core.S
-
-aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
-
-# massage the perlasm code a bit so we only get the NEON routine if we need it
-poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
-poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
-AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
-
-AFLAGS_sha256-core.o += $(aflags-thumb2-y)
diff --git a/arch/arm/lib/crypto/blake2s-core.S b/arch/arm/lib/crypto/blake2s-core.S
deleted file mode 100644 (file)
index df40e46..0000000
+++ /dev/null
@@ -1,306 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * BLAKE2s digest algorithm, ARM scalar implementation
- *
- * Copyright 2020 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-       // Registers used to hold message words temporarily.  There aren't
-       // enough ARM registers to hold the whole message block, so we have to
-       // load the words on-demand.
-       M_0             .req    r12
-       M_1             .req    r14
-
-// The BLAKE2s initialization vector
-.Lblake2s_IV:
-       .word   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
-       .word   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-
-.macro __ldrd          a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
-       ldrd            \a, \b, [\src, #\offset]
-#else
-       ldr             \a, [\src, #\offset]
-       ldr             \b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd          a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
-       strd            \a, \b, [\dst, #\offset]
-#else
-       str             \a, [\dst, #\offset]
-       str             \b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _le32_bswap     a, tmp
-#ifdef __ARMEB__
-       rev_l           \a, \tmp
-#endif
-.endm
-
-.macro _le32_bswap_8x  a, b, c, d, e, f, g, h,  tmp
-       _le32_bswap     \a, \tmp
-       _le32_bswap     \b, \tmp
-       _le32_bswap     \c, \tmp
-       _le32_bswap     \d, \tmp
-       _le32_bswap     \e, \tmp
-       _le32_bswap     \f, \tmp
-       _le32_bswap     \g, \tmp
-       _le32_bswap     \h, \tmp
-.endm
-
-// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
-// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
-// columns/diagonals.  s0-s1 are the word offsets to the message words the first
-// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
-// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
-//
-// Note that to save instructions, the rotations don't happen when the
-// pseudocode says they should, but rather they are delayed until the values are
-// used.  See the comment above _blake2s_round().
-.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3
-
-       ldr             M_0, [sp, #32 + 4 * \s0]
-       ldr             M_1, [sp, #32 + 4 * \s2]
-
-       // a += b + m[blake2s_sigma[r][2*i + 0]];
-       add             \a0, \a0, \b0, ror #brot
-       add             \a1, \a1, \b1, ror #brot
-       add             \a0, \a0, M_0
-       add             \a1, \a1, M_1
-
-       // d = ror32(d ^ a, 16);
-       eor             \d0, \a0, \d0, ror #drot
-       eor             \d1, \a1, \d1, ror #drot
-
-       // c += d;
-       add             \c0, \c0, \d0, ror #16
-       add             \c1, \c1, \d1, ror #16
-
-       // b = ror32(b ^ c, 12);
-       eor             \b0, \c0, \b0, ror #brot
-       eor             \b1, \c1, \b1, ror #brot
-
-       ldr             M_0, [sp, #32 + 4 * \s1]
-       ldr             M_1, [sp, #32 + 4 * \s3]
-
-       // a += b + m[blake2s_sigma[r][2*i + 1]];
-       add             \a0, \a0, \b0, ror #12
-       add             \a1, \a1, \b1, ror #12
-       add             \a0, \a0, M_0
-       add             \a1, \a1, M_1
-
-       // d = ror32(d ^ a, 8);
-       eor             \d0, \a0, \d0, ror#16
-       eor             \d1, \a1, \d1, ror#16
-
-       // c += d;
-       add             \c0, \c0, \d0, ror#8
-       add             \c1, \c1, \d1, ror#8
-
-       // b = ror32(b ^ c, 7);
-       eor             \b0, \c0, \b0, ror#12
-       eor             \b1, \c1, \b1, ror#12
-.endm
-
-// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
-// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
-// spilling v[8..9], then to v[9..15], then to the message block.  r10-r12 and
-// r14 are free to use.  The macro arguments s0-s15 give the order in which the
-// message words are used in this round.
-//
-// All rotates are performed using the implicit rotate operand accepted by the
-// 'add' and 'eor' instructions.  This is faster than using explicit rotate
-// instructions.  To make this work, we allow the values in the second and last
-// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
-// wrong rotation amount.  The rotation amount is then fixed up just in time
-// when the values are used.  'brot' is the number of bits the values in row 'b'
-// need to be rotated right to arrive at the correct values, and 'drot'
-// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
-// that they end up as (7, 8) after every round.
-.macro _blake2s_round  s0, s1, s2, s3, s4, s5, s6, s7, \
-                       s8, s9, s10, s11, s12, s13, s14, s15
-
-       // Mix first two columns:
-       // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
-       __ldrd          r10, r11, sp, 16        // load v[12] and v[13]
-       _blake2s_quarterround   r0, r4, r8, r10,  r1, r5, r9, r11, \
-                               \s0, \s1, \s2, \s3
-       __strd          r8, r9, sp, 0
-       __strd          r10, r11, sp, 16
-
-       // Mix second two columns:
-       // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
-       __ldrd          r8, r9, sp, 8           // load v[10] and v[11]
-       __ldrd          r10, r11, sp, 24        // load v[14] and v[15]
-       _blake2s_quarterround   r2, r6, r8, r10,  r3, r7, r9, r11, \
-                               \s4, \s5, \s6, \s7
-       str             r10, [sp, #24]          // store v[14]
-       // v[10], v[11], and v[15] are used below, so no need to store them yet.
-
-       .set brot, 7
-       .set drot, 8
-
-       // Mix first two diagonals:
-       // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
-       ldr             r10, [sp, #16]          // load v[12]
-       _blake2s_quarterround   r0, r5, r8, r11,  r1, r6, r9, r10, \
-                               \s8, \s9, \s10, \s11
-       __strd          r8, r9, sp, 8
-       str             r11, [sp, #28]
-       str             r10, [sp, #16]
-
-       // Mix second two diagonals:
-       // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
-       __ldrd          r8, r9, sp, 0           // load v[8] and v[9]
-       __ldrd          r10, r11, sp, 20        // load v[13] and v[14]
-       _blake2s_quarterround   r2, r7, r8, r10,  r3, r4, r9, r11, \
-                               \s12, \s13, \s14, \s15
-       __strd          r10, r11, sp, 20
-.endm
-
-//
-// void blake2s_compress(struct blake2s_state *state,
-//                      const u8 *block, size_t nblocks, u32 inc);
-//
-// Only the first three fields of struct blake2s_state are used:
-//     u32 h[8];       (inout)
-//     u32 t[2];       (inout)
-//     u32 f[2];       (in)
-//
-       .align          5
-ENTRY(blake2s_compress)
-       push            {r0-r2,r4-r11,lr}       // keep this an even number
-
-.Lnext_block:
-       // r0 is 'state'
-       // r1 is 'block'
-       // r3 is 'inc'
-
-       // Load and increment the counter t[0..1].
-       __ldrd          r10, r11, r0, 32
-       adds            r10, r10, r3
-       adc             r11, r11, #0
-       __strd          r10, r11, r0, 32
-
-       // _blake2s_round is very short on registers, so copy the message block
-       // to the stack to save a register during the rounds.  This also has the
-       // advantage that misalignment only needs to be dealt with in one place.
-       sub             sp, sp, #64
-       mov             r12, sp
-       tst             r1, #3
-       bne             .Lcopy_block_misaligned
-       ldmia           r1!, {r2-r9}
-       _le32_bswap_8x  r2, r3, r4, r5, r6, r7, r8, r9,  r14
-       stmia           r12!, {r2-r9}
-       ldmia           r1!, {r2-r9}
-       _le32_bswap_8x  r2, r3, r4, r5, r6, r7, r8, r9,  r14
-       stmia           r12, {r2-r9}
-.Lcopy_block_done:
-       str             r1, [sp, #68]           // Update message pointer
-
-       // Calculate v[8..15].  Push v[9..15] onto the stack, and leave space
-       // for spilling v[8..9].  Leave v[8..9] in r8-r9.
-       mov             r14, r0                 // r14 = state
-       adr             r12, .Lblake2s_IV
-       ldmia           r12!, {r8-r9}           // load IV[0..1]
-       __ldrd          r0, r1, r14, 40         // load f[0..1]
-       ldm             r12, {r2-r7}            // load IV[3..7]
-       eor             r4, r4, r10             // v[12] = IV[4] ^ t[0]
-       eor             r5, r5, r11             // v[13] = IV[5] ^ t[1]
-       eor             r6, r6, r0              // v[14] = IV[6] ^ f[0]
-       eor             r7, r7, r1              // v[15] = IV[7] ^ f[1]
-       push            {r2-r7}                 // push v[9..15]
-       sub             sp, sp, #8              // leave space for v[8..9]
-
-       // Load h[0..7] == v[0..7].
-       ldm             r14, {r0-r7}
-
-       // Execute the rounds.  Each round is provided the order in which it
-       // needs to use the message words.
-       .set brot, 0
-       .set drot, 0
-       _blake2s_round  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-       _blake2s_round  14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
-       _blake2s_round  11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
-       _blake2s_round  7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
-       _blake2s_round  9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
-       _blake2s_round  2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
-       _blake2s_round  12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
-       _blake2s_round  13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
-       _blake2s_round  6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
-       _blake2s_round  10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
-
-       // Fold the final state matrix into the hash chaining value:
-       //
-       //      for (i = 0; i < 8; i++)
-       //              h[i] ^= v[i] ^ v[i + 8];
-       //
-       ldr             r14, [sp, #96]          // r14 = &h[0]
-       add             sp, sp, #8              // v[8..9] are already loaded.
-       pop             {r10-r11}               // load v[10..11]
-       eor             r0, r0, r8
-       eor             r1, r1, r9
-       eor             r2, r2, r10
-       eor             r3, r3, r11
-       ldm             r14, {r8-r11}           // load h[0..3]
-       eor             r0, r0, r8
-       eor             r1, r1, r9
-       eor             r2, r2, r10
-       eor             r3, r3, r11
-       stmia           r14!, {r0-r3}           // store new h[0..3]
-       ldm             r14, {r0-r3}            // load old h[4..7]
-       pop             {r8-r11}                // load v[12..15]
-       eor             r0, r0, r4, ror #brot
-       eor             r1, r1, r5, ror #brot
-       eor             r2, r2, r6, ror #brot
-       eor             r3, r3, r7, ror #brot
-       eor             r0, r0, r8, ror #drot
-       eor             r1, r1, r9, ror #drot
-       eor             r2, r2, r10, ror #drot
-       eor             r3, r3, r11, ror #drot
-         add           sp, sp, #64             // skip copy of message block
-       stm             r14, {r0-r3}            // store new h[4..7]
-
-       // Advance to the next block, if there is one.  Note that if there are
-       // multiple blocks, then 'inc' (the counter increment amount) must be
-       // 64.  So we can simply set it to 64 without re-loading it.
-       ldm             sp, {r0, r1, r2}        // load (state, block, nblocks)
-       mov             r3, #64                 // set 'inc'
-       subs            r2, r2, #1              // nblocks--
-       str             r2, [sp, #8]
-       bne             .Lnext_block            // nblocks != 0?
-
-       pop             {r0-r2,r4-r11,pc}
-
-       // The next message block (pointed to by r1) isn't 4-byte aligned, so it
-       // can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
-       // by r12) using an alternative method.  r2-r9 are free to use.
-.Lcopy_block_misaligned:
-       mov             r2, #64
-1:
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-       ldr             r3, [r1], #4
-       _le32_bswap     r3, r4
-#else
-       ldrb            r3, [r1, #0]
-       ldrb            r4, [r1, #1]
-       ldrb            r5, [r1, #2]
-       ldrb            r6, [r1, #3]
-       add             r1, r1, #4
-       orr             r3, r3, r4, lsl #8
-       orr             r3, r3, r5, lsl #16
-       orr             r3, r3, r6, lsl #24
-#endif
-       subs            r2, r2, #4
-       str             r3, [r12], #4
-       bne             1b
-       b               .Lcopy_block_done
-ENDPROC(blake2s_compress)
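
For orientation: the scalar assembly above computes the standard BLAKE2s mixing
function G from RFC 7693.  A minimal portable C reference of G follows (a sketch
for comparison only, not part of this diff; the function names are illustrative
and plain stdint types stand in for the kernel's u32/ror32 helpers):

#include <stdint.h>

/* Rotate a 32-bit word right by n bits (0 < n < 32). */
static inline uint32_t ror32(uint32_t v, unsigned int n)
{
	return (v >> n) | (v << (32 - n));
}

/* One BLAKE2s G step on state words v[a], v[b], v[c], v[d] with inputs x, y. */
static inline void blake2s_g(uint32_t v[16], int a, int b, int c, int d,
			     uint32_t x, uint32_t y)
{
	v[a] = v[a] + v[b] + x;
	v[d] = ror32(v[d] ^ v[a], 16);
	v[c] = v[c] + v[d];
	v[b] = ror32(v[b] ^ v[c], 12);
	v[a] = v[a] + v[b] + y;
	v[d] = ror32(v[d] ^ v[a], 8);
	v[c] = v[c] + v[d];
	v[b] = ror32(v[b] ^ v[c], 7);
}

The assembly performs the same sequence but folds each ror32() into the flexible
second operand of 'add'/'eor', which is why rows 'b' and 'd' carry a pending
rotation amount (brot/drot) from one quarter-round to the next.
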
diff --git a/arch/arm/lib/crypto/blake2s-glue.c b/arch/arm/lib/crypto/blake2s-glue.c
deleted file mode 100644 (file)
index 0238a70..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <crypto/internal/blake2s.h>
-#include <linux/module.h>
-
-/* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress);
diff --git a/arch/arm/lib/crypto/chacha-glue.c b/arch/arm/lib/crypto/chacha-glue.c
deleted file mode 100644 (file)
index 88ec964..0000000
+++ /dev/null
@@ -1,138 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ChaCha and HChaCha functions (ARM optimized)
- *
- * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/cputype.h>
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
-                                     u8 *dst, const u8 *src, int nrounds);
-asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
-                                      u8 *dst, const u8 *src,
-                                      int nrounds, unsigned int nbytes);
-asmlinkage void hchacha_block_arm(const struct chacha_state *state,
-                                 u32 out[HCHACHA_OUT_WORDS], int nrounds);
-asmlinkage void hchacha_block_neon(const struct chacha_state *state,
-                                  u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
-                            const struct chacha_state *state, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
-
-static inline bool neon_usable(void)
-{
-       return static_branch_likely(&use_neon) && crypto_simd_usable();
-}
-
-static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
-                         unsigned int bytes, int nrounds)
-{
-       u8 buf[CHACHA_BLOCK_SIZE];
-
-       while (bytes > CHACHA_BLOCK_SIZE) {
-               unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
-
-               chacha_4block_xor_neon(state, dst, src, nrounds, l);
-               bytes -= l;
-               src += l;
-               dst += l;
-               state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
-       }
-       if (bytes) {
-               const u8 *s = src;
-               u8 *d = dst;
-
-               if (bytes != CHACHA_BLOCK_SIZE)
-                       s = d = memcpy(buf, src, bytes);
-               chacha_block_xor_neon(state, d, s, nrounds);
-               if (d != dst)
-                       memcpy(dst, buf, bytes);
-               state->x[12]++;
-       }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
-                       u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
-       if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
-               hchacha_block_arm(state, out, nrounds);
-       } else {
-               kernel_neon_begin();
-               hchacha_block_neon(state, out, nrounds);
-               kernel_neon_end();
-       }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
-                      unsigned int bytes, int nrounds)
-{
-       if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
-           bytes <= CHACHA_BLOCK_SIZE) {
-               chacha_doarm(dst, src, bytes, state, nrounds);
-               state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
-               return;
-       }
-
-       do {
-               unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
-               kernel_neon_begin();
-               chacha_doneon(state, dst, src, todo, nrounds);
-               kernel_neon_end();
-
-               bytes -= todo;
-               src += todo;
-               dst += todo;
-       } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
-       /* We always can use at least the ARM scalar implementation. */
-       return true;
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_arm_mod_init(void)
-{
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
-               switch (read_cpuid_part()) {
-               case ARM_CPU_PART_CORTEX_A7:
-               case ARM_CPU_PART_CORTEX_A5:
-                       /*
-                        * The Cortex-A7 and Cortex-A5 do not perform well with
-                        * the NEON implementation but do incredibly with the
-                        * scalar one and use less power.
-                        */
-                       break;
-               default:
-                       static_branch_enable(&use_neon);
-               }
-       }
-       return 0;
-}
-subsys_initcall(chacha_arm_mod_init);
-
-static void __exit chacha_arm_mod_exit(void)
-{
-}
-module_exit(chacha_arm_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/lib/crypto/chacha-neon-core.S b/arch/arm/lib/crypto/chacha-neon-core.S
deleted file mode 100644 (file)
index ddd62b6..0000000
+++ /dev/null
@@ -1,643 +0,0 @@
-/*
- * ChaCha/HChaCha NEON helper functions
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
- /*
-  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
-  *
-  * (a)  vshl.u32 + vsri.u32           (needs temporary register)
-  * (b)  vshl.u32 + vshr.u32 + vorr    (needs temporary register)
-  * (c)  vrev32.16                     (16-bit rotations only)
-  * (d)  vtbl.8 + vtbl.8               (multiple of 8 bits rotations only,
-  *                                     needs index vector)
-  *
-  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
-  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
-  * cycles of (b) on both Cortex-A7 and Cortex-A53.
-  *
-  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
-  * and doesn't need a temporary register.
-  *
-  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
-  * is twice as fast as (a), even when doing (a) on multiple registers
-  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
-  * parallelizes better when temporary registers are scarce.
-  *
-  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
-  * (a), so the need to load the rotation table actually makes the vtbl method
-  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
-  * seems to be a good compromise to get a more significant speed boost on some
-  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
-  */
-
-#include <linux/linkage.h>
-#include <asm/cache.h>
-
-       .text
-       .fpu            neon
-       .align          5
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers q0-q3.  It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * The round count is given in r3.
- *
- * Clobbers: r3, ip, q4-q5
- */
-chacha_permute:
-
-       adr             ip, .Lrol8_table
-       vld1.8          {d10}, [ip, :64]
-
-.Ldoubleround:
-       // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       vadd.i32        q0, q0, q1
-       veor            q3, q3, q0
-       vrev32.16       q3, q3
-
-       // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       vadd.i32        q2, q2, q3
-       veor            q4, q1, q2
-       vshl.u32        q1, q4, #12
-       vsri.u32        q1, q4, #20
-
-       // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       vadd.i32        q0, q0, q1
-       veor            q3, q3, q0
-       vtbl.8          d6, {d6}, d10
-       vtbl.8          d7, {d7}, d10
-
-       // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       vadd.i32        q2, q2, q3
-       veor            q4, q1, q2
-       vshl.u32        q1, q4, #7
-       vsri.u32        q1, q4, #25
-
-       // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-       vext.8          q1, q1, q1, #4
-       // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       vext.8          q2, q2, q2, #8
-       // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-       vext.8          q3, q3, q3, #12
-
-       // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       vadd.i32        q0, q0, q1
-       veor            q3, q3, q0
-       vrev32.16       q3, q3
-
-       // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       vadd.i32        q2, q2, q3
-       veor            q4, q1, q2
-       vshl.u32        q1, q4, #12
-       vsri.u32        q1, q4, #20
-
-       // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       vadd.i32        q0, q0, q1
-       veor            q3, q3, q0
-       vtbl.8          d6, {d6}, d10
-       vtbl.8          d7, {d7}, d10
-
-       // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       vadd.i32        q2, q2, q3
-       veor            q4, q1, q2
-       vshl.u32        q1, q4, #7
-       vsri.u32        q1, q4, #25
-
-       // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-       vext.8          q1, q1, q1, #12
-       // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       vext.8          q2, q2, q2, #8
-       // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-       vext.8          q3, q3, q3, #4
-
-       subs            r3, r3, #2
-       bne             .Ldoubleround
-
-       bx              lr
-ENDPROC(chacha_permute)
-
-ENTRY(chacha_block_xor_neon)
-       // r0: Input state matrix, s
-       // r1: 1 data block output, o
-       // r2: 1 data block input, i
-       // r3: nrounds
-       push            {lr}
-
-       // x0..3 = s0..3
-       add             ip, r0, #0x20
-       vld1.32         {q0-q1}, [r0]
-       vld1.32         {q2-q3}, [ip]
-
-       vmov            q8, q0
-       vmov            q9, q1
-       vmov            q10, q2
-       vmov            q11, q3
-
-       bl              chacha_permute
-
-       add             ip, r2, #0x20
-       vld1.8          {q4-q5}, [r2]
-       vld1.8          {q6-q7}, [ip]
-
-       // o0 = i0 ^ (x0 + s0)
-       vadd.i32        q0, q0, q8
-       veor            q0, q0, q4
-
-       // o1 = i1 ^ (x1 + s1)
-       vadd.i32        q1, q1, q9
-       veor            q1, q1, q5
-
-       // o2 = i2 ^ (x2 + s2)
-       vadd.i32        q2, q2, q10
-       veor            q2, q2, q6
-
-       // o3 = i3 ^ (x3 + s3)
-       vadd.i32        q3, q3, q11
-       veor            q3, q3, q7
-
-       add             ip, r1, #0x20
-       vst1.8          {q0-q1}, [r1]
-       vst1.8          {q2-q3}, [ip]
-
-       pop             {pc}
-ENDPROC(chacha_block_xor_neon)
-
-ENTRY(hchacha_block_neon)
-       // r0: Input state matrix, s
-       // r1: output (8 32-bit words)
-       // r2: nrounds
-       push            {lr}
-
-       vld1.32         {q0-q1}, [r0]!
-       vld1.32         {q2-q3}, [r0]
-
-       mov             r3, r2
-       bl              chacha_permute
-
-       vst1.32         {q0}, [r1]!
-       vst1.32         {q3}, [r1]
-
-       pop             {pc}
-ENDPROC(hchacha_block_neon)
-
-       .align          4
-.Lctrinc:      .word   0, 1, 2, 3
-.Lrol8_table:  .byte   3, 0, 1, 2, 7, 4, 5, 6
-
-       .align          5
-ENTRY(chacha_4block_xor_neon)
-       push            {r4, lr}
-       mov             r4, sp                  // preserve the stack pointer
-       sub             ip, sp, #0x20           // allocate a 32 byte buffer
-       bic             ip, ip, #0x1f           // aligned to 32 bytes
-       mov             sp, ip
-
-       // r0: Input state matrix, s
-       // r1: 4 data blocks output, o
-       // r2: 4 data blocks input, i
-       // r3: nrounds
-
-       //
-       // This function encrypts four consecutive ChaCha blocks by loading
-       // the state matrix in NEON registers four times. The algorithm performs
-       // each operation on the corresponding word of each state matrix, hence
-       // requires no word shuffling. The words are re-interleaved before the
-       // final addition of the original state and the XORing step.
-       //
-
-       // x0..15[0-3] = s0..15[0-3]
-       add             ip, r0, #0x20
-       vld1.32         {q0-q1}, [r0]
-       vld1.32         {q2-q3}, [ip]
-
-       adr             lr, .Lctrinc
-       vdup.32         q15, d7[1]
-       vdup.32         q14, d7[0]
-       vld1.32         {q4}, [lr, :128]
-       vdup.32         q13, d6[1]
-       vdup.32         q12, d6[0]
-       vdup.32         q11, d5[1]
-       vdup.32         q10, d5[0]
-       vadd.u32        q12, q12, q4            // x12 += counter values 0-3
-       vdup.32         q9, d4[1]
-       vdup.32         q8, d4[0]
-       vdup.32         q7, d3[1]
-       vdup.32         q6, d3[0]
-       vdup.32         q5, d2[1]
-       vdup.32         q4, d2[0]
-       vdup.32         q3, d1[1]
-       vdup.32         q2, d1[0]
-       vdup.32         q1, d0[1]
-       vdup.32         q0, d0[0]
-
-       adr             ip, .Lrol8_table
-       b               1f
-
-.Ldoubleround4:
-       vld1.32         {q8-q9}, [sp, :256]
-1:
-       // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-       // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-       // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-       // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-       vadd.i32        q0, q0, q4
-       vadd.i32        q1, q1, q5
-       vadd.i32        q2, q2, q6
-       vadd.i32        q3, q3, q7
-
-       veor            q12, q12, q0
-       veor            q13, q13, q1
-       veor            q14, q14, q2
-       veor            q15, q15, q3
-
-       vrev32.16       q12, q12
-       vrev32.16       q13, q13
-       vrev32.16       q14, q14
-       vrev32.16       q15, q15
-
-       // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-       // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-       // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-       // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-       vadd.i32        q8, q8, q12
-       vadd.i32        q9, q9, q13
-       vadd.i32        q10, q10, q14
-       vadd.i32        q11, q11, q15
-
-       vst1.32         {q8-q9}, [sp, :256]
-
-       veor            q8, q4, q8
-       veor            q9, q5, q9
-       vshl.u32        q4, q8, #12
-       vshl.u32        q5, q9, #12
-       vsri.u32        q4, q8, #20
-       vsri.u32        q5, q9, #20
-
-       veor            q8, q6, q10
-       veor            q9, q7, q11
-       vshl.u32        q6, q8, #12
-       vshl.u32        q7, q9, #12
-       vsri.u32        q6, q8, #20
-       vsri.u32        q7, q9, #20
-
-       // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-       // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-       // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-       // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-       vld1.8          {d16}, [ip, :64]
-       vadd.i32        q0, q0, q4
-       vadd.i32        q1, q1, q5
-       vadd.i32        q2, q2, q6
-       vadd.i32        q3, q3, q7
-
-       veor            q12, q12, q0
-       veor            q13, q13, q1
-       veor            q14, q14, q2
-       veor            q15, q15, q3
-
-       vtbl.8          d24, {d24}, d16
-       vtbl.8          d25, {d25}, d16
-       vtbl.8          d26, {d26}, d16
-       vtbl.8          d27, {d27}, d16
-       vtbl.8          d28, {d28}, d16
-       vtbl.8          d29, {d29}, d16
-       vtbl.8          d30, {d30}, d16
-       vtbl.8          d31, {d31}, d16
-
-       vld1.32         {q8-q9}, [sp, :256]
-
-       // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-       // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-       // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-       // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-       vadd.i32        q8, q8, q12
-       vadd.i32        q9, q9, q13
-       vadd.i32        q10, q10, q14
-       vadd.i32        q11, q11, q15
-
-       vst1.32         {q8-q9}, [sp, :256]
-
-       veor            q8, q4, q8
-       veor            q9, q5, q9
-       vshl.u32        q4, q8, #7
-       vshl.u32        q5, q9, #7
-       vsri.u32        q4, q8, #25
-       vsri.u32        q5, q9, #25
-
-       veor            q8, q6, q10
-       veor            q9, q7, q11
-       vshl.u32        q6, q8, #7
-       vshl.u32        q7, q9, #7
-       vsri.u32        q6, q8, #25
-       vsri.u32        q7, q9, #25
-
-       vld1.32         {q8-q9}, [sp, :256]
-
-       // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-       // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
-       // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-       // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-       vadd.i32        q0, q0, q5
-       vadd.i32        q1, q1, q6
-       vadd.i32        q2, q2, q7
-       vadd.i32        q3, q3, q4
-
-       veor            q15, q15, q0
-       veor            q12, q12, q1
-       veor            q13, q13, q2
-       veor            q14, q14, q3
-
-       vrev32.16       q15, q15
-       vrev32.16       q12, q12
-       vrev32.16       q13, q13
-       vrev32.16       q14, q14
-
-       // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-       // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-       // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-       // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-       vadd.i32        q10, q10, q15
-       vadd.i32        q11, q11, q12
-       vadd.i32        q8, q8, q13
-       vadd.i32        q9, q9, q14
-
-       vst1.32         {q8-q9}, [sp, :256]
-
-       veor            q8, q7, q8
-       veor            q9, q4, q9
-       vshl.u32        q7, q8, #12
-       vshl.u32        q4, q9, #12
-       vsri.u32        q7, q8, #20
-       vsri.u32        q4, q9, #20
-
-       veor            q8, q5, q10
-       veor            q9, q6, q11
-       vshl.u32        q5, q8, #12
-       vshl.u32        q6, q9, #12
-       vsri.u32        q5, q8, #20
-       vsri.u32        q6, q9, #20
-
-       // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-       // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-       // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-       // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-       vld1.8          {d16}, [ip, :64]
-       vadd.i32        q0, q0, q5
-       vadd.i32        q1, q1, q6
-       vadd.i32        q2, q2, q7
-       vadd.i32        q3, q3, q4
-
-       veor            q15, q15, q0
-       veor            q12, q12, q1
-       veor            q13, q13, q2
-       veor            q14, q14, q3
-
-       vtbl.8          d30, {d30}, d16
-       vtbl.8          d31, {d31}, d16
-       vtbl.8          d24, {d24}, d16
-       vtbl.8          d25, {d25}, d16
-       vtbl.8          d26, {d26}, d16
-       vtbl.8          d27, {d27}, d16
-       vtbl.8          d28, {d28}, d16
-       vtbl.8          d29, {d29}, d16
-
-       vld1.32         {q8-q9}, [sp, :256]
-
-       // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-       // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-       // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-       // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-       vadd.i32        q10, q10, q15
-       vadd.i32        q11, q11, q12
-       vadd.i32        q8, q8, q13
-       vadd.i32        q9, q9, q14
-
-       vst1.32         {q8-q9}, [sp, :256]
-
-       veor            q8, q7, q8
-       veor            q9, q4, q9
-       vshl.u32        q7, q8, #7
-       vshl.u32        q4, q9, #7
-       vsri.u32        q7, q8, #25
-       vsri.u32        q4, q9, #25
-
-       veor            q8, q5, q10
-       veor            q9, q6, q11
-       vshl.u32        q5, q8, #7
-       vshl.u32        q6, q9, #7
-       vsri.u32        q5, q8, #25
-       vsri.u32        q6, q9, #25
-
-       subs            r3, r3, #2
-       bne             .Ldoubleround4
-
-       // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
-       // x8..9[0-3] are on the stack.
-
-       // Re-interleave the words in the first two rows of each block (x0..7).
-       // Also add the counter values 0-3 to x12[0-3].
-         vld1.32       {q8}, [lr, :128]        // load counter values 0-3
-       vzip.32         q0, q1                  // => (0 1 0 1) (0 1 0 1)
-       vzip.32         q2, q3                  // => (2 3 2 3) (2 3 2 3)
-       vzip.32         q4, q5                  // => (4 5 4 5) (4 5 4 5)
-       vzip.32         q6, q7                  // => (6 7 6 7) (6 7 6 7)
-         vadd.u32      q12, q8                 // x12 += counter values 0-3
-       vswp            d1, d4
-       vswp            d3, d6
-         vld1.32       {q8-q9}, [r0]!          // load s0..7
-       vswp            d9, d12
-       vswp            d11, d14
-
-       // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
-       // after XORing the first 32 bytes.
-       vswp            q1, q4
-
-       // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
-
-       // x0..3[0-3] += s0..3[0-3]     (add orig state to 1st row of each block)
-       vadd.u32        q0, q0, q8
-       vadd.u32        q2, q2, q8
-       vadd.u32        q4, q4, q8
-       vadd.u32        q3, q3, q8
-
-       // x4..7[0-3] += s4..7[0-3]     (add orig state to 2nd row of each block)
-       vadd.u32        q1, q1, q9
-       vadd.u32        q6, q6, q9
-       vadd.u32        q5, q5, q9
-       vadd.u32        q7, q7, q9
-
-       // XOR first 32 bytes using keystream from first two rows of first block
-       vld1.8          {q8-q9}, [r2]!
-       veor            q8, q8, q0
-       veor            q9, q9, q1
-       vst1.8          {q8-q9}, [r1]!
-
-       // Re-interleave the words in the last two rows of each block (x8..15).
-       vld1.32         {q8-q9}, [sp, :256]
-         mov           sp, r4          // restore original stack pointer
-         ldr           r4, [r4, #8]    // load number of bytes
-       vzip.32         q12, q13        // => (12 13 12 13) (12 13 12 13)
-       vzip.32         q14, q15        // => (14 15 14 15) (14 15 14 15)
-       vzip.32         q8, q9          // => (8 9 8 9) (8 9 8 9)
-       vzip.32         q10, q11        // => (10 11 10 11) (10 11 10 11)
-         vld1.32       {q0-q1}, [r0]   // load s8..15
-       vswp            d25, d28
-       vswp            d27, d30
-       vswp            d17, d20
-       vswp            d19, d22
-
-       // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
-
-       // x8..11[0-3] += s8..11[0-3]   (add orig state to 3rd row of each block)
-       vadd.u32        q8,  q8,  q0
-       vadd.u32        q10, q10, q0
-       vadd.u32        q9,  q9,  q0
-       vadd.u32        q11, q11, q0
-
-       // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
-       vadd.u32        q12, q12, q1
-       vadd.u32        q14, q14, q1
-       vadd.u32        q13, q13, q1
-       vadd.u32        q15, q15, q1
-
-       // XOR the rest of the data with the keystream
-
-       vld1.8          {q0-q1}, [r2]!
-       subs            r4, r4, #96
-       veor            q0, q0, q8
-       veor            q1, q1, q12
-       ble             .Lle96
-       vst1.8          {q0-q1}, [r1]!
-
-       vld1.8          {q0-q1}, [r2]!
-       subs            r4, r4, #32
-       veor            q0, q0, q2
-       veor            q1, q1, q6
-       ble             .Lle128
-       vst1.8          {q0-q1}, [r1]!
-
-       vld1.8          {q0-q1}, [r2]!
-       subs            r4, r4, #32
-       veor            q0, q0, q10
-       veor            q1, q1, q14
-       ble             .Lle160
-       vst1.8          {q0-q1}, [r1]!
-
-       vld1.8          {q0-q1}, [r2]!
-       subs            r4, r4, #32
-       veor            q0, q0, q4
-       veor            q1, q1, q5
-       ble             .Lle192
-       vst1.8          {q0-q1}, [r1]!
-
-       vld1.8          {q0-q1}, [r2]!
-       subs            r4, r4, #32
-       veor            q0, q0, q9
-       veor            q1, q1, q13
-       ble             .Lle224
-       vst1.8          {q0-q1}, [r1]!
-
-       vld1.8          {q0-q1}, [r2]!
-       subs            r4, r4, #32
-       veor            q0, q0, q3
-       veor            q1, q1, q7
-       blt             .Llt256
-.Lout:
-       vst1.8          {q0-q1}, [r1]!
-
-       vld1.8          {q0-q1}, [r2]
-       veor            q0, q0, q11
-       veor            q1, q1, q15
-       vst1.8          {q0-q1}, [r1]
-
-       pop             {r4, pc}
-
-.Lle192:
-       vmov            q4, q9
-       vmov            q5, q13
-
-.Lle160:
-       // nothing to do
-
-.Lfinalblock:
-       // Process the final block if processing less than 4 full blocks.
-       // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
-       // previous 32 byte output block that still needs to be written at
-       // [r1] in q0-q1.
-       beq             .Lfullblock
-
-.Lpartialblock:
-       adr             lr, .Lpermute + 32
-       add             r2, r2, r4
-       add             lr, lr, r4
-       add             r4, r4, r1
-
-       vld1.8          {q2-q3}, [lr]
-       vld1.8          {q6-q7}, [r2]
-
-       add             r4, r4, #32
-
-       vtbl.8          d4, {q4-q5}, d4
-       vtbl.8          d5, {q4-q5}, d5
-       vtbl.8          d6, {q4-q5}, d6
-       vtbl.8          d7, {q4-q5}, d7
-
-       veor            q6, q6, q2
-       veor            q7, q7, q3
-
-       vst1.8          {q6-q7}, [r4]   // overlapping stores
-       vst1.8          {q0-q1}, [r1]
-       pop             {r4, pc}
-
-.Lfullblock:
-       vmov            q11, q4
-       vmov            q15, q5
-       b               .Lout
-.Lle96:
-       vmov            q4, q2
-       vmov            q5, q6
-       b               .Lfinalblock
-.Lle128:
-       vmov            q4, q10
-       vmov            q5, q14
-       b               .Lfinalblock
-.Lle224:
-       vmov            q4, q3
-       vmov            q5, q7
-       b               .Lfinalblock
-.Llt256:
-       vmov            q4, q11
-       vmov            q5, q15
-       b               .Lpartialblock
-ENDPROC(chacha_4block_xor_neon)
-
-       .align          L1_CACHE_SHIFT
-.Lpermute:
-       .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
-       .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-       .byte           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
-       .byte           0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
-       .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
-       .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-       .byte           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
-       .byte           0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
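
For orientation: the NEON routines above vectorize the standard ChaCha
quarter-round from RFC 8439, operating on four 32-bit state words per 128-bit
register.  A minimal portable C reference of the quarter-round follows (a sketch
for comparison only, not part of this diff; the function names are illustrative):

#include <stdint.h>

/* Rotate a 32-bit word left by n bits (0 < n < 32). */
static inline uint32_t rol32(uint32_t v, unsigned int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One ChaCha quarter-round on state words x[a], x[b], x[c], x[d]. */
static inline void chacha_quarterround(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rol32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rol32(x[b] ^ x[c], 7);
}

In the NEON code, the 16-bit rotation maps to vrev32.16, the 12- and 7-bit
rotations to vshl.u32 + vsri.u32, and the 8-bit rotation to a vtbl.8 byte
permutation, as explained in the comment block at the top of the file.
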
diff --git a/arch/arm/lib/crypto/chacha-scalar-core.S b/arch/arm/lib/crypto/chacha-scalar-core.S
deleted file mode 100644 (file)
index 4951df0..0000000
+++ /dev/null
@@ -1,444 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2018 Google, Inc.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-/*
- * Design notes:
- *
- * 16 registers would be needed to hold the state matrix, but only 14 are
- * available because 'sp' and 'pc' cannot be used.  So we spill the elements
- * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
- * 'ldrd' and one 'strd' instruction per round.
- *
- * All rotates are performed using the implicit rotate operand accepted by the
- * 'add' and 'eor' instructions.  This is faster than using explicit rotate
- * instructions.  To make this work, we allow the values in the second and last
- * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
- * wrong rotation amount.  The rotation amount is then fixed up just in time
- * when the values are used.  'brot' is the number of bits the values in row 'b'
- * need to be rotated right to arrive at the correct values, and 'drot'
- * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
- * that they end up as (25, 24) after every round.
- */
-
-       // ChaCha state registers
-       X0      .req    r0
-       X1      .req    r1
-       X2      .req    r2
-       X3      .req    r3
-       X4      .req    r4
-       X5      .req    r5
-       X6      .req    r6
-       X7      .req    r7
-       X8_X10  .req    r8      // shared by x8 and x10
-       X9_X11  .req    r9      // shared by x9 and x11
-       X12     .req    r10
-       X13     .req    r11
-       X14     .req    r12
-       X15     .req    r14
-
-.macro _le32_bswap_4x  a, b, c, d,  tmp
-#ifdef __ARMEB__
-       rev_l           \a,  \tmp
-       rev_l           \b,  \tmp
-       rev_l           \c,  \tmp
-       rev_l           \d,  \tmp
-#endif
-.endm
-
-.macro __ldrd          a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
-       ldrd            \a, \b, [\src, #\offset]
-#else
-       ldr             \a, [\src, #\offset]
-       ldr             \b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd          a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
-       strd            \a, \b, [\dst, #\offset]
-#else
-       str             \a, [\dst, #\offset]
-       str             \b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _halfround      a1, b1, c1, d1,  a2, b2, c2, d2
-
-       // a += b; d ^= a; d = rol(d, 16);
-       add             \a1, \a1, \b1, ror #brot
-       add             \a2, \a2, \b2, ror #brot
-       eor             \d1, \a1, \d1, ror #drot
-       eor             \d2, \a2, \d2, ror #drot
-       // drot == 32 - 16 == 16
-
-       // c += d; b ^= c; b = rol(b, 12);
-       add             \c1, \c1, \d1, ror #16
-       add             \c2, \c2, \d2, ror #16
-       eor             \b1, \c1, \b1, ror #brot
-       eor             \b2, \c2, \b2, ror #brot
-       // brot == 32 - 12 == 20
-
-       // a += b; d ^= a; d = rol(d, 8);
-       add             \a1, \a1, \b1, ror #20
-       add             \a2, \a2, \b2, ror #20
-       eor             \d1, \a1, \d1, ror #16
-       eor             \d2, \a2, \d2, ror #16
-       // drot == 32 - 8 == 24
-
-       // c += d; b ^= c; b = rol(b, 7);
-       add             \c1, \c1, \d1, ror #24
-       add             \c2, \c2, \d2, ror #24
-       eor             \b1, \c1, \b1, ror #20
-       eor             \b2, \c2, \b2, ror #20
-       // brot == 32 - 7 == 25
-.endm
-
-.macro _doubleround
-
-       // column round
-
-       // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
-       _halfround      X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
-
-       // save (x8, x9); restore (x10, x11)
-       __strd          X8_X10, X9_X11, sp, 0
-       __ldrd          X8_X10, X9_X11, sp, 8
-
-       // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
-       _halfround      X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
-
-       .set brot, 25
-       .set drot, 24
-
-       // diagonal round
-
-       // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
-       _halfround      X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
-
-       // save (x10, x11); restore (x8, x9)
-       __strd          X8_X10, X9_X11, sp, 8
-       __ldrd          X8_X10, X9_X11, sp, 0
-
-       // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
-       _halfround      X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
-.endm
-
-.macro _chacha_permute nrounds
-       .set brot, 0
-       .set drot, 0
-       .rept \nrounds / 2
-        _doubleround
-       .endr
-.endm
-
-.macro _chacha         nrounds
-
-.Lnext_block\@:
-       // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
-       // Registers contain x0-x9,x12-x15.
-
-       // Do the core ChaCha permutation to update x0-x15.
-       _chacha_permute \nrounds
-
-       add             sp, #8
-       // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
-       // Registers contain x0-x9,x12-x15.
-       // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
-       // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
-       push            {X8_X10, X9_X11, X12, X13, X14, X15}
-
-       // Load (OUT, IN, LEN).
-       ldr             r14, [sp, #96]
-       ldr             r12, [sp, #100]
-       ldr             r11, [sp, #104]
-
-       orr             r10, r14, r12
-
-       // Use slow path if fewer than 64 bytes remain.
-       cmp             r11, #64
-       blt             .Lxor_slowpath\@
-
-       // Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
-       // ARMv6+, since ldmia and stmia (used below) still require alignment.
-       tst             r10, #3
-       bne             .Lxor_slowpath\@
-
-       // Fast path: XOR 64 bytes of aligned data.
-
-       // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
-       // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
-       // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
-       // x0-x3
-       __ldrd          r8, r9, sp, 32
-       __ldrd          r10, r11, sp, 40
-       add             X0, X0, r8
-       add             X1, X1, r9
-       add             X2, X2, r10
-       add             X3, X3, r11
-       _le32_bswap_4x  X0, X1, X2, X3,  r8
-       ldmia           r12!, {r8-r11}
-       eor             X0, X0, r8
-       eor             X1, X1, r9
-       eor             X2, X2, r10
-       eor             X3, X3, r11
-       stmia           r14!, {X0-X3}
-
-       // x4-x7
-       __ldrd          r8, r9, sp, 48
-       __ldrd          r10, r11, sp, 56
-       add             X4, r8, X4, ror #brot
-       add             X5, r9, X5, ror #brot
-       ldmia           r12!, {X0-X3}
-       add             X6, r10, X6, ror #brot
-       add             X7, r11, X7, ror #brot
-       _le32_bswap_4x  X4, X5, X6, X7,  r8
-       eor             X4, X4, X0
-       eor             X5, X5, X1
-       eor             X6, X6, X2
-       eor             X7, X7, X3
-       stmia           r14!, {X4-X7}
-
-       // x8-x15
-       pop             {r0-r7}                 // (x8-x9,x12-x15,x10-x11)
-       __ldrd          r8, r9, sp, 32
-       __ldrd          r10, r11, sp, 40
-       add             r0, r0, r8              // x8
-       add             r1, r1, r9              // x9
-       add             r6, r6, r10             // x10
-       add             r7, r7, r11             // x11
-       _le32_bswap_4x  r0, r1, r6, r7,  r8
-       ldmia           r12!, {r8-r11}
-       eor             r0, r0, r8              // x8
-       eor             r1, r1, r9              // x9
-       eor             r6, r6, r10             // x10
-       eor             r7, r7, r11             // x11
-       stmia           r14!, {r0,r1,r6,r7}
-       ldmia           r12!, {r0,r1,r6,r7}
-       __ldrd          r8, r9, sp, 48
-       __ldrd          r10, r11, sp, 56
-       add             r2, r8, r2, ror #drot   // x12
-       add             r3, r9, r3, ror #drot   // x13
-       add             r4, r10, r4, ror #drot  // x14
-       add             r5, r11, r5, ror #drot  // x15
-       _le32_bswap_4x  r2, r3, r4, r5,  r9
-         ldr           r9, [sp, #72]           // load LEN
-       eor             r2, r2, r0              // x12
-       eor             r3, r3, r1              // x13
-       eor             r4, r4, r6              // x14
-       eor             r5, r5, r7              // x15
-         subs          r9, #64                 // decrement and check LEN
-       stmia           r14!, {r2-r5}
-
-       beq             .Ldone\@
-
-.Lprepare_for_next_block\@:
-
-       // Stack: x0-x15 OUT IN LEN
-
-       // Increment block counter (x12)
-       add             r8, #1
-
-       // Store updated (OUT, IN, LEN)
-       str             r14, [sp, #64]
-       str             r12, [sp, #68]
-       str             r9, [sp, #72]
-
-         mov           r14, sp
-
-       // Store updated block counter (x12)
-       str             r8, [sp, #48]
-
-         sub           sp, #16
-
-       // Reload state and do next block
-       ldmia           r14!, {r0-r11}          // load x0-x11
-       __strd          r10, r11, sp, 8         // store x10-x11 before state
-       ldmia           r14, {r10-r12,r14}      // load x12-x15
-       b               .Lnext_block\@
-
-.Lxor_slowpath\@:
-       // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
-       // We handle it by storing the 64 bytes of keystream to the stack, then
-       // XOR-ing the needed portion with the data.
-
-       // Allocate keystream buffer
-       sub             sp, #64
-       mov             r14, sp
-
-       // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
-       // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
-       // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
-       // Save keystream for x0-x3
-       __ldrd          r8, r9, sp, 96
-       __ldrd          r10, r11, sp, 104
-       add             X0, X0, r8
-       add             X1, X1, r9
-       add             X2, X2, r10
-       add             X3, X3, r11
-       _le32_bswap_4x  X0, X1, X2, X3,  r8
-       stmia           r14!, {X0-X3}
-
-       // Save keystream for x4-x7
-       __ldrd          r8, r9, sp, 112
-       __ldrd          r10, r11, sp, 120
-       add             X4, r8, X4, ror #brot
-       add             X5, r9, X5, ror #brot
-       add             X6, r10, X6, ror #brot
-       add             X7, r11, X7, ror #brot
-       _le32_bswap_4x  X4, X5, X6, X7,  r8
-         add           r8, sp, #64
-       stmia           r14!, {X4-X7}
-
-       // Save keystream for x8-x15
-       ldm             r8, {r0-r7}             // (x8-x9,x12-x15,x10-x11)
-       __ldrd          r8, r9, sp, 128
-       __ldrd          r10, r11, sp, 136
-       add             r0, r0, r8              // x8
-       add             r1, r1, r9              // x9
-       add             r6, r6, r10             // x10
-       add             r7, r7, r11             // x11
-       _le32_bswap_4x  r0, r1, r6, r7,  r8
-       stmia           r14!, {r0,r1,r6,r7}
-       __ldrd          r8, r9, sp, 144
-       __ldrd          r10, r11, sp, 152
-       add             r2, r8, r2, ror #drot   // x12
-       add             r3, r9, r3, ror #drot   // x13
-       add             r4, r10, r4, ror #drot  // x14
-       add             r5, r11, r5, ror #drot  // x15
-       _le32_bswap_4x  r2, r3, r4, r5,  r9
-       stmia           r14, {r2-r5}
-
-       // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
-       // Registers: r8 is block counter, r12 is IN.
-
-       ldr             r9, [sp, #168]          // LEN
-       ldr             r14, [sp, #160]         // OUT
-       cmp             r9, #64
-         mov           r0, sp
-       movle           r1, r9
-       movgt           r1, #64
-       // r1 is number of bytes to XOR, in range [1, 64]
-
-.if __LINUX_ARM_ARCH__ < 6
-       orr             r2, r12, r14
-       tst             r2, #3                  // IN or OUT misaligned?
-       bne             .Lxor_next_byte\@
-.endif
-
-       // XOR a word at a time
-.rept 16
-       subs            r1, #4
-       blt             .Lxor_words_done\@
-       ldr             r2, [r12], #4
-       ldr             r3, [r0], #4
-       eor             r2, r2, r3
-       str             r2, [r14], #4
-.endr
-       b               .Lxor_slowpath_done\@
-.Lxor_words_done\@:
-       ands            r1, r1, #3
-       beq             .Lxor_slowpath_done\@
-
-       // XOR a byte at a time
-.Lxor_next_byte\@:
-       ldrb            r2, [r12], #1
-       ldrb            r3, [r0], #1
-       eor             r2, r2, r3
-       strb            r2, [r14], #1
-       subs            r1, #1
-       bne             .Lxor_next_byte\@
-
-.Lxor_slowpath_done\@:
-       subs            r9, #64
-       add             sp, #96
-       bgt             .Lprepare_for_next_block\@
-
-.Ldone\@:
-.endm  // _chacha
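The .Lxor_slowpath handling in the macro above stores one 64-byte keystream block to the stack and then XORs only the bytes that are actually needed, a word at a time where alignment allows and a byte at a time otherwise. As a rough C illustration of that idea only (a sketch, not code that exists in this tree; kernel u8 types assumed):

        /* Hypothetical helper, for illustration of the slow path only. */
        static void chacha_xor_tail_sketch(u8 *dst, const u8 *src,
                                           const u8 keystream[64],
                                           unsigned int bytes)
        {
                unsigned int i;

                /* XOR at most one block of data with the saved keystream. */
                for (i = 0; i < bytes && i < 64; i++)
                        dst[i] = src[i] ^ keystream[i];
        }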
-
-/*
- * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
- *                  const struct chacha_state *state, int nrounds);
- */
-ENTRY(chacha_doarm)
-       cmp             r2, #0                  // len == 0?
-       reteq           lr
-
-       ldr             ip, [sp]
-       cmp             ip, #12
-
-       push            {r0-r2,r4-r11,lr}
-
-       // Push state x0-x15 onto stack.
-       // Also store an extra copy of x10-x11 just before the state.
-
-       add             X12, r3, #48
-       ldm             X12, {X12,X13,X14,X15}
-       push            {X12,X13,X14,X15}
-       sub             sp, sp, #64
-
-       __ldrd          X8_X10, X9_X11, r3, 40
-       __strd          X8_X10, X9_X11, sp, 8
-       __strd          X8_X10, X9_X11, sp, 56
-       ldm             r3, {X0-X9_X11}
-       __strd          X0, X1, sp, 16
-       __strd          X2, X3, sp, 24
-       __strd          X4, X5, sp, 32
-       __strd          X6, X7, sp, 40
-       __strd          X8_X10, X9_X11, sp, 48
-
-       beq             1f
-       _chacha         20
-
-0:     add             sp, #76
-       pop             {r4-r11, pc}
-
-1:     _chacha         12
-       b               0b
-ENDPROC(chacha_doarm)
-
-/*
- * void hchacha_block_arm(const struct chacha_state *state,
- *                       u32 out[HCHACHA_OUT_WORDS], int nrounds);
- */
-ENTRY(hchacha_block_arm)
-       push            {r1,r4-r11,lr}
-
-       cmp             r2, #12                 // ChaCha12 ?
-
-       mov             r14, r0
-       ldmia           r14!, {r0-r11}          // load x0-x11
-       push            {r10-r11}               // store x10-x11 to stack
-       ldm             r14, {r10-r12,r14}      // load x12-x15
-       sub             sp, #8
-
-       beq             1f
-       _chacha_permute 20
-
-       // Skip over (unused0-unused1, x10-x11)
-0:     add             sp, #16
-
-       // Fix up rotations of x12-x15
-       ror             X12, X12, #drot
-       ror             X13, X13, #drot
-         pop           {r4}                    // load 'out'
-       ror             X14, X14, #drot
-       ror             X15, X15, #drot
-
-       // Store (x0-x3,x12-x15) to 'out'
-       stm             r4, {X0,X1,X2,X3,X12,X13,X14,X15}
-
-       pop             {r4-r11,pc}
-
-1:     _chacha_permute 12
-       b               0b
-ENDPROC(hchacha_block_arm)
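Below is a minimal sketch of how the two entry points above could be driven from C, relying only on the prototypes given in the comments. The function names are illustrative, and the real glue code (which also handles NEON dispatch and counter bookkeeping) is not shown here.

        /* Illustrative callers only; not code that exists in the tree. */
        static void chacha20_arm_crypt_sketch(const struct chacha_state *state,
                                              u8 *dst, const u8 *src,
                                              unsigned int bytes)
        {
                /* Generate keystream from 'state' and XOR it into src -> dst. */
                chacha_doarm(dst, src, bytes, state, 20);
        }

        static void hchacha20_arm_sketch(const struct chacha_state *state,
                                         u32 out[HCHACHA_OUT_WORDS])
        {
                /* One 20-round permutation; used for XChaCha subkey derivation. */
                hchacha_block_arm(state, out, 20);
        }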
diff --git a/arch/arm/lib/crypto/poly1305-armv4.pl b/arch/arm/lib/crypto/poly1305-armv4.pl
deleted file mode 100644 (file)
index d57c6e2..0000000
+++ /dev/null
@@ -1,1236 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-#
-# ====================================================================
-# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-# project.
-# ====================================================================
-#
-#                      IALU(*)/gcc-4.4         NEON
-#
-# ARM11xx(ARMv6)       7.78/+100%              -
-# Cortex-A5            6.35/+130%              3.00
-# Cortex-A8            6.25/+115%              2.36
-# Cortex-A9            5.10/+95%               2.55
-# Cortex-A15           3.85/+85%               1.25(**)
-# Snapdragon S4        5.70/+100%              1.48(**)
-#
-# (*)  this is for -march=armv6, i.e. with a bunch of ldrb loads of the data;
-# (**) these are trade-off results; they can be improved by ~8%, but at the
-#      cost of a 15/12% regression on Cortex-A5/A7.  It is even possible to
-#      improve the Cortex-A9 result, but then A5/A7 lose more than 20%.
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
-    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
-    die "can't locate arm-xlate.pl";
-
-    open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
-    open STDOUT,">$output";
-}
-
-($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
-
-$code.=<<___;
-#ifndef        __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define poly1305_init   poly1305_block_init_arch
-# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit   poly1305_emit_arch
-.globl poly1305_blocks_neon
-#endif
-
-#if defined(__thumb2__)
-.syntax        unified
-.thumb
-#else
-.code  32
-#endif
-
-.text
-
-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type  poly1305_init,%function
-.align 5
-poly1305_init:
-.Lpoly1305_init:
-       stmdb   sp!,{r4-r11}
-
-       eor     r3,r3,r3
-       cmp     $inp,#0
-       str     r3,[$ctx,#0]            @ zero hash value
-       str     r3,[$ctx,#4]
-       str     r3,[$ctx,#8]
-       str     r3,[$ctx,#12]
-       str     r3,[$ctx,#16]
-       str     r3,[$ctx,#36]           @ clear is_base2_26
-       add     $ctx,$ctx,#20
-
-#ifdef __thumb2__
-       it      eq
-#endif
-       moveq   r0,#0
-       beq     .Lno_key
-
-#if    __ARM_MAX_ARCH__>=7
-       mov     r3,#-1
-       str     r3,[$ctx,#28]           @ impossible key power value
-# ifndef __KERNEL__
-       adr     r11,.Lpoly1305_init
-       ldr     r12,.LOPENSSL_armcap
-# endif
-#endif
-       ldrb    r4,[$inp,#0]
-       mov     r10,#0x0fffffff
-       ldrb    r5,[$inp,#1]
-       and     r3,r10,#-4              @ 0x0ffffffc
-       ldrb    r6,[$inp,#2]
-       ldrb    r7,[$inp,#3]
-       orr     r4,r4,r5,lsl#8
-       ldrb    r5,[$inp,#4]
-       orr     r4,r4,r6,lsl#16
-       ldrb    r6,[$inp,#5]
-       orr     r4,r4,r7,lsl#24
-       ldrb    r7,[$inp,#6]
-       and     r4,r4,r10
-
-#if    __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-# if !defined(_WIN32)
-       ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
-# endif
-# if defined(__APPLE__) || defined(_WIN32)
-       ldr     r12,[r12]
-# endif
-#endif
-       ldrb    r8,[$inp,#7]
-       orr     r5,r5,r6,lsl#8
-       ldrb    r6,[$inp,#8]
-       orr     r5,r5,r7,lsl#16
-       ldrb    r7,[$inp,#9]
-       orr     r5,r5,r8,lsl#24
-       ldrb    r8,[$inp,#10]
-       and     r5,r5,r3
-
-#if    __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-       tst     r12,#ARMV7_NEON         @ check for NEON
-# ifdef        __thumb2__
-       adr     r9,.Lpoly1305_blocks_neon
-       adr     r11,.Lpoly1305_blocks
-       it      ne
-       movne   r11,r9
-       adr     r12,.Lpoly1305_emit
-       orr     r11,r11,#1              @ thumb-ify addresses
-       orr     r12,r12,#1
-# else
-       add     r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
-       ite     eq
-       addeq   r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
-       addne   r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
-# endif
-#endif
-       ldrb    r9,[$inp,#11]
-       orr     r6,r6,r7,lsl#8
-       ldrb    r7,[$inp,#12]
-       orr     r6,r6,r8,lsl#16
-       ldrb    r8,[$inp,#13]
-       orr     r6,r6,r9,lsl#24
-       ldrb    r9,[$inp,#14]
-       and     r6,r6,r3
-
-       ldrb    r10,[$inp,#15]
-       orr     r7,r7,r8,lsl#8
-       str     r4,[$ctx,#0]
-       orr     r7,r7,r9,lsl#16
-       str     r5,[$ctx,#4]
-       orr     r7,r7,r10,lsl#24
-       str     r6,[$ctx,#8]
-       and     r7,r7,r3
-       str     r7,[$ctx,#12]
-#if    __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-       stmia   r2,{r11,r12}            @ fill functions table
-       mov     r0,#1
-#else
-       mov     r0,#0
-#endif
-.Lno_key:
-       ldmia   sp!,{r4-r11}
-#if    __ARM_ARCH__>=5
-       ret                             @ bx    lr
-#else
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       bx      lr                      @ interoperable with Thumb ISA:-)
-#endif
-.size  poly1305_init,.-poly1305_init
-___
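The masks 0x0fffffff and 0x0ffffffc applied while loading the key above implement the standard Poly1305 clamping of r. A plain-C sketch of the same step, for illustration only (it assumes the kernel's get_unaligned_le32() helper in place of the byte-by-byte ldrb loads done in the assembly):

        /* Illustrative only: clamp the first half of the 32-byte key into r. */
        static void poly1305_clamp_r_sketch(u32 r[4], const u8 raw_key[16])
        {
                r[0] = get_unaligned_le32(raw_key +  0) & 0x0fffffff;
                r[1] = get_unaligned_le32(raw_key +  4) & 0x0ffffffc;
                r[2] = get_unaligned_le32(raw_key +  8) & 0x0ffffffc;
                r[3] = get_unaligned_le32(raw_key + 12) & 0x0ffffffc;
        }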
-{
-my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
-my ($s1,$s2,$s3)=($r1,$r2,$r3);
-
-$code.=<<___;
-.type  poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
-       stmdb   sp!,{r3-r11,lr}
-
-       ands    $len,$len,#-16
-       beq     .Lno_data
-
-       add     $len,$len,$inp          @ end pointer
-       sub     sp,sp,#32
-
-#if __ARM_ARCH__<7
-       ldmia   $ctx,{$h0-$r3}          @ load context
-       add     $ctx,$ctx,#20
-       str     $len,[sp,#16]           @ offload stuff
-       str     $ctx,[sp,#12]
-#else
-       ldr     lr,[$ctx,#36]           @ is_base2_26
-       ldmia   $ctx!,{$h0-$h4}         @ load hash value
-       str     $len,[sp,#16]           @ offload stuff
-       str     $ctx,[sp,#12]
-
-       adds    $r0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
-       mov     $r1,$h1,lsr#6
-       adcs    $r1,$r1,$h2,lsl#20
-       mov     $r2,$h2,lsr#12
-       adcs    $r2,$r2,$h3,lsl#14
-       mov     $r3,$h3,lsr#18
-       adcs    $r3,$r3,$h4,lsl#8
-       mov     $len,#0
-       teq     lr,#0
-       str     $len,[$ctx,#16]         @ clear is_base2_26
-       adc     $len,$len,$h4,lsr#24
-
-       itttt   ne
-       movne   $h0,$r0                 @ choose between radixes
-       movne   $h1,$r1
-       movne   $h2,$r2
-       movne   $h3,$r3
-       ldmia   $ctx,{$r0-$r3}          @ load key
-       it      ne
-       movne   $h4,$len
-#endif
-
-       mov     lr,$inp
-       cmp     $padbit,#0
-       str     $r1,[sp,#20]
-       str     $r2,[sp,#24]
-       str     $r3,[sp,#28]
-       b       .Loop
-
-.align 4
-.Loop:
-#if __ARM_ARCH__<7
-       ldrb    r0,[lr],#16             @ load input
-# ifdef        __thumb2__
-       it      hi
-# endif
-       addhi   $h4,$h4,#1              @ 1<<128
-       ldrb    r1,[lr,#-15]
-       ldrb    r2,[lr,#-14]
-       ldrb    r3,[lr,#-13]
-       orr     r1,r0,r1,lsl#8
-       ldrb    r0,[lr,#-12]
-       orr     r2,r1,r2,lsl#16
-       ldrb    r1,[lr,#-11]
-       orr     r3,r2,r3,lsl#24
-       ldrb    r2,[lr,#-10]
-       adds    $h0,$h0,r3              @ accumulate input
-
-       ldrb    r3,[lr,#-9]
-       orr     r1,r0,r1,lsl#8
-       ldrb    r0,[lr,#-8]
-       orr     r2,r1,r2,lsl#16
-       ldrb    r1,[lr,#-7]
-       orr     r3,r2,r3,lsl#24
-       ldrb    r2,[lr,#-6]
-       adcs    $h1,$h1,r3
-
-       ldrb    r3,[lr,#-5]
-       orr     r1,r0,r1,lsl#8
-       ldrb    r0,[lr,#-4]
-       orr     r2,r1,r2,lsl#16
-       ldrb    r1,[lr,#-3]
-       orr     r3,r2,r3,lsl#24
-       ldrb    r2,[lr,#-2]
-       adcs    $h2,$h2,r3
-
-       ldrb    r3,[lr,#-1]
-       orr     r1,r0,r1,lsl#8
-       str     lr,[sp,#8]              @ offload input pointer
-       orr     r2,r1,r2,lsl#16
-       add     $s1,$r1,$r1,lsr#2
-       orr     r3,r2,r3,lsl#24
-#else
-       ldr     r0,[lr],#16             @ load input
-       it      hi
-       addhi   $h4,$h4,#1              @ padbit
-       ldr     r1,[lr,#-12]
-       ldr     r2,[lr,#-8]
-       ldr     r3,[lr,#-4]
-# ifdef        __ARMEB__
-       rev     r0,r0
-       rev     r1,r1
-       rev     r2,r2
-       rev     r3,r3
-# endif
-       adds    $h0,$h0,r0              @ accumulate input
-       str     lr,[sp,#8]              @ offload input pointer
-       adcs    $h1,$h1,r1
-       add     $s1,$r1,$r1,lsr#2
-       adcs    $h2,$h2,r2
-#endif
-       add     $s2,$r2,$r2,lsr#2
-       adcs    $h3,$h3,r3
-       add     $s3,$r3,$r3,lsr#2
-
-       umull   r2,r3,$h1,$r0
-        adc    $h4,$h4,#0
-       umull   r0,r1,$h0,$r0
-       umlal   r2,r3,$h4,$s1
-       umlal   r0,r1,$h3,$s1
-       ldr     $r1,[sp,#20]            @ reload $r1
-       umlal   r2,r3,$h2,$s3
-       umlal   r0,r1,$h1,$s3
-       umlal   r2,r3,$h3,$s2
-       umlal   r0,r1,$h2,$s2
-       umlal   r2,r3,$h0,$r1
-       str     r0,[sp,#0]              @ future $h0
-        mul    r0,$s2,$h4
-       ldr     $r2,[sp,#24]            @ reload $r2
-       adds    r2,r2,r1                @ d1+=d0>>32
-        eor    r1,r1,r1
-       adc     lr,r3,#0                @ future $h2
-       str     r2,[sp,#4]              @ future $h1
-
-       mul     r2,$s3,$h4
-       eor     r3,r3,r3
-       umlal   r0,r1,$h3,$s3
-       ldr     $r3,[sp,#28]            @ reload $r3
-       umlal   r2,r3,$h3,$r0
-       umlal   r0,r1,$h2,$r0
-       umlal   r2,r3,$h2,$r1
-       umlal   r0,r1,$h1,$r1
-       umlal   r2,r3,$h1,$r2
-       umlal   r0,r1,$h0,$r2
-       umlal   r2,r3,$h0,$r3
-       ldr     $h0,[sp,#0]
-       mul     $h4,$r0,$h4
-       ldr     $h1,[sp,#4]
-
-       adds    $h2,lr,r0               @ d2+=d1>>32
-       ldr     lr,[sp,#8]              @ reload input pointer
-       adc     r1,r1,#0
-       adds    $h3,r2,r1               @ d3+=d2>>32
-       ldr     r0,[sp,#16]             @ reload end pointer
-       adc     r3,r3,#0
-       add     $h4,$h4,r3              @ h4+=d3>>32
-
-       and     r1,$h4,#-4
-       and     $h4,$h4,#3
-       add     r1,r1,r1,lsr#2          @ *=5
-       adds    $h0,$h0,r1
-       adcs    $h1,$h1,#0
-       adcs    $h2,$h2,#0
-       adcs    $h3,$h3,#0
-       adc     $h4,$h4,#0
-
-       cmp     r0,lr                   @ done yet?
-       bhi     .Loop
-
-       ldr     $ctx,[sp,#12]
-       add     sp,sp,#32
-       stmdb   $ctx,{$h0-$h4}          @ store the result
-
-.Lno_data:
-#if    __ARM_ARCH__>=5
-       ldmia   sp!,{r3-r11,pc}
-#else
-       ldmia   sp!,{r3-r11,lr}
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       bx      lr                      @ interoperable with Thumb ISA:-)
-#endif
-.size  poly1305_blocks,.-poly1305_blocks
-___
-}
-{
-my ($ctx,$mac,$nonce)=map("r$_",(0..2));
-my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
-my $g4=$ctx;
-
-$code.=<<___;
-.type  poly1305_emit,%function
-.align 5
-poly1305_emit:
-.Lpoly1305_emit:
-       stmdb   sp!,{r4-r11}
-
-       ldmia   $ctx,{$h0-$h4}
-
-#if __ARM_ARCH__>=7
-       ldr     ip,[$ctx,#36]           @ is_base2_26
-
-       adds    $g0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
-       mov     $g1,$h1,lsr#6
-       adcs    $g1,$g1,$h2,lsl#20
-       mov     $g2,$h2,lsr#12
-       adcs    $g2,$g2,$h3,lsl#14
-       mov     $g3,$h3,lsr#18
-       adcs    $g3,$g3,$h4,lsl#8
-       mov     $g4,#0
-       adc     $g4,$g4,$h4,lsr#24
-
-       tst     ip,ip
-       itttt   ne
-       movne   $h0,$g0
-       movne   $h1,$g1
-       movne   $h2,$g2
-       movne   $h3,$g3
-       it      ne
-       movne   $h4,$g4
-#endif
-
-       adds    $g0,$h0,#5              @ compare to modulus
-       adcs    $g1,$h1,#0
-       adcs    $g2,$h2,#0
-       adcs    $g3,$h3,#0
-       adc     $g4,$h4,#0
-       tst     $g4,#4                  @ did it carry/borrow?
-
-#ifdef __thumb2__
-       it      ne
-#endif
-       movne   $h0,$g0
-       ldr     $g0,[$nonce,#0]
-#ifdef __thumb2__
-       it      ne
-#endif
-       movne   $h1,$g1
-       ldr     $g1,[$nonce,#4]
-#ifdef __thumb2__
-       it      ne
-#endif
-       movne   $h2,$g2
-       ldr     $g2,[$nonce,#8]
-#ifdef __thumb2__
-       it      ne
-#endif
-       movne   $h3,$g3
-       ldr     $g3,[$nonce,#12]
-
-       adds    $h0,$h0,$g0
-       adcs    $h1,$h1,$g1
-       adcs    $h2,$h2,$g2
-       adc     $h3,$h3,$g3
-
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
-       rev     $h0,$h0
-       rev     $h1,$h1
-       rev     $h2,$h2
-       rev     $h3,$h3
-# endif
-       str     $h0,[$mac,#0]
-       str     $h1,[$mac,#4]
-       str     $h2,[$mac,#8]
-       str     $h3,[$mac,#12]
-#else
-       strb    $h0,[$mac,#0]
-       mov     $h0,$h0,lsr#8
-       strb    $h1,[$mac,#4]
-       mov     $h1,$h1,lsr#8
-       strb    $h2,[$mac,#8]
-       mov     $h2,$h2,lsr#8
-       strb    $h3,[$mac,#12]
-       mov     $h3,$h3,lsr#8
-
-       strb    $h0,[$mac,#1]
-       mov     $h0,$h0,lsr#8
-       strb    $h1,[$mac,#5]
-       mov     $h1,$h1,lsr#8
-       strb    $h2,[$mac,#9]
-       mov     $h2,$h2,lsr#8
-       strb    $h3,[$mac,#13]
-       mov     $h3,$h3,lsr#8
-
-       strb    $h0,[$mac,#2]
-       mov     $h0,$h0,lsr#8
-       strb    $h1,[$mac,#6]
-       mov     $h1,$h1,lsr#8
-       strb    $h2,[$mac,#10]
-       mov     $h2,$h2,lsr#8
-       strb    $h3,[$mac,#14]
-       mov     $h3,$h3,lsr#8
-
-       strb    $h0,[$mac,#3]
-       strb    $h1,[$mac,#7]
-       strb    $h2,[$mac,#11]
-       strb    $h3,[$mac,#15]
-#endif
-       ldmia   sp!,{r4-r11}
-#if    __ARM_ARCH__>=5
-       ret                             @ bx    lr
-#else
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       bx      lr                      @ interoperable with Thumb ISA:-)
-#endif
-.size  poly1305_emit,.-poly1305_emit
-___
-{
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
-my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
-my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
-
-my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
-
-$code.=<<___;
-#if    __ARM_MAX_ARCH__>=7
-.fpu   neon
-
-.type  poly1305_init_neon,%function
-.align 5
-poly1305_init_neon:
-.Lpoly1305_init_neon:
-       ldr     r3,[$ctx,#48]           @ first table element
-       cmp     r3,#-1                  @ is value impossible?
-       bne     .Lno_init_neon
-
-       ldr     r4,[$ctx,#20]           @ load key base 2^32
-       ldr     r5,[$ctx,#24]
-       ldr     r6,[$ctx,#28]
-       ldr     r7,[$ctx,#32]
-
-       and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
-       mov     r3,r4,lsr#26
-       mov     r4,r5,lsr#20
-       orr     r3,r3,r5,lsl#6
-       mov     r5,r6,lsr#14
-       orr     r4,r4,r6,lsl#12
-       mov     r6,r7,lsr#8
-       orr     r5,r5,r7,lsl#18
-       and     r3,r3,#0x03ffffff
-       and     r4,r4,#0x03ffffff
-       and     r5,r5,#0x03ffffff
-
-       vdup.32 $R0,r2                  @ r^1 in both lanes
-       add     r2,r3,r3,lsl#2          @ *5
-       vdup.32 $R1,r3
-       add     r3,r4,r4,lsl#2
-       vdup.32 $S1,r2
-       vdup.32 $R2,r4
-       add     r4,r5,r5,lsl#2
-       vdup.32 $S2,r3
-       vdup.32 $R3,r5
-       add     r5,r6,r6,lsl#2
-       vdup.32 $S3,r4
-       vdup.32 $R4,r6
-       vdup.32 $S4,r5
-
-       mov     $zeros,#2               @ counter
-
-.Lsquare_neon:
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-       @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-       @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-       @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-       @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-
-       vmull.u32       $D0,$R0,${R0}[1]
-       vmull.u32       $D1,$R1,${R0}[1]
-       vmull.u32       $D2,$R2,${R0}[1]
-       vmull.u32       $D3,$R3,${R0}[1]
-       vmull.u32       $D4,$R4,${R0}[1]
-
-       vmlal.u32       $D0,$R4,${S1}[1]
-       vmlal.u32       $D1,$R0,${R1}[1]
-       vmlal.u32       $D2,$R1,${R1}[1]
-       vmlal.u32       $D3,$R2,${R1}[1]
-       vmlal.u32       $D4,$R3,${R1}[1]
-
-       vmlal.u32       $D0,$R3,${S2}[1]
-       vmlal.u32       $D1,$R4,${S2}[1]
-       vmlal.u32       $D3,$R1,${R2}[1]
-       vmlal.u32       $D2,$R0,${R2}[1]
-       vmlal.u32       $D4,$R2,${R2}[1]
-
-       vmlal.u32       $D0,$R2,${S3}[1]
-       vmlal.u32       $D3,$R0,${R3}[1]
-       vmlal.u32       $D1,$R3,${S3}[1]
-       vmlal.u32       $D2,$R4,${S3}[1]
-       vmlal.u32       $D4,$R1,${R3}[1]
-
-       vmlal.u32       $D3,$R4,${S4}[1]
-       vmlal.u32       $D0,$R1,${S4}[1]
-       vmlal.u32       $D1,$R2,${S4}[1]
-       vmlal.u32       $D2,$R3,${S4}[1]
-       vmlal.u32       $D4,$R0,${R4}[1]
-
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-       @ and P. Schwabe
-       @
-       @ H0>>+H1>>+H2>>+H3>>+H4
-       @ H3>>+H4>>*5+H0>>+H1
-       @
-       @ Trivia.
-       @
-       @ The result of multiplying an n-bit number by an m-bit number is
-       @ n+m bits wide. However! Even though 2^n is an (n+1)-bit number,
-       @ an m-bit number multiplied by 2^n is still only n+m bits wide.
-       @
-       @ The sum of two n-bit numbers is n+1 bits wide, and the sum of
-       @ three or four is n+2 bits wide. The sum of 2^m (n-m)-bit numbers
-       @ plus one n-bit number is n+1 bits wide.
-       @
-       @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
-       @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
-       @ can be 27. However! In cases when their width exceeds 26 bits
-       @ they are limited by 2^26+2^6. This in turn means that *sum*
-       @ of the products with these values can still be viewed as sum
-       @ of 52-bit numbers as long as the number of addends is not a
-       @ power of 2. For example,
-       @
-       @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
-       @
-       @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
-       @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
-       @ 8 * (2^52) or 2^55. However, the value is then multiplied
-       @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
-       @ which is less than 32 * (2^52) or 2^57. And when processing
-       @ data we are looking at three times as many addends...
-       @
-       @ In the key setup procedure the pre-reduced H0 is limited by
-       @ 5*4+1 and 5*H4 by 5*5 52-bit addends, or 57 bits. But when
-       @ hashing the input, H0 is limited by (5*4+1)*3 addends, or 58
-       @ bits, while 5*H4 is limited by 5*5*3, or 59[!] bits. How is
-       @ this relevant? The vmlal.u32 instruction accepts 2x32-bit
-       @ inputs and writes a 2x64-bit result. This means that the result
-       @ of the reduction has to be compressed upon loop wrap-around.
-       @ This can be done in the process of reduction to minimize the
-       @ number of instructions [as well as the number of 128-bit
-       @ instructions, which benefits low-end processors], but one has
-       @ to watch for H2 (which is narrower than H0) and 5*H4 not being
-       @ wider than 58 bits, so that the result of the right shift by
-       @ 26 bits fits in 32 bits. This is also useful on x86, because it
-       @ allows paddd to be used in place of paddq, which benefits Atom,
-       @ where paddq is ridiculously slow.
-
-       vshr.u64        $T0,$D3,#26
-       vmovn.i64       $D3#lo,$D3
-        vshr.u64       $T1,$D0,#26
-        vmovn.i64      $D0#lo,$D0
-       vadd.i64        $D4,$D4,$T0             @ h3 -> h4
-       vbic.i32        $D3#lo,#0xfc000000      @ &=0x03ffffff
-        vadd.i64       $D1,$D1,$T1             @ h0 -> h1
-        vbic.i32       $D0#lo,#0xfc000000
-
-       vshrn.u64       $T0#lo,$D4,#26
-       vmovn.i64       $D4#lo,$D4
-        vshr.u64       $T1,$D1,#26
-        vmovn.i64      $D1#lo,$D1
-        vadd.i64       $D2,$D2,$T1             @ h1 -> h2
-       vbic.i32        $D4#lo,#0xfc000000
-        vbic.i32       $D1#lo,#0xfc000000
-
-       vadd.i32        $D0#lo,$D0#lo,$T0#lo
-       vshl.u32        $T0#lo,$T0#lo,#2
-        vshrn.u64      $T1#lo,$D2,#26
-        vmovn.i64      $D2#lo,$D2
-       vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
-        vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
-        vbic.i32       $D2#lo,#0xfc000000
-
-       vshr.u32        $T0#lo,$D0#lo,#26
-       vbic.i32        $D0#lo,#0xfc000000
-        vshr.u32       $T1#lo,$D3#lo,#26
-        vbic.i32       $D3#lo,#0xfc000000
-       vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
-        vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
-
-       subs            $zeros,$zeros,#1
-       beq             .Lsquare_break_neon
-
-       add             $tbl0,$ctx,#(48+0*9*4)
-       add             $tbl1,$ctx,#(48+1*9*4)
-
-       vtrn.32         $R0,$D0#lo              @ r^2:r^1
-       vtrn.32         $R2,$D2#lo
-       vtrn.32         $R3,$D3#lo
-       vtrn.32         $R1,$D1#lo
-       vtrn.32         $R4,$D4#lo
-
-       vshl.u32        $S2,$R2,#2              @ *5
-       vshl.u32        $S3,$R3,#2
-       vshl.u32        $S1,$R1,#2
-       vshl.u32        $S4,$R4,#2
-       vadd.i32        $S2,$S2,$R2
-       vadd.i32        $S1,$S1,$R1
-       vadd.i32        $S3,$S3,$R3
-       vadd.i32        $S4,$S4,$R4
-
-       vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
-       vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
-       vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-       vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-       vst1.32         {${S4}[0]},[$tbl0,:32]
-       vst1.32         {${S4}[1]},[$tbl1,:32]
-
-       b               .Lsquare_neon
-
-.align 4
-.Lsquare_break_neon:
-       add             $tbl0,$ctx,#(48+2*4*9)
-       add             $tbl1,$ctx,#(48+3*4*9)
-
-       vmov            $R0,$D0#lo              @ r^4:r^3
-       vshl.u32        $S1,$D1#lo,#2           @ *5
-       vmov            $R1,$D1#lo
-       vshl.u32        $S2,$D2#lo,#2
-       vmov            $R2,$D2#lo
-       vshl.u32        $S3,$D3#lo,#2
-       vmov            $R3,$D3#lo
-       vshl.u32        $S4,$D4#lo,#2
-       vmov            $R4,$D4#lo
-       vadd.i32        $S1,$S1,$D1#lo
-       vadd.i32        $S2,$S2,$D2#lo
-       vadd.i32        $S3,$S3,$D3#lo
-       vadd.i32        $S4,$S4,$D4#lo
-
-       vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
-       vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
-       vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-       vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-       vst1.32         {${S4}[0]},[$tbl0]
-       vst1.32         {${S4}[1]},[$tbl1]
-
-.Lno_init_neon:
-       ret                             @ bx    lr
-.size  poly1305_init_neon,.-poly1305_init_neon
-
-.type  poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
-       ldr     ip,[$ctx,#36]           @ is_base2_26
-
-       cmp     $len,#64
-       blo     .Lpoly1305_blocks
-
-       stmdb   sp!,{r4-r7}
-       vstmdb  sp!,{d8-d15}            @ ABI specification says so
-
-       tst     ip,ip                   @ is_base2_26?
-       bne     .Lbase2_26_neon
-
-       stmdb   sp!,{r1-r3,lr}
-       bl      .Lpoly1305_init_neon
-
-       ldr     r4,[$ctx,#0]            @ load hash value base 2^32
-       ldr     r5,[$ctx,#4]
-       ldr     r6,[$ctx,#8]
-       ldr     r7,[$ctx,#12]
-       ldr     ip,[$ctx,#16]
-
-       and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
-       mov     r3,r4,lsr#26
-        veor   $D0#lo,$D0#lo,$D0#lo
-       mov     r4,r5,lsr#20
-       orr     r3,r3,r5,lsl#6
-        veor   $D1#lo,$D1#lo,$D1#lo
-       mov     r5,r6,lsr#14
-       orr     r4,r4,r6,lsl#12
-        veor   $D2#lo,$D2#lo,$D2#lo
-       mov     r6,r7,lsr#8
-       orr     r5,r5,r7,lsl#18
-        veor   $D3#lo,$D3#lo,$D3#lo
-       and     r3,r3,#0x03ffffff
-       orr     r6,r6,ip,lsl#24
-        veor   $D4#lo,$D4#lo,$D4#lo
-       and     r4,r4,#0x03ffffff
-       mov     r1,#1
-       and     r5,r5,#0x03ffffff
-       str     r1,[$ctx,#36]           @ set is_base2_26
-
-       vmov.32 $D0#lo[0],r2
-       vmov.32 $D1#lo[0],r3
-       vmov.32 $D2#lo[0],r4
-       vmov.32 $D3#lo[0],r5
-       vmov.32 $D4#lo[0],r6
-       adr     $zeros,.Lzeros
-
-       ldmia   sp!,{r1-r3,lr}
-       b       .Lhash_loaded
-
-.align 4
-.Lbase2_26_neon:
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ load hash value
-
-       veor            $D0#lo,$D0#lo,$D0#lo
-       veor            $D1#lo,$D1#lo,$D1#lo
-       veor            $D2#lo,$D2#lo,$D2#lo
-       veor            $D3#lo,$D3#lo,$D3#lo
-       veor            $D4#lo,$D4#lo,$D4#lo
-       vld4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
-       adr             $zeros,.Lzeros
-       vld1.32         {$D4#lo[0]},[$ctx]
-       sub             $ctx,$ctx,#16           @ rewind
-
-.Lhash_loaded:
-       add             $in2,$inp,#32
-       mov             $padbit,$padbit,lsl#24
-       tst             $len,#31
-       beq             .Leven
-
-       vld4.32         {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
-       vmov.32         $H4#lo[0],$padbit
-       sub             $len,$len,#16
-       add             $in2,$inp,#32
-
-# ifdef        __ARMEB__
-       vrev32.8        $H0,$H0
-       vrev32.8        $H3,$H3
-       vrev32.8        $H1,$H1
-       vrev32.8        $H2,$H2
-# endif
-       vsri.u32        $H4#lo,$H3#lo,#8        @ base 2^32 -> base 2^26
-       vshl.u32        $H3#lo,$H3#lo,#18
-
-       vsri.u32        $H3#lo,$H2#lo,#14
-       vshl.u32        $H2#lo,$H2#lo,#12
-       vadd.i32        $H4#hi,$H4#lo,$D4#lo    @ add hash value and move to #hi
-
-       vbic.i32        $H3#lo,#0xfc000000
-       vsri.u32        $H2#lo,$H1#lo,#20
-       vshl.u32        $H1#lo,$H1#lo,#6
-
-       vbic.i32        $H2#lo,#0xfc000000
-       vsri.u32        $H1#lo,$H0#lo,#26
-       vadd.i32        $H3#hi,$H3#lo,$D3#lo
-
-       vbic.i32        $H0#lo,#0xfc000000
-       vbic.i32        $H1#lo,#0xfc000000
-       vadd.i32        $H2#hi,$H2#lo,$D2#lo
-
-       vadd.i32        $H0#hi,$H0#lo,$D0#lo
-       vadd.i32        $H1#hi,$H1#lo,$D1#lo
-
-       mov             $tbl1,$zeros
-       add             $tbl0,$ctx,#48
-
-       cmp             $len,$len
-       b               .Long_tail
-
-.align 4
-.Leven:
-       subs            $len,$len,#64
-       it              lo
-       movlo           $in2,$zeros
-
-       vmov.i32        $H4,#1<<24              @ padbit, yes, always
-       vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
-       add             $inp,$inp,#64
-       vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
-       add             $in2,$in2,#64
-       itt             hi
-       addhi           $tbl1,$ctx,#(48+1*9*4)
-       addhi           $tbl0,$ctx,#(48+3*9*4)
-
-# ifdef        __ARMEB__
-       vrev32.8        $H0,$H0
-       vrev32.8        $H3,$H3
-       vrev32.8        $H1,$H1
-       vrev32.8        $H2,$H2
-# endif
-       vsri.u32        $H4,$H3,#8              @ base 2^32 -> base 2^26
-       vshl.u32        $H3,$H3,#18
-
-       vsri.u32        $H3,$H2,#14
-       vshl.u32        $H2,$H2,#12
-
-       vbic.i32        $H3,#0xfc000000
-       vsri.u32        $H2,$H1,#20
-       vshl.u32        $H1,$H1,#6
-
-       vbic.i32        $H2,#0xfc000000
-       vsri.u32        $H1,$H0,#26
-
-       vbic.i32        $H0,#0xfc000000
-       vbic.i32        $H1,#0xfc000000
-
-       bls             .Lskip_loop
-
-       vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^2
-       vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
-       vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-       vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-       b               .Loop_neon
-
-.align 5
-.Loop_neon:
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
-       @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
-       @   \___________________/
-       @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
-       @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
-       @   \___________________/ \____________________/
-       @
-       @ Note that we start with inp[2:3]*r^2. This is because it
-       @ doesn't depend on reduction in previous iteration.
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-       @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-       @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-       @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-       @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ inp[2:3]*r^2
-
-       vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ accumulate inp[0:1]
-       vmull.u32       $D2,$H2#hi,${R0}[1]
-       vadd.i32        $H0#lo,$H0#lo,$D0#lo
-       vmull.u32       $D0,$H0#hi,${R0}[1]
-       vadd.i32        $H3#lo,$H3#lo,$D3#lo
-       vmull.u32       $D3,$H3#hi,${R0}[1]
-       vmlal.u32       $D2,$H1#hi,${R1}[1]
-       vadd.i32        $H1#lo,$H1#lo,$D1#lo
-       vmull.u32       $D1,$H1#hi,${R0}[1]
-
-       vadd.i32        $H4#lo,$H4#lo,$D4#lo
-       vmull.u32       $D4,$H4#hi,${R0}[1]
-       subs            $len,$len,#64
-       vmlal.u32       $D0,$H4#hi,${S1}[1]
-       it              lo
-       movlo           $in2,$zeros
-       vmlal.u32       $D3,$H2#hi,${R1}[1]
-       vld1.32         ${S4}[1],[$tbl1,:32]
-       vmlal.u32       $D1,$H0#hi,${R1}[1]
-       vmlal.u32       $D4,$H3#hi,${R1}[1]
-
-       vmlal.u32       $D0,$H3#hi,${S2}[1]
-       vmlal.u32       $D3,$H1#hi,${R2}[1]
-       vmlal.u32       $D4,$H2#hi,${R2}[1]
-       vmlal.u32       $D1,$H4#hi,${S2}[1]
-       vmlal.u32       $D2,$H0#hi,${R2}[1]
-
-       vmlal.u32       $D3,$H0#hi,${R3}[1]
-       vmlal.u32       $D0,$H2#hi,${S3}[1]
-       vmlal.u32       $D4,$H1#hi,${R3}[1]
-       vmlal.u32       $D1,$H3#hi,${S3}[1]
-       vmlal.u32       $D2,$H4#hi,${S3}[1]
-
-       vmlal.u32       $D3,$H4#hi,${S4}[1]
-       vmlal.u32       $D0,$H1#hi,${S4}[1]
-       vmlal.u32       $D4,$H0#hi,${R4}[1]
-       vmlal.u32       $D1,$H2#hi,${S4}[1]
-       vmlal.u32       $D2,$H3#hi,${S4}[1]
-
-       vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
-       add             $in2,$in2,#64
-
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ (hash+inp[0:1])*r^4 and accumulate
-
-       vmlal.u32       $D3,$H3#lo,${R0}[0]
-       vmlal.u32       $D0,$H0#lo,${R0}[0]
-       vmlal.u32       $D4,$H4#lo,${R0}[0]
-       vmlal.u32       $D1,$H1#lo,${R0}[0]
-       vmlal.u32       $D2,$H2#lo,${R0}[0]
-       vld1.32         ${S4}[0],[$tbl0,:32]
-
-       vmlal.u32       $D3,$H2#lo,${R1}[0]
-       vmlal.u32       $D0,$H4#lo,${S1}[0]
-       vmlal.u32       $D4,$H3#lo,${R1}[0]
-       vmlal.u32       $D1,$H0#lo,${R1}[0]
-       vmlal.u32       $D2,$H1#lo,${R1}[0]
-
-       vmlal.u32       $D3,$H1#lo,${R2}[0]
-       vmlal.u32       $D0,$H3#lo,${S2}[0]
-       vmlal.u32       $D4,$H2#lo,${R2}[0]
-       vmlal.u32       $D1,$H4#lo,${S2}[0]
-       vmlal.u32       $D2,$H0#lo,${R2}[0]
-
-       vmlal.u32       $D3,$H0#lo,${R3}[0]
-       vmlal.u32       $D0,$H2#lo,${S3}[0]
-       vmlal.u32       $D4,$H1#lo,${R3}[0]
-       vmlal.u32       $D1,$H3#lo,${S3}[0]
-       vmlal.u32       $D3,$H4#lo,${S4}[0]
-
-       vmlal.u32       $D2,$H4#lo,${S3}[0]
-       vmlal.u32       $D0,$H1#lo,${S4}[0]
-       vmlal.u32       $D4,$H0#lo,${R4}[0]
-       vmov.i32        $H4,#1<<24              @ padbit, yes, always
-       vmlal.u32       $D1,$H2#lo,${S4}[0]
-       vmlal.u32       $D2,$H3#lo,${S4}[0]
-
-       vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
-       add             $inp,$inp,#64
-# ifdef        __ARMEB__
-       vrev32.8        $H0,$H0
-       vrev32.8        $H1,$H1
-       vrev32.8        $H2,$H2
-       vrev32.8        $H3,$H3
-# endif
-
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ lazy reduction interleaved with base 2^32 -> base 2^26 of
-       @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
-
-       vshr.u64        $T0,$D3,#26
-       vmovn.i64       $D3#lo,$D3
-        vshr.u64       $T1,$D0,#26
-        vmovn.i64      $D0#lo,$D0
-       vadd.i64        $D4,$D4,$T0             @ h3 -> h4
-       vbic.i32        $D3#lo,#0xfc000000
-         vsri.u32      $H4,$H3,#8              @ base 2^32 -> base 2^26
-        vadd.i64       $D1,$D1,$T1             @ h0 -> h1
-         vshl.u32      $H3,$H3,#18
-        vbic.i32       $D0#lo,#0xfc000000
-
-       vshrn.u64       $T0#lo,$D4,#26
-       vmovn.i64       $D4#lo,$D4
-        vshr.u64       $T1,$D1,#26
-        vmovn.i64      $D1#lo,$D1
-        vadd.i64       $D2,$D2,$T1             @ h1 -> h2
-         vsri.u32      $H3,$H2,#14
-       vbic.i32        $D4#lo,#0xfc000000
-         vshl.u32      $H2,$H2,#12
-        vbic.i32       $D1#lo,#0xfc000000
-
-       vadd.i32        $D0#lo,$D0#lo,$T0#lo
-       vshl.u32        $T0#lo,$T0#lo,#2
-         vbic.i32      $H3,#0xfc000000
-        vshrn.u64      $T1#lo,$D2,#26
-        vmovn.i64      $D2#lo,$D2
-       vaddl.u32       $D0,$D0#lo,$T0#lo       @ h4 -> h0 [widen for a sec]
-         vsri.u32      $H2,$H1,#20
-        vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
-         vshl.u32      $H1,$H1,#6
-        vbic.i32       $D2#lo,#0xfc000000
-         vbic.i32      $H2,#0xfc000000
-
-       vshrn.u64       $T0#lo,$D0,#26          @ re-narrow
-       vmovn.i64       $D0#lo,$D0
-         vsri.u32      $H1,$H0,#26
-         vbic.i32      $H0,#0xfc000000
-        vshr.u32       $T1#lo,$D3#lo,#26
-        vbic.i32       $D3#lo,#0xfc000000
-       vbic.i32        $D0#lo,#0xfc000000
-       vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
-        vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
-         vbic.i32      $H1,#0xfc000000
-
-       bhi             .Loop_neon
-
-.Lskip_loop:
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
-       add             $tbl1,$ctx,#(48+0*9*4)
-       add             $tbl0,$ctx,#(48+1*9*4)
-       adds            $len,$len,#32
-       it              ne
-       movne           $len,#0
-       bne             .Long_tail
-
-       vadd.i32        $H2#hi,$H2#lo,$D2#lo    @ add hash value and move to #hi
-       vadd.i32        $H0#hi,$H0#lo,$D0#lo
-       vadd.i32        $H3#hi,$H3#lo,$D3#lo
-       vadd.i32        $H1#hi,$H1#lo,$D1#lo
-       vadd.i32        $H4#hi,$H4#lo,$D4#lo
-
-.Long_tail:
-       vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^1
-       vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^2
-
-       vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ can be redundant
-       vmull.u32       $D2,$H2#hi,$R0
-       vadd.i32        $H0#lo,$H0#lo,$D0#lo
-       vmull.u32       $D0,$H0#hi,$R0
-       vadd.i32        $H3#lo,$H3#lo,$D3#lo
-       vmull.u32       $D3,$H3#hi,$R0
-       vadd.i32        $H1#lo,$H1#lo,$D1#lo
-       vmull.u32       $D1,$H1#hi,$R0
-       vadd.i32        $H4#lo,$H4#lo,$D4#lo
-       vmull.u32       $D4,$H4#hi,$R0
-
-       vmlal.u32       $D0,$H4#hi,$S1
-       vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-       vmlal.u32       $D3,$H2#hi,$R1
-       vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-       vmlal.u32       $D1,$H0#hi,$R1
-       vmlal.u32       $D4,$H3#hi,$R1
-       vmlal.u32       $D2,$H1#hi,$R1
-
-       vmlal.u32       $D3,$H1#hi,$R2
-       vld1.32         ${S4}[1],[$tbl1,:32]
-       vmlal.u32       $D0,$H3#hi,$S2
-       vld1.32         ${S4}[0],[$tbl0,:32]
-       vmlal.u32       $D4,$H2#hi,$R2
-       vmlal.u32       $D1,$H4#hi,$S2
-       vmlal.u32       $D2,$H0#hi,$R2
-
-       vmlal.u32       $D3,$H0#hi,$R3
-        it             ne
-        addne          $tbl1,$ctx,#(48+2*9*4)
-       vmlal.u32       $D0,$H2#hi,$S3
-        it             ne
-        addne          $tbl0,$ctx,#(48+3*9*4)
-       vmlal.u32       $D4,$H1#hi,$R3
-       vmlal.u32       $D1,$H3#hi,$S3
-       vmlal.u32       $D2,$H4#hi,$S3
-
-       vmlal.u32       $D3,$H4#hi,$S4
-        vorn           $MASK,$MASK,$MASK       @ all-ones, can be redundant
-       vmlal.u32       $D0,$H1#hi,$S4
-        vshr.u64       $MASK,$MASK,#38
-       vmlal.u32       $D4,$H0#hi,$R4
-       vmlal.u32       $D1,$H2#hi,$S4
-       vmlal.u32       $D2,$H3#hi,$S4
-
-       beq             .Lshort_tail
-
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ (hash+inp[0:1])*r^4:r^3 and accumulate
-
-       vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^3
-       vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
-
-       vmlal.u32       $D2,$H2#lo,$R0
-       vmlal.u32       $D0,$H0#lo,$R0
-       vmlal.u32       $D3,$H3#lo,$R0
-       vmlal.u32       $D1,$H1#lo,$R0
-       vmlal.u32       $D4,$H4#lo,$R0
-
-       vmlal.u32       $D0,$H4#lo,$S1
-       vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-       vmlal.u32       $D3,$H2#lo,$R1
-       vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-       vmlal.u32       $D1,$H0#lo,$R1
-       vmlal.u32       $D4,$H3#lo,$R1
-       vmlal.u32       $D2,$H1#lo,$R1
-
-       vmlal.u32       $D3,$H1#lo,$R2
-       vld1.32         ${S4}[1],[$tbl1,:32]
-       vmlal.u32       $D0,$H3#lo,$S2
-       vld1.32         ${S4}[0],[$tbl0,:32]
-       vmlal.u32       $D4,$H2#lo,$R2
-       vmlal.u32       $D1,$H4#lo,$S2
-       vmlal.u32       $D2,$H0#lo,$R2
-
-       vmlal.u32       $D3,$H0#lo,$R3
-       vmlal.u32       $D0,$H2#lo,$S3
-       vmlal.u32       $D4,$H1#lo,$R3
-       vmlal.u32       $D1,$H3#lo,$S3
-       vmlal.u32       $D2,$H4#lo,$S3
-
-       vmlal.u32       $D3,$H4#lo,$S4
-        vorn           $MASK,$MASK,$MASK       @ all-ones
-       vmlal.u32       $D0,$H1#lo,$S4
-        vshr.u64       $MASK,$MASK,#38
-       vmlal.u32       $D4,$H0#lo,$R4
-       vmlal.u32       $D1,$H2#lo,$S4
-       vmlal.u32       $D2,$H3#lo,$S4
-
-.Lshort_tail:
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ horizontal addition
-
-       vadd.i64        $D3#lo,$D3#lo,$D3#hi
-       vadd.i64        $D0#lo,$D0#lo,$D0#hi
-       vadd.i64        $D4#lo,$D4#lo,$D4#hi
-       vadd.i64        $D1#lo,$D1#lo,$D1#hi
-       vadd.i64        $D2#lo,$D2#lo,$D2#hi
-
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ lazy reduction, but without narrowing
-
-       vshr.u64        $T0,$D3,#26
-       vand.i64        $D3,$D3,$MASK
-        vshr.u64       $T1,$D0,#26
-        vand.i64       $D0,$D0,$MASK
-       vadd.i64        $D4,$D4,$T0             @ h3 -> h4
-        vadd.i64       $D1,$D1,$T1             @ h0 -> h1
-
-       vshr.u64        $T0,$D4,#26
-       vand.i64        $D4,$D4,$MASK
-        vshr.u64       $T1,$D1,#26
-        vand.i64       $D1,$D1,$MASK
-        vadd.i64       $D2,$D2,$T1             @ h1 -> h2
-
-       vadd.i64        $D0,$D0,$T0
-       vshl.u64        $T0,$T0,#2
-        vshr.u64       $T1,$D2,#26
-        vand.i64       $D2,$D2,$MASK
-       vadd.i64        $D0,$D0,$T0             @ h4 -> h0
-        vadd.i64       $D3,$D3,$T1             @ h2 -> h3
-
-       vshr.u64        $T0,$D0,#26
-       vand.i64        $D0,$D0,$MASK
-        vshr.u64       $T1,$D3,#26
-        vand.i64       $D3,$D3,$MASK
-       vadd.i64        $D1,$D1,$T0             @ h0 -> h1
-        vadd.i64       $D4,$D4,$T1             @ h3 -> h4
-
-       cmp             $len,#0
-       bne             .Leven
-
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ store hash value
-
-       vst4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
-       vst1.32         {$D4#lo[0]},[$ctx]
-
-       vldmia  sp!,{d8-d15}                    @ epilogue
-       ldmia   sp!,{r4-r7}
-       ret                                     @ bx    lr
-.size  poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.align 5
-.Lzeros:
-.long  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-#ifndef        __KERNEL__
-.LOPENSSL_armcap:
-# ifdef        _WIN32
-.word  OPENSSL_armcap_P
-# else
-.word  OPENSSL_armcap_P-.Lpoly1305_init
-# endif
-.comm  OPENSSL_armcap_P,4,4
-.hidden        OPENSSL_armcap_P
-#endif
-#endif
-___
-}      }
-$code.=<<___;
-.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
-.align 2
-___
-
-foreach (split("\n",$code)) {
-       s/\`([^\`]*)\`/eval $1/geo;
-
-       s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
-       s/\bret\b/bx    lr/go                                           or
-       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
-
-       print $_,"\n";
-}
-close STDOUT; # enforce flush
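The NEON path above keeps the accumulator in base 2^26 (five 26-bit limbs) and relies on the lazy reduction described in the comments. As a scalar illustration only (none of this code exists in the tree), the radix conversion and one lazy-reduction pass look roughly like this:

        /* Illustrative only: base 2^32 -> base 2^26, as done when entering NEON. */
        static void poly1305_b32_to_b26_sketch(u32 h26[5], const u32 h32[4], u32 hi)
        {
                h26[0] =   h32[0]                          & 0x03ffffff;
                h26[1] = ((h32[0] >> 26) | (h32[1] <<  6)) & 0x03ffffff;
                h26[2] = ((h32[1] >> 20) | (h32[2] << 12)) & 0x03ffffff;
                h26[3] = ((h32[2] >> 14) | (h32[3] << 18)) & 0x03ffffff;
                h26[4] =  (h32[3] >>  8) | (hi << 24);
        }

        /* Illustrative only: one lazy-reduction pass over the 2^26 limbs.  The
         * carry out of h4 wraps into h0 multiplied by 5, since 2^130 == 5 mod p. */
        static void poly1305_lazy_reduce_sketch(u64 h[5])
        {
                u64 c;

                c = h[3] >> 26; h[3] &= 0x03ffffff; h[4] += c;          /* h3 -> h4 */
                c = h[0] >> 26; h[0] &= 0x03ffffff; h[1] += c;          /* h0 -> h1 */
                c = h[4] >> 26; h[4] &= 0x03ffffff; h[0] += c * 5;      /* h4 -> h0 */
                c = h[1] >> 26; h[1] &= 0x03ffffff; h[2] += c;          /* h1 -> h2 */
                c = h[2] >> 26; h[2] &= 0x03ffffff; h[3] += c;          /* h2 -> h3 */
                c = h[0] >> 26; h[0] &= 0x03ffffff; h[1] += c;          /* h0 -> h1 */
                c = h[3] >> 26; h[3] &= 0x03ffffff; h[4] += c;          /* h3 -> h4 */
        }

The carry chain follows the same order as the NEON code, so each limb ends up within the width bounds discussed in the comments before the next multiplication.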
diff --git a/arch/arm/lib/crypto/poly1305-glue.c b/arch/arm/lib/crypto/poly1305-glue.c
deleted file mode 100644 (file)
index 2603b07..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
-       struct poly1305_block_state *state,
-       const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
-                                   const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
-                                    const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
-                                  u8 digest[POLY1305_DIGEST_SIZE],
-                                  const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-void __weak poly1305_blocks_neon(struct poly1305_block_state *state,
-                                const u8 *src, u32 len, u32 hibit)
-{
-}
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
-                         unsigned int len, u32 padbit)
-{
-       len = round_down(len, POLY1305_BLOCK_SIZE);
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-           static_branch_likely(&have_neon)) {
-               do {
-                       unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
-                       kernel_neon_begin();
-                       poly1305_blocks_neon(state, src, todo, padbit);
-                       kernel_neon_end();
-
-                       len -= todo;
-                       src += todo;
-               } while (len);
-       } else
-               poly1305_blocks_arm(state, src, len, padbit);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
-       /* We can always use at least the ARM scalar implementation. */
-       return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init arm_poly1305_mod_init(void)
-{
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-           (elf_hwcap & HWCAP_NEON))
-               static_branch_enable(&have_neon);
-       return 0;
-}
-subsys_initcall(arm_poly1305_mod_init);
-
-static void __exit arm_poly1305_mod_exit(void)
-{
-}
-module_exit(arm_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
-MODULE_LICENSE("GPL v2");
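Taken together, the three exported entry points follow an init/update/final shape. The sketch below is illustrative only: the real consumer is the generic Poly1305 library code, the exact relationship between struct poly1305_block_state and the struct poly1305_state passed to emit is not spelled out here, and handling of a padded final partial block (padbit == 0) is omitted.

        /* Illustrative only; 'hash' is the poly1305_state embedded in 'state'. */
        static void poly1305_arm_mac_sketch(struct poly1305_block_state *state,
                                            const struct poly1305_state *hash,
                                            const u8 key[32], const u8 *msg,
                                            unsigned int len,
                                            u8 mac[POLY1305_DIGEST_SIZE])
        {
                u32 nonce[4];
                int i;

                /* The second half of the 32-byte key supplies the nonce words. */
                for (i = 0; i < 4; i++)
                        nonce[i] = get_unaligned_le32(key + 16 + 4 * i);

                poly1305_block_init_arch(state, key);     /* clamp r from key[0..15] */
                poly1305_blocks_arch(state, msg, len, 1); /* full blocks, padbit = 1 */
                poly1305_emit_arch(hash, mac, nonce);     /* add nonce, write the tag */
        }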
diff --git a/arch/arm/lib/crypto/sha256-armv4.pl b/arch/arm/lib/crypto/sha256-armv4.pl
deleted file mode 100644 (file)
index 8122db7..0000000
+++ /dev/null
@@ -1,724 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see https://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# SHA256 block procedure for ARMv4. May 2007.
-
-# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
-# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
-# [on single-issue Xscale PXA250 core].
-
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 22% improvement on
-# Cortex A8 core and ~20 cycles per processed byte.
-
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
-
-# September 2013.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process one
-# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
-# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
-# code (meaning that the latter performs sub-optimally; nothing was done
-# about it).
-
-# May 2014.
-#
-# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$ctx="r0";     $t0="r0";
-$inp="r1";     $t4="r1";
-$len="r2";     $t1="r2";
-$T1="r3";      $t3="r3";
-$A="r4";
-$B="r5";
-$C="r6";
-$D="r7";
-$E="r8";
-$F="r9";
-$G="r10";
-$H="r11";
-@V=($A,$B,$C,$D,$E,$F,$G,$H);
-$t2="r12";
-$Ktbl="r14";
-
-@Sigma0=( 2,13,22);
-@Sigma1=( 6,11,25);
-@sigma0=( 7,18, 3);
-@sigma1=(17,19,10);
-
-sub BODY_00_15 {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___ if ($i<16);
-#if __ARM_ARCH__>=7
-       @ ldr   $t1,[$inp],#4                   @ $i
-# if $i==15
-       str     $inp,[sp,#17*4]                 @ make room for $t4
-# endif
-       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
-       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
-       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
-# ifndef __ARMEB__
-       rev     $t1,$t1
-# endif
-#else
-       @ ldrb  $t1,[$inp,#3]                   @ $i
-       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
-       ldrb    $t2,[$inp,#2]
-       ldrb    $t0,[$inp,#1]
-       orr     $t1,$t1,$t2,lsl#8
-       ldrb    $t2,[$inp],#4
-       orr     $t1,$t1,$t0,lsl#16
-# if $i==15
-       str     $inp,[sp,#17*4]                 @ make room for $t4
-# endif
-       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
-       orr     $t1,$t1,$t2,lsl#24
-       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
-#endif
-___
-$code.=<<___;
-       ldr     $t2,[$Ktbl],#4                  @ *K256++
-       add     $h,$h,$t1                       @ h+=X[i]
-       str     $t1,[sp,#`$i%16`*4]
-       eor     $t1,$f,$g
-       add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
-       and     $t1,$t1,$e
-       add     $h,$h,$t2                       @ h+=K256[i]
-       eor     $t1,$t1,$g                      @ Ch(e,f,g)
-       eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
-       add     $h,$h,$t1                       @ h+=Ch(e,f,g)
-#if $i==31
-       and     $t2,$t2,#0xff
-       cmp     $t2,#0xf2                       @ done?
-#endif
-#if $i<15
-# if __ARM_ARCH__>=7
-       ldr     $t1,[$inp],#4                   @ prefetch
-# else
-       ldrb    $t1,[$inp,#3]
-# endif
-       eor     $t2,$a,$b                       @ a^b, b^c in next round
-#else
-       ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
-       eor     $t2,$a,$b                       @ a^b, b^c in next round
-       ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
-#endif
-       eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
-       and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
-       add     $d,$d,$h                        @ d+=h
-       eor     $t3,$t3,$b                      @ Maj(a,b,c)
-       add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
-       @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
-___
-       ($t2,$t3)=($t3,$t2);
-}
-
-sub BODY_16_XX {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
-
-$code.=<<___;
-       @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
-       @ ldr   $t4,[sp,#`($i+14)%16`*4]
-       mov     $t0,$t1,ror#$sigma0[0]
-       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
-       mov     $t2,$t4,ror#$sigma1[0]
-       eor     $t0,$t0,$t1,ror#$sigma0[1]
-       eor     $t2,$t2,$t4,ror#$sigma1[1]
-       eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
-       ldr     $t1,[sp,#`($i+0)%16`*4]
-       eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
-       ldr     $t4,[sp,#`($i+9)%16`*4]
-
-       add     $t2,$t2,$t0
-       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
-       add     $t1,$t1,$t2
-       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
-       add     $t1,$t1,$t4                     @ X[i]
-___
-       &BODY_00_15(@_);
-}
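BODY_00_15 and BODY_16_XX each emit one unrolled SHA-256 round using the rotation constants defined above; BODY_16_XX additionally extends the message schedule in place over a 16-word circular window. A plain-C sketch of the computation being unrolled (illustration only, not the code this script generates; X[0..15] is assumed to already hold the loaded message words for rounds 0..15):

        /* Illustrative only: one round as emitted by BODY_00_15/BODY_16_XX. */
        static inline u32 ror32_sketch(u32 x, int n)
        {
                return (x >> n) | (x << (32 - n));
        }

        static void sha256_round_sketch(u32 s[8], u32 X[16], const u32 K256[64], int i)
        {
                u32 a = s[0], b = s[1], c = s[2], d = s[3];
                u32 e = s[4], f = s[5], g = s[6], h = s[7];
                u32 t1, t2;

                if (i >= 16) {  /* BODY_16_XX: extend the schedule in place */
                        u32 x1 = X[(i + 1) % 16], x14 = X[(i + 14) % 16];

                        X[i % 16] += (ror32_sketch(x1, 7) ^ ror32_sketch(x1, 18) ^
                                      (x1 >> 3)) +                      /* sigma0 */
                                     (ror32_sketch(x14, 17) ^ ror32_sketch(x14, 19) ^
                                      (x14 >> 10)) +                    /* sigma1 */
                                     X[(i + 9) % 16];
                }

                t1 = h + (ror32_sketch(e, 6) ^ ror32_sketch(e, 11) ^
                          ror32_sketch(e, 25)) +                        /* Sigma1(e) */
                     (g ^ (e & (f ^ g))) +                              /* Ch(e,f,g) */
                     K256[i] + X[i % 16];
                t2 = (ror32_sketch(a, 2) ^ ror32_sketch(a, 13) ^
                      ror32_sketch(a, 22)) +                            /* Sigma0(a) */
                     ((a & b) | (c & (a | b)));                         /* Maj(a,b,c) */

                s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
                s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
        }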
-
-$code=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ 7
-#endif
-
-.text
-#if __ARM_ARCH__<7
-.code  32
-#else
-.syntax unified
-# ifdef __thumb2__
-.thumb
-# else
-.code   32
-# endif
-#endif
-
-.type  K256,%object
-.align 5
-K256:
-.word  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.word  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.word  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.word  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.word  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.word  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.word  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.word  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.word  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.word  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.word  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.word  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.word  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.word  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.word  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.size  K256,.-K256
-.word  0                               @ terminator
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.LOPENSSL_armcap:
-.word  OPENSSL_armcap_P-sha256_blocks_arch
-#endif
-.align 5
-
-.global        sha256_blocks_arch
-.type  sha256_blocks_arch,%function
-sha256_blocks_arch:
-.Lsha256_blocks_arch:
-#if __ARM_ARCH__<7
-       sub     r3,pc,#8                @ sha256_blocks_arch
-#else
-       adr     r3,.Lsha256_blocks_arch
-#endif
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-       ldr     r12,.LOPENSSL_armcap
-       ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
-       tst     r12,#ARMV8_SHA256
-       bne     .LARMv8
-       tst     r12,#ARMV7_NEON
-       bne     .LNEON
-#endif
-       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
-       stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
-       ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
-       sub     $Ktbl,r3,#256+32        @ K256
-       sub     sp,sp,#16*4             @ alloca(X[16])
-.Loop:
-# if __ARM_ARCH__>=7
-       ldr     $t1,[$inp],#4
-# else
-       ldrb    $t1,[$inp,#3]
-# endif
-       eor     $t3,$B,$C               @ magic
-       eor     $t2,$t2,$t2
-___
-for($i=0;$i<16;$i++)   { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
-$code.=".Lrounds_16_xx:\n";
-for (;$i<32;$i++)      { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
-#if __ARM_ARCH__>=7
-       ite     eq                      @ Thumb2 thing, sanity check in ARM
-#endif
-       ldreq   $t3,[sp,#16*4]          @ pull ctx
-       bne     .Lrounds_16_xx
-
-       add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
-       ldr     $t0,[$t3,#0]
-       ldr     $t1,[$t3,#4]
-       ldr     $t2,[$t3,#8]
-       add     $A,$A,$t0
-       ldr     $t0,[$t3,#12]
-       add     $B,$B,$t1
-       ldr     $t1,[$t3,#16]
-       add     $C,$C,$t2
-       ldr     $t2,[$t3,#20]
-       add     $D,$D,$t0
-       ldr     $t0,[$t3,#24]
-       add     $E,$E,$t1
-       ldr     $t1,[$t3,#28]
-       add     $F,$F,$t2
-       ldr     $inp,[sp,#17*4]         @ pull inp
-       ldr     $t2,[sp,#18*4]          @ pull inp+len
-       add     $G,$G,$t0
-       add     $H,$H,$t1
-       stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
-       cmp     $inp,$t2
-       sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
-       bne     .Loop
-
-       add     sp,sp,#`16+3`*4 @ destroy frame
-#if __ARM_ARCH__>=5
-       ldmia   sp!,{r4-r11,pc}
-#else
-       ldmia   sp!,{r4-r11,lr}
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       bx      lr                      @ interoperable with Thumb ISA:-)
-#endif
-.size  sha256_blocks_arch,.-sha256_blocks_arch
-___
-######################################################################
-# NEON stuff
-#
-{{{
-my @X=map("q$_",(0..3));
-my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
-my $Xfer=$t4;
-my $j=0;
-
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-
-sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
-  my $arg = pop;
-    $arg = "#$arg" if ($arg*1 eq $arg);
-    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-sub Xupdate()
-{ use integer;
-  my $body = shift;
-  my @insns = (&$body,&$body,&$body,&$body);
-  my ($a,$b,$c,$d,$e,$f,$g,$h);
-
-       &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vshr_u32       ($T2,$T0,$sigma0[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vshr_u32       ($T1,$T0,$sigma0[2]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vsli_32        ($T2,$T0,32-$sigma0[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vshr_u32       ($T3,$T0,$sigma0[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &veor           ($T1,$T1,$T2);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vsli_32        ($T3,$T0,32-$sigma0[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &veor         ($T5,$T5,$T4);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &veor         ($T5,$T5,$T4);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       ($T0,$T0,@X[0]);
-        while($#insns>=2) { eval(shift(@insns)); }
-       &vst1_32        ("{$T0}","[$Xfer,:128]!");
-        eval(shift(@insns));
-        eval(shift(@insns));
-
-       push(@X,shift(@X));             # "rotate" X[]
-}
-
-sub Xpreload()
-{ use integer;
-  my $body = shift;
-  my @insns = (&$body,&$body,&$body,&$body);
-  my ($a,$b,$c,$d,$e,$f,$g,$h);
-
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vrev32_8       (@X[0],@X[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &vadd_i32       ($T0,$T0,@X[0]);
-        foreach (@insns) { eval; }     # remaining instructions
-       &vst1_32        ("{$T0}","[$Xfer,:128]!");
-
-       push(@X,shift(@X));             # "rotate" X[]
-}
-
-sub body_00_15 () {
-       (
-       '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
-       '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
-       '&eor   ($t1,$f,$g)',
-       '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
-       '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
-       '&and   ($t1,$t1,$e)',
-       '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
-       '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
-       '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
-       '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
-       '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
-       '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
-       '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
-       '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
-       '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
-       '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
-       '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
-       '&add   ($d,$d,$h)',                    # d+=h
-       '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
-       '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
-       '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
-       )
-}
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.arch  armv7-a
-.fpu   neon
-
-.global        sha256_block_data_order_neon
-.type  sha256_block_data_order_neon,%function
-.align 4
-sha256_block_data_order_neon:
-.LNEON:
-       stmdb   sp!,{r4-r12,lr}
-
-       sub     $H,sp,#16*4+16
-       adr     $Ktbl,.Lsha256_blocks_arch
-       sub     $Ktbl,$Ktbl,#.Lsha256_blocks_arch-K256
-       bic     $H,$H,#15               @ align for 128-bit stores
-       mov     $t2,sp
-       mov     sp,$H                   @ alloca
-       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
-
-       vld1.8          {@X[0]},[$inp]!
-       vld1.8          {@X[1]},[$inp]!
-       vld1.8          {@X[2]},[$inp]!
-       vld1.8          {@X[3]},[$inp]!
-       vld1.32         {$T0},[$Ktbl,:128]!
-       vld1.32         {$T1},[$Ktbl,:128]!
-       vld1.32         {$T2},[$Ktbl,:128]!
-       vld1.32         {$T3},[$Ktbl,:128]!
-       vrev32.8        @X[0],@X[0]             @ yes, even on
-       str             $ctx,[sp,#64]
-       vrev32.8        @X[1],@X[1]             @ big-endian
-       str             $inp,[sp,#68]
-       mov             $Xfer,sp
-       vrev32.8        @X[2],@X[2]
-       str             $len,[sp,#72]
-       vrev32.8        @X[3],@X[3]
-       str             $t2,[sp,#76]            @ save original sp
-       vadd.i32        $T0,$T0,@X[0]
-       vadd.i32        $T1,$T1,@X[1]
-       vst1.32         {$T0},[$Xfer,:128]!
-       vadd.i32        $T2,$T2,@X[2]
-       vst1.32         {$T1},[$Xfer,:128]!
-       vadd.i32        $T3,$T3,@X[3]
-       vst1.32         {$T2},[$Xfer,:128]!
-       vst1.32         {$T3},[$Xfer,:128]!
-
-       ldmia           $ctx,{$A-$H}
-       sub             $Xfer,$Xfer,#64
-       ldr             $t1,[sp,#0]
-       eor             $t2,$t2,$t2
-       eor             $t3,$B,$C
-       b               .L_00_48
-
-.align 4
-.L_00_48:
-___
-       &Xupdate(\&body_00_15);
-       &Xupdate(\&body_00_15);
-       &Xupdate(\&body_00_15);
-       &Xupdate(\&body_00_15);
-$code.=<<___;
-       teq     $t1,#0                          @ check for K256 terminator
-       ldr     $t1,[sp,#0]
-       sub     $Xfer,$Xfer,#64
-       bne     .L_00_48
-
-       ldr             $inp,[sp,#68]
-       ldr             $t0,[sp,#72]
-       sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
-       teq             $inp,$t0
-       it              eq
-       subeq           $inp,$inp,#64           @ avoid SEGV
-       vld1.8          {@X[0]},[$inp]!         @ load next input block
-       vld1.8          {@X[1]},[$inp]!
-       vld1.8          {@X[2]},[$inp]!
-       vld1.8          {@X[3]},[$inp]!
-       it              ne
-       strne           $inp,[sp,#68]
-       mov             $Xfer,sp
-___
-       &Xpreload(\&body_00_15);
-       &Xpreload(\&body_00_15);
-       &Xpreload(\&body_00_15);
-       &Xpreload(\&body_00_15);
-$code.=<<___;
-       ldr     $t0,[$t1,#0]
-       add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
-       ldr     $t2,[$t1,#4]
-       ldr     $t3,[$t1,#8]
-       ldr     $t4,[$t1,#12]
-       add     $A,$A,$t0                       @ accumulate
-       ldr     $t0,[$t1,#16]
-       add     $B,$B,$t2
-       ldr     $t2,[$t1,#20]
-       add     $C,$C,$t3
-       ldr     $t3,[$t1,#24]
-       add     $D,$D,$t4
-       ldr     $t4,[$t1,#28]
-       add     $E,$E,$t0
-       str     $A,[$t1],#4
-       add     $F,$F,$t2
-       str     $B,[$t1],#4
-       add     $G,$G,$t3
-       str     $C,[$t1],#4
-       add     $H,$H,$t4
-       str     $D,[$t1],#4
-       stmia   $t1,{$E-$H}
-
-       ittte   ne
-       movne   $Xfer,sp
-       ldrne   $t1,[sp,#0]
-       eorne   $t2,$t2,$t2
-       ldreq   sp,[sp,#76]                     @ restore original sp
-       itt     ne
-       eorne   $t3,$B,$C
-       bne     .L_00_48
-
-       ldmia   sp!,{r4-r12,pc}
-.size  sha256_block_data_order_neon,.-sha256_block_data_order_neon
-#endif
-___
-}}}
-######################################################################
-# ARMv8 stuff
-#
-{{{
-my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
-my @MSG=map("q$_",(8..11));
-my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
-my $Ktbl="r3";
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-
-# ifdef __thumb2__
-#  define INST(a,b,c,d)        .byte   c,d|0xc,a,b
-# else
-#  define INST(a,b,c,d)        .byte   a,b,c,d
-# endif
-
-.type  sha256_block_data_order_armv8,%function
-.align 5
-sha256_block_data_order_armv8:
-.LARMv8:
-       vld1.32 {$ABCD,$EFGH},[$ctx]
-# ifdef __thumb2__
-       adr     $Ktbl,.LARMv8
-       sub     $Ktbl,$Ktbl,#.LARMv8-K256
-# else
-       adrl    $Ktbl,K256
-# endif
-       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
-
-.Loop_v8:
-       vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
-       vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
-       vld1.32         {$W0},[$Ktbl]!
-       vrev32.8        @MSG[0],@MSG[0]
-       vrev32.8        @MSG[1],@MSG[1]
-       vrev32.8        @MSG[2],@MSG[2]
-       vrev32.8        @MSG[3],@MSG[3]
-       vmov            $ABCD_SAVE,$ABCD        @ offload
-       vmov            $EFGH_SAVE,$EFGH
-       teq             $inp,$len
-___
-for($i=0;$i<12;$i++) {
-$code.=<<___;
-       vld1.32         {$W1},[$Ktbl]!
-       vadd.i32        $W0,$W0,@MSG[0]
-       sha256su0       @MSG[0],@MSG[1]
-       vmov            $abcd,$ABCD
-       sha256h         $ABCD,$EFGH,$W0
-       sha256h2        $EFGH,$abcd,$W0
-       sha256su1       @MSG[0],@MSG[2],@MSG[3]
-___
-       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
-}
-$code.=<<___;
-       vld1.32         {$W1},[$Ktbl]!
-       vadd.i32        $W0,$W0,@MSG[0]
-       vmov            $abcd,$ABCD
-       sha256h         $ABCD,$EFGH,$W0
-       sha256h2        $EFGH,$abcd,$W0
-
-       vld1.32         {$W0},[$Ktbl]!
-       vadd.i32        $W1,$W1,@MSG[1]
-       vmov            $abcd,$ABCD
-       sha256h         $ABCD,$EFGH,$W1
-       sha256h2        $EFGH,$abcd,$W1
-
-       vld1.32         {$W1},[$Ktbl]
-       vadd.i32        $W0,$W0,@MSG[2]
-       sub             $Ktbl,$Ktbl,#256-16     @ rewind
-       vmov            $abcd,$ABCD
-       sha256h         $ABCD,$EFGH,$W0
-       sha256h2        $EFGH,$abcd,$W0
-
-       vadd.i32        $W1,$W1,@MSG[3]
-       vmov            $abcd,$ABCD
-       sha256h         $ABCD,$EFGH,$W1
-       sha256h2        $EFGH,$abcd,$W1
-
-       vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
-       vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
-       it              ne
-       bne             .Loop_v8
-
-       vst1.32         {$ABCD,$EFGH},[$ctx]
-
-       ret             @ bx lr
-.size  sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
-#endif
-___
-}}}
-$code.=<<___;
-.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-.comm   OPENSSL_armcap_P,4,4
-#endif
-___
-
-open SELF,$0;
-while(<SELF>) {
-       next if (/^#!/);
-       last if (!s/^#/@/ and !/^$/);
-       print;
-}
-close SELF;
-
-{   my  %opcode = (
-       "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
-       "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );
-
-    sub unsha256 {
-       my ($mnemonic,$arg)=@_;
-
-       if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
-           my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
-                                        |(($2&7)<<17)|(($2&8)<<4)
-                                        |(($3&7)<<1) |(($3&8)<<2);
-           # since ARMv7 instructions are always encoded little-endian.
-           # correct solution is to use .inst directive, but older
-           # assemblers don't implement it:-(
-           sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
-                       $word&0xff,($word>>8)&0xff,
-                       ($word>>16)&0xff,($word>>24)&0xff,
-                       $mnemonic,$arg;
-       }
-    }
-}
-
-foreach (split($/,$code)) {
-
-       s/\`([^\`]*)\`/eval $1/geo;
-
-       s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
-
-       s/\bret\b/bx    lr/go           or
-       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
-
-       print $_,"\n";
-}
-
-close STDOUT; # enforce flush
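
For reference while reading the perlasm above: BODY_00_15/BODY_16_XX and the NEON Xupdate/Xpreload code implement the standard SHA-256 compression round and message schedule from FIPS 180-4; only the instruction scheduling is ARM-specific. A minimal C restatement of the functions behind the $Sigma*/$sigma* rotate constants (an illustrative sketch with made-up helper names, not part of this patch):

        #include <linux/types.h>

        static inline u32 ror32_ref(u32 v, unsigned int n)
        {
                return (v >> n) | (v << (32 - n));
        }

        /* Round functions used by BODY_00_15 (FIPS 180-4 rotate amounts 2/13/22 and 6/11/25). */
        #define Sigma0(a)       (ror32_ref((a), 2) ^ ror32_ref((a), 13) ^ ror32_ref((a), 22))
        #define Sigma1(e)       (ror32_ref((e), 6) ^ ror32_ref((e), 11) ^ ror32_ref((e), 25))
        #define Ch(e, f, g)     (((e) & (f)) ^ (~(e) & (g)))
        #define Maj(a, b, c)    (((a) & (b)) ^ ((a) & (c)) ^ ((b) & (c)))

        /* Message schedule computed by BODY_16_XX/Xupdate:
         *   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]   for i >= 16
         */
        #define sigma0(x)       (ror32_ref((x), 7) ^ ror32_ref((x), 18) ^ ((x) >> 3))
        #define sigma1(x)       (ror32_ref((x), 17) ^ ror32_ref((x), 19) ^ ((x) >> 10))
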
diff --git a/arch/arm/lib/crypto/sha256-ce.S b/arch/arm/lib/crypto/sha256-ce.S
deleted file mode 100644 (file)
index ac2c9b0..0000000
+++ /dev/null
@@ -1,123 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
- *
- * Copyright (C) 2015 Linaro Ltd.
- * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-       .text
-       .arch           armv8-a
-       .fpu            crypto-neon-fp-armv8
-
-       k0              .req    q7
-       k1              .req    q8
-       rk              .req    r3
-
-       ta0             .req    q9
-       ta1             .req    q10
-       tb0             .req    q10
-       tb1             .req    q9
-
-       dga             .req    q11
-       dgb             .req    q12
-
-       dg0             .req    q13
-       dg1             .req    q14
-       dg2             .req    q15
-
-       .macro          add_only, ev, s0
-       vmov            dg2, dg0
-       .ifnb           \s0
-       vld1.32         {k\ev}, [rk, :128]!
-       .endif
-       sha256h.32      dg0, dg1, tb\ev
-       sha256h2.32     dg1, dg2, tb\ev
-       .ifnb           \s0
-       vadd.u32        ta\ev, q\s0, k\ev
-       .endif
-       .endm
-
-       .macro          add_update, ev, s0, s1, s2, s3
-       sha256su0.32    q\s0, q\s1
-       add_only        \ev, \s1
-       sha256su1.32    q\s0, q\s2, q\s3
-       .endm
-
-       .align          6
-.Lsha256_rcon:
-       .word           0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
-       .word           0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
-       .word           0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
-       .word           0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
-       .word           0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
-       .word           0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
-       .word           0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
-       .word           0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
-       .word           0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
-       .word           0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
-       .word           0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
-       .word           0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
-       .word           0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
-       .word           0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
-       .word           0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
-       .word           0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-
-       /*
-        * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
-        *                          const u8 *data, size_t nblocks);
-        */
-ENTRY(sha256_ce_transform)
-       /* load state */
-       vld1.32         {dga-dgb}, [r0]
-
-       /* load input */
-0:     vld1.32         {q0-q1}, [r1]!
-       vld1.32         {q2-q3}, [r1]!
-       subs            r2, r2, #1
-
-#ifndef CONFIG_CPU_BIG_ENDIAN
-       vrev32.8        q0, q0
-       vrev32.8        q1, q1
-       vrev32.8        q2, q2
-       vrev32.8        q3, q3
-#endif
-
-       /* load first round constant */
-       adr             rk, .Lsha256_rcon
-       vld1.32         {k0}, [rk, :128]!
-
-       vadd.u32        ta0, q0, k0
-       vmov            dg0, dga
-       vmov            dg1, dgb
-
-       add_update      1, 0, 1, 2, 3
-       add_update      0, 1, 2, 3, 0
-       add_update      1, 2, 3, 0, 1
-       add_update      0, 3, 0, 1, 2
-       add_update      1, 0, 1, 2, 3
-       add_update      0, 1, 2, 3, 0
-       add_update      1, 2, 3, 0, 1
-       add_update      0, 3, 0, 1, 2
-       add_update      1, 0, 1, 2, 3
-       add_update      0, 1, 2, 3, 0
-       add_update      1, 2, 3, 0, 1
-       add_update      0, 3, 0, 1, 2
-
-       add_only        1, 1
-       add_only        0, 2
-       add_only        1, 3
-       add_only        0
-
-       /* update state */
-       vadd.u32        dga, dga, dg0
-       vadd.u32        dgb, dgb, dg1
-       bne             0b
-
-       /* store new state */
-       vst1.32         {dga-dgb}, [r0]
-       bx              lr
-ENDPROC(sha256_ce_transform)
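
The prototype in the comment above is the whole contract: the routine consumes nblocks complete 64-byte blocks and may only run while kernel-mode NEON is enabled, since the Crypto Extensions operate on NEON registers. A hedged caller sketch (the wrapper name sha256_ce_blocks() is hypothetical; the real dispatch, including the plain-NEON fallback, is in the sha256.c glue in the next hunk):

        #include <linux/linkage.h>
        #include <asm/neon.h>
        #include <crypto/internal/sha2.h>

        asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
                                            const u8 *data, size_t nblocks);

        /* Wrap the Crypto Extensions routine in a kernel-mode NEON section. */
        static void sha256_ce_blocks(u32 state[SHA256_STATE_WORDS],
                                     const u8 *data, size_t nblocks)
        {
                kernel_neon_begin();
                sha256_ce_transform(state, data, nblocks);
                kernel_neon_end();
        }
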
diff --git a/arch/arm/lib/crypto/sha256.c b/arch/arm/lib/crypto/sha256.c
deleted file mode 100644 (file)
index 109192e..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for ARM
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/neon.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
-                                  const u8 *data, size_t nblocks);
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
-                                            const u8 *data, size_t nblocks);
-asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
-                                   const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
-                       const u8 *data, size_t nblocks)
-{
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-           static_branch_likely(&have_neon)) {
-               kernel_neon_begin();
-               if (static_branch_likely(&have_ce))
-                       sha256_ce_transform(state, data, nblocks);
-               else
-                       sha256_block_data_order_neon(state, data, nblocks);
-               kernel_neon_end();
-       } else {
-               sha256_blocks_arch(state, data, nblocks);
-       }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-bool sha256_is_arch_optimized(void)
-{
-       /* We always can use at least the ARM scalar implementation. */
-       return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_arm_mod_init(void)
-{
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
-               static_branch_enable(&have_neon);
-               if (elf_hwcap2 & HWCAP2_SHA2)
-                       static_branch_enable(&have_ce);
-       }
-       return 0;
-}
-subsys_initcall(sha256_arm_mod_init);
-
-static void __exit sha256_arm_mod_exit(void)
-{
-}
-module_exit(sha256_arm_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for ARM");
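
These exports are not called by users directly; they back the generic SHA-256 library interface in lib/crypto. As a usage note, a consumer would typically go through the one-shot helper (a sketch assuming the sha256() helper and SHA256_DIGEST_SIZE from <crypto/sha2.h>; on ARM the block processing then lands in the routines exported above):

        #include <crypto/sha2.h>

        static void sha256_demo(void)
        {
                static const u8 msg[] = "abc";
                u8 digest[SHA256_DIGEST_SIZE];

                /* Dispatches to the scalar, NEON or CE block function at run time.
                 * digest ends up holding the well-known SHA-256("abc") value
                 * ba7816bf...f20015ad.
                 */
                sha256(msg, sizeof(msg) - 1, digest);
        }
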
index dce127a69f1310e25426af9e7780f8f51ed25021..e14bef8e87af249c58959035413ec301467c6750 100644 (file)
@@ -190,7 +190,7 @@ config CRYPTO_LIB_SM3
 
 if !KMSAN # avoid false positives from assembly
 if ARM
-source "arch/arm/lib/crypto/Kconfig"
+source "lib/crypto/arm/Kconfig"
 endif
 if ARM64
 source "arch/arm64/lib/crypto/Kconfig"
index aaf445a85384cef63552c9c08319478112300675..5f2b81f82a85d62cef781a06da79155d278dbe6d 100644 (file)
@@ -106,3 +106,5 @@ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL)         += simd.o
 
 obj-$(CONFIG_CRYPTO_LIB_SM3)                   += libsm3.o
 libsm3-y                                       := sm3.o
+
+obj-$(CONFIG_ARM) += arm/
index 670a4d97b5684051b679cac0b1b70cc2ffc2605c..f6c4e8ef80dae9943f0e935373396c0306ea1614 100644 (file)
@@ -1,2 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
 sha512-core.S
diff --git a/lib/crypto/arm/Kconfig b/lib/crypto/arm/Kconfig
new file mode 100644 (file)
index 0000000..d1ad664
--- /dev/null
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_BLAKE2S_ARM
+       bool "Hash functions: BLAKE2s"
+       select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+       help
+         BLAKE2s cryptographic hash function (RFC 7693)
+
+         Architecture: arm
+
+         This is faster than the generic implementations of BLAKE2s and
+         BLAKE2b, but slower than the NEON implementation of BLAKE2b.
+         There is no NEON implementation of BLAKE2s, since NEON doesn't
+         really help with it.
+
+config CRYPTO_CHACHA20_NEON
+       tristate
+       default CRYPTO_LIB_CHACHA
+       select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_ARM
+       tristate
+       default CRYPTO_LIB_POLY1305
+       select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
+config CRYPTO_SHA256_ARM
+       tristate
+       depends on !CPU_V7M
+       default CRYPTO_LIB_SHA256
+       select CRYPTO_ARCH_HAVE_LIB_SHA256
+       select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/lib/crypto/arm/Makefile b/lib/crypto/arm/Makefile
new file mode 100644 (file)
index 0000000..431f77c
--- /dev/null
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
+libblake2s-arm-y := blake2s-core.o blake2s-glue.o
+
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+chacha-neon-y := chacha-scalar-core.o chacha-glue.o
+chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
+poly1305-arm-y := poly1305-core.o poly1305-glue.o
+
+obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
+sha256-arm-y := sha256.o sha256-core.o
+sha256-arm-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
+
+quiet_cmd_perl = PERL    $@
+      cmd_perl = $(PERL) $(<) > $(@)
+
+$(obj)/%-core.S: $(src)/%-armv4.pl
+       $(call cmd,perl)
+
+clean-files += poly1305-core.S sha256-core.S
+
+aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
+
+# massage the perlasm code a bit so we only get the NEON routine if we need it
+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
+AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
+
+AFLAGS_sha256-core.o += $(aflags-thumb2-y)
diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S
new file mode 100644 (file)
index 0000000..df40e46
--- /dev/null
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2s digest algorithm, ARM scalar implementation
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       // Registers used to hold message words temporarily.  There aren't
+       // enough ARM registers to hold the whole message block, so we have to
+       // load the words on-demand.
+       M_0             .req    r12
+       M_1             .req    r14
+
+// The BLAKE2s initialization vector
+.Lblake2s_IV:
+       .word   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+       .word   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+
+.macro __ldrd          a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+       ldrd            \a, \b, [\src, #\offset]
+#else
+       ldr             \a, [\src, #\offset]
+       ldr             \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd          a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+       strd            \a, \b, [\dst, #\offset]
+#else
+       str             \a, [\dst, #\offset]
+       str             \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _le32_bswap     a, tmp
+#ifdef __ARMEB__
+       rev_l           \a, \tmp
+#endif
+.endm
+
+.macro _le32_bswap_8x  a, b, c, d, e, f, g, h,  tmp
+       _le32_bswap     \a, \tmp
+       _le32_bswap     \b, \tmp
+       _le32_bswap     \c, \tmp
+       _le32_bswap     \d, \tmp
+       _le32_bswap     \e, \tmp
+       _le32_bswap     \f, \tmp
+       _le32_bswap     \g, \tmp
+       _le32_bswap     \h, \tmp
+.endm
+
+// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
+// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
+// columns/diagonals.  s0-s1 are the word offsets to the message words the first
+// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
+// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
+//
+// Note that to save instructions, the rotations don't happen when the
+// pseudocode says they should, but rather they are delayed until the values are
+// used.  See the comment above _blake2s_round().
+.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3
+
+       ldr             M_0, [sp, #32 + 4 * \s0]
+       ldr             M_1, [sp, #32 + 4 * \s2]
+
+       // a += b + m[blake2s_sigma[r][2*i + 0]];
+       add             \a0, \a0, \b0, ror #brot
+       add             \a1, \a1, \b1, ror #brot
+       add             \a0, \a0, M_0
+       add             \a1, \a1, M_1
+
+       // d = ror32(d ^ a, 16);
+       eor             \d0, \a0, \d0, ror #drot
+       eor             \d1, \a1, \d1, ror #drot
+
+       // c += d;
+       add             \c0, \c0, \d0, ror #16
+       add             \c1, \c1, \d1, ror #16
+
+       // b = ror32(b ^ c, 12);
+       eor             \b0, \c0, \b0, ror #brot
+       eor             \b1, \c1, \b1, ror #brot
+
+       ldr             M_0, [sp, #32 + 4 * \s1]
+       ldr             M_1, [sp, #32 + 4 * \s3]
+
+       // a += b + m[blake2s_sigma[r][2*i + 1]];
+       add             \a0, \a0, \b0, ror #12
+       add             \a1, \a1, \b1, ror #12
+       add             \a0, \a0, M_0
+       add             \a1, \a1, M_1
+
+       // d = ror32(d ^ a, 8);
+       eor             \d0, \a0, \d0, ror#16
+       eor             \d1, \a1, \d1, ror#16
+
+       // c += d;
+       add             \c0, \c0, \d0, ror#8
+       add             \c1, \c1, \d1, ror#8
+
+       // b = ror32(b ^ c, 7);
+       eor             \b0, \c0, \b0, ror#12
+       eor             \b1, \c1, \b1, ror#12
+.endm
+
+// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
+// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
+// spilling v[8..9], then to v[10..15], then to the message block.  r10-r12 and
+// r14 are free to use.  The macro arguments s0-s15 give the order in which the
+// message words are used in this round.
+//
+// All rotates are performed using the implicit rotate operand accepted by the
+// 'add' and 'eor' instructions.  This is faster than using explicit rotate
+// instructions.  To make this work, we allow the values in the second and last
+// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
+// wrong rotation amount.  The rotation amount is then fixed up just in time
+// when the values are used.  'brot' is the number of bits the values in row 'b'
+// need to be rotated right to arrive at the correct values, and 'drot'
+// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
+// that they end up as (7, 8) after every round.
+.macro _blake2s_round  s0, s1, s2, s3, s4, s5, s6, s7, \
+                       s8, s9, s10, s11, s12, s13, s14, s15
+
+       // Mix first two columns:
+       // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
+       __ldrd          r10, r11, sp, 16        // load v[12] and v[13]
+       _blake2s_quarterround   r0, r4, r8, r10,  r1, r5, r9, r11, \
+                               \s0, \s1, \s2, \s3
+       __strd          r8, r9, sp, 0
+       __strd          r10, r11, sp, 16
+
+       // Mix second two columns:
+       // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
+       __ldrd          r8, r9, sp, 8           // load v[10] and v[11]
+       __ldrd          r10, r11, sp, 24        // load v[14] and v[15]
+       _blake2s_quarterround   r2, r6, r8, r10,  r3, r7, r9, r11, \
+                               \s4, \s5, \s6, \s7
+       str             r10, [sp, #24]          // store v[14]
+       // v[10], v[11], and v[15] are used below, so no need to store them yet.
+
+       .set brot, 7
+       .set drot, 8
+
+       // Mix first two diagonals:
+       // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
+       ldr             r10, [sp, #16]          // load v[12]
+       _blake2s_quarterround   r0, r5, r8, r11,  r1, r6, r9, r10, \
+                               \s8, \s9, \s10, \s11
+       __strd          r8, r9, sp, 8
+       str             r11, [sp, #28]
+       str             r10, [sp, #16]
+
+       // Mix second two diagonals:
+       // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
+       __ldrd          r8, r9, sp, 0           // load v[8] and v[9]
+       __ldrd          r10, r11, sp, 20        // load v[13] and v[14]
+       _blake2s_quarterround   r2, r7, r8, r10,  r3, r4, r9, r11, \
+                               \s12, \s13, \s14, \s15
+       __strd          r10, r11, sp, 20
+.endm
+
+//
+// void blake2s_compress(struct blake2s_state *state,
+//                      const u8 *block, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_state are used:
+//     u32 h[8];       (inout)
+//     u32 t[2];       (inout)
+//     u32 f[2];       (in)
+//
+       .align          5
+ENTRY(blake2s_compress)
+       push            {r0-r2,r4-r11,lr}       // keep this an even number
+
+.Lnext_block:
+       // r0 is 'state'
+       // r1 is 'block'
+       // r3 is 'inc'
+
+       // Load and increment the counter t[0..1].
+       __ldrd          r10, r11, r0, 32
+       adds            r10, r10, r3
+       adc             r11, r11, #0
+       __strd          r10, r11, r0, 32
+
+       // _blake2s_round is very short on registers, so copy the message block
+       // to the stack to save a register during the rounds.  This also has the
+       // advantage that misalignment only needs to be dealt with in one place.
+       sub             sp, sp, #64
+       mov             r12, sp
+       tst             r1, #3
+       bne             .Lcopy_block_misaligned
+       ldmia           r1!, {r2-r9}
+       _le32_bswap_8x  r2, r3, r4, r5, r6, r7, r8, r9,  r14
+       stmia           r12!, {r2-r9}
+       ldmia           r1!, {r2-r9}
+       _le32_bswap_8x  r2, r3, r4, r5, r6, r7, r8, r9,  r14
+       stmia           r12, {r2-r9}
+.Lcopy_block_done:
+       str             r1, [sp, #68]           // Update message pointer
+
+       // Calculate v[8..15].  Push v[10..15] onto the stack, and leave space
+       // for spilling v[8..9].  Leave v[8..9] in r8-r9.
+       mov             r14, r0                 // r14 = state
+       adr             r12, .Lblake2s_IV
+       ldmia           r12!, {r8-r9}           // load IV[0..1]
+       __ldrd          r0, r1, r14, 40         // load f[0..1]
+       ldm             r12, {r2-r7}            // load IV[2..7]
+       eor             r4, r4, r10             // v[12] = IV[4] ^ t[0]
+       eor             r5, r5, r11             // v[13] = IV[5] ^ t[1]
+       eor             r6, r6, r0              // v[14] = IV[6] ^ f[0]
+       eor             r7, r7, r1              // v[15] = IV[7] ^ f[1]
+       push            {r2-r7}                 // push v[10..15]
+       sub             sp, sp, #8              // leave space for v[8..9]
+
+       // Load h[0..7] == v[0..7].
+       ldm             r14, {r0-r7}
+
+       // Execute the rounds.  Each round is provided the order in which it
+       // needs to use the message words.
+       .set brot, 0
+       .set drot, 0
+       _blake2s_round  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+       _blake2s_round  14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+       _blake2s_round  11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+       _blake2s_round  7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+       _blake2s_round  9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+       _blake2s_round  2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+       _blake2s_round  12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+       _blake2s_round  13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+       _blake2s_round  6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+       _blake2s_round  10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+
+       // Fold the final state matrix into the hash chaining value:
+       //
+       //      for (i = 0; i < 8; i++)
+       //              h[i] ^= v[i] ^ v[i + 8];
+       //
+       ldr             r14, [sp, #96]          // r14 = &h[0]
+       add             sp, sp, #8              // v[8..9] are already loaded.
+       pop             {r10-r11}               // load v[10..11]
+       eor             r0, r0, r8
+       eor             r1, r1, r9
+       eor             r2, r2, r10
+       eor             r3, r3, r11
+       ldm             r14, {r8-r11}           // load h[0..3]
+       eor             r0, r0, r8
+       eor             r1, r1, r9
+       eor             r2, r2, r10
+       eor             r3, r3, r11
+       stmia           r14!, {r0-r3}           // store new h[0..3]
+       ldm             r14, {r0-r3}            // load old h[4..7]
+       pop             {r8-r11}                // load v[12..15]
+       eor             r0, r0, r4, ror #brot
+       eor             r1, r1, r5, ror #brot
+       eor             r2, r2, r6, ror #brot
+       eor             r3, r3, r7, ror #brot
+       eor             r0, r0, r8, ror #drot
+       eor             r1, r1, r9, ror #drot
+       eor             r2, r2, r10, ror #drot
+       eor             r3, r3, r11, ror #drot
+         add           sp, sp, #64             // skip copy of message block
+       stm             r14, {r0-r3}            // store new h[4..7]
+
+       // Advance to the next block, if there is one.  Note that if there are
+       // multiple blocks, then 'inc' (the counter increment amount) must be
+       // 64.  So we can simply set it to 64 without re-loading it.
+       ldm             sp, {r0, r1, r2}        // load (state, block, nblocks)
+       mov             r3, #64                 // set 'inc'
+       subs            r2, r2, #1              // nblocks--
+       str             r2, [sp, #8]
+       bne             .Lnext_block            // nblocks != 0?
+
+       pop             {r0-r2,r4-r11,pc}
+
+       // The next message block (pointed to by r1) isn't 4-byte aligned, so it
+       // can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
+       // by r12) using an alternative method.  r2-r9 are free to use.
+.Lcopy_block_misaligned:
+       mov             r2, #64
+1:
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       ldr             r3, [r1], #4
+       _le32_bswap     r3, r4
+#else
+       ldrb            r3, [r1, #0]
+       ldrb            r4, [r1, #1]
+       ldrb            r5, [r1, #2]
+       ldrb            r6, [r1, #3]
+       add             r1, r1, #4
+       orr             r3, r3, r4, lsl #8
+       orr             r3, r3, r5, lsl #16
+       orr             r3, r3, r6, lsl #24
+#endif
+       subs            r2, r2, #4
+       str             r3, [r12], #4
+       bne             1b
+       b               .Lcopy_block_done
+ENDPROC(blake2s_compress)
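
The delayed-rotation scheme documented above changes only when the rotates are applied, not what is computed. As a point of reference, a plain C version of the BLAKE2s mixing function G from RFC 7693, of which each _blake2s_quarterround above performs two in parallel (sketch only with illustrative names, not part of this patch):

        #include <linux/types.h>

        static inline u32 ror32_ref(u32 v, unsigned int n)
        {
                return (v >> n) | (v << (32 - n));
        }

        /* Mix one column or diagonal of v[]; x and y are the two message
         * words selected by blake2s_sigma for this position in the round.
         */
        static void blake2s_g_ref(u32 v[16], int a, int b, int c, int d, u32 x, u32 y)
        {
                v[a] += v[b] + x;
                v[d] = ror32_ref(v[d] ^ v[a], 16);
                v[c] += v[d];
                v[b] = ror32_ref(v[b] ^ v[c], 12);
                v[a] += v[b] + y;
                v[d] = ror32_ref(v[d] ^ v[a], 8);
                v[c] += v[d];
                v[b] = ror32_ref(v[b] ^ v[c], 7);
        }
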
diff --git a/lib/crypto/arm/blake2s-glue.c b/lib/crypto/arm/blake2s-glue.c
new file mode 100644 (file)
index 0000000..0238a70
--- /dev/null
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <crypto/internal/blake2s.h>
+#include <linux/module.h>
+
+/* defined in blake2s-core.S */
+EXPORT_SYMBOL(blake2s_compress);
diff --git a/lib/crypto/arm/chacha-glue.c b/lib/crypto/arm/chacha-glue.c
new file mode 100644 (file)
index 0000000..88ec964
--- /dev/null
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ChaCha and HChaCha functions (ARM optimized)
+ *
+ * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/cputype.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+                                     u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+                                      u8 *dst, const u8 *src,
+                                      int nrounds, unsigned int nbytes);
+asmlinkage void hchacha_block_arm(const struct chacha_state *state,
+                                 u32 out[HCHACHA_OUT_WORDS], int nrounds);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+                                  u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+                            const struct chacha_state *state, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
+
+static inline bool neon_usable(void)
+{
+       return static_branch_likely(&use_neon) && crypto_simd_usable();
+}
+
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+                         unsigned int bytes, int nrounds)
+{
+       u8 buf[CHACHA_BLOCK_SIZE];
+
+       while (bytes > CHACHA_BLOCK_SIZE) {
+               unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+               chacha_4block_xor_neon(state, dst, src, nrounds, l);
+               bytes -= l;
+               src += l;
+               dst += l;
+               state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+       }
+       if (bytes) {
+               const u8 *s = src;
+               u8 *d = dst;
+
+               if (bytes != CHACHA_BLOCK_SIZE)
+                       s = d = memcpy(buf, src, bytes);
+               chacha_block_xor_neon(state, d, s, nrounds);
+               if (d != dst)
+                       memcpy(dst, buf, bytes);
+               state->x[12]++;
+       }
+}
+
+void hchacha_block_arch(const struct chacha_state *state,
+                       u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+       if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
+               hchacha_block_arm(state, out, nrounds);
+       } else {
+               kernel_neon_begin();
+               hchacha_block_neon(state, out, nrounds);
+               kernel_neon_end();
+       }
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+                      unsigned int bytes, int nrounds)
+{
+       if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
+           bytes <= CHACHA_BLOCK_SIZE) {
+               chacha_doarm(dst, src, bytes, state, nrounds);
+               state->x[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
+               return;
+       }
+
+       do {
+               unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+               kernel_neon_begin();
+               chacha_doneon(state, dst, src, todo, nrounds);
+               kernel_neon_end();
+
+               bytes -= todo;
+               src += todo;
+               dst += todo;
+       } while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+       /* We always can use at least the ARM scalar implementation. */
+       return true;
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+static int __init chacha_arm_mod_init(void)
+{
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+               switch (read_cpuid_part()) {
+               case ARM_CPU_PART_CORTEX_A7:
+               case ARM_CPU_PART_CORTEX_A5:
+                       /*
+                        * The Cortex-A7 and Cortex-A5 do not perform well with
+                        * the NEON implementation, but do remarkably well with
+                        * the scalar one and use less power.
+                        */
+                       break;
+               default:
+                       static_branch_enable(&use_neon);
+               }
+       }
+       return 0;
+}
+subsys_initcall(chacha_arm_mod_init);
+
+static void __exit chacha_arm_mod_exit(void)
+{
+}
+module_exit(chacha_arm_mod_exit);
+
+MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/arm/chacha-neon-core.S b/lib/crypto/arm/chacha-neon-core.S
new file mode 100644 (file)
index 0000000..ddd62b6
--- /dev/null
@@ -0,0 +1,643 @@
+/*
+ * ChaCha/HChaCha NEON helper functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+ /*
+  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
+  *
+  * (a)  vshl.u32 + vsri.u32           (needs temporary register)
+  * (b)  vshl.u32 + vshr.u32 + vorr    (needs temporary register)
+  * (c)  vrev32.16                     (16-bit rotations only)
+  * (d)  vtbl.8 + vtbl.8               (multiple of 8 bits rotations only,
+  *                                     needs index vector)
+  *
+  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
+  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
+  * cycles of (b) on both Cortex-A7 and Cortex-A53.
+  *
+  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+  * and doesn't need a temporary register.
+  *
+  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
+  * is twice as fast as (a), even when doing (a) on multiple registers
+  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
+  * parallelizes better when temporary registers are scarce.
+  *
+  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+  * (a), so the need to load the rotation table actually makes the vtbl method
+  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
+  * seems to be a good compromise to get a more significant speed boost on some
+  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+  */
+
+#include <linux/linkage.h>
+#include <asm/cache.h>
+
+       .text
+       .fpu            neon
+       .align          5
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in r3.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha_permute:
+
+       adr             ip, .Lrol8_table
+       vld1.8          {d10}, [ip, :64]
+
+.Ldoubleround:
+       // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       vadd.i32        q0, q0, q1
+       veor            q3, q3, q0
+       vrev32.16       q3, q3
+
+       // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       vadd.i32        q2, q2, q3
+       veor            q4, q1, q2
+       vshl.u32        q1, q4, #12
+       vsri.u32        q1, q4, #20
+
+       // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       vadd.i32        q0, q0, q1
+       veor            q3, q3, q0
+       vtbl.8          d6, {d6}, d10
+       vtbl.8          d7, {d7}, d10
+
+       // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       vadd.i32        q2, q2, q3
+       veor            q4, q1, q2
+       vshl.u32        q1, q4, #7
+       vsri.u32        q1, q4, #25
+
+       // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+       vext.8          q1, q1, q1, #4
+       // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       vext.8          q2, q2, q2, #8
+       // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+       vext.8          q3, q3, q3, #12
+
+       // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       vadd.i32        q0, q0, q1
+       veor            q3, q3, q0
+       vrev32.16       q3, q3
+
+       // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       vadd.i32        q2, q2, q3
+       veor            q4, q1, q2
+       vshl.u32        q1, q4, #12
+       vsri.u32        q1, q4, #20
+
+       // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       vadd.i32        q0, q0, q1
+       veor            q3, q3, q0
+       vtbl.8          d6, {d6}, d10
+       vtbl.8          d7, {d7}, d10
+
+       // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       vadd.i32        q2, q2, q3
+       veor            q4, q1, q2
+       vshl.u32        q1, q4, #7
+       vsri.u32        q1, q4, #25
+
+       // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+       vext.8          q1, q1, q1, #12
+       // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       vext.8          q2, q2, q2, #8
+       // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+       vext.8          q3, q3, q3, #4
+
+       subs            r3, r3, #2
+       bne             .Ldoubleround
+
+       bx              lr
+ENDPROC(chacha_permute)
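
The vrev32.16, vshl+vsri and vtbl sequences discussed at the top of this file are just different ways to realize the four rotates of the ChaCha quarter round, which .Ldoubleround applies to four state words at a time. A scalar C reference per RFC 7539 (sketch only, helper names illustrative):

        #include <linux/types.h>

        static inline u32 rotl32_ref(u32 v, unsigned int n)
        {
                return (v << n) | (v >> (32 - n));
        }

        /* One ChaCha quarter round on state words a, b, c, d. */
        static void chacha_qr_ref(u32 *a, u32 *b, u32 *c, u32 *d)
        {
                *a += *b; *d ^= *a; *d = rotl32_ref(*d, 16);
                *c += *d; *b ^= *c; *b = rotl32_ref(*b, 12);
                *a += *b; *d ^= *a; *d = rotl32_ref(*d, 8);
                *c += *d; *b ^= *c; *b = rotl32_ref(*b, 7);
        }
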
+
+ENTRY(chacha_block_xor_neon)
+       // r0: Input state matrix, s
+       // r1: 1 data block output, o
+       // r2: 1 data block input, i
+       // r3: nrounds
+       push            {lr}
+
+       // x0..3 = s0..3
+       add             ip, r0, #0x20
+       vld1.32         {q0-q1}, [r0]
+       vld1.32         {q2-q3}, [ip]
+
+       vmov            q8, q0
+       vmov            q9, q1
+       vmov            q10, q2
+       vmov            q11, q3
+
+       bl              chacha_permute
+
+       add             ip, r2, #0x20
+       vld1.8          {q4-q5}, [r2]
+       vld1.8          {q6-q7}, [ip]
+
+       // o0 = i0 ^ (x0 + s0)
+       vadd.i32        q0, q0, q8
+       veor            q0, q0, q4
+
+       // o1 = i1 ^ (x1 + s1)
+       vadd.i32        q1, q1, q9
+       veor            q1, q1, q5
+
+       // o2 = i2 ^ (x2 + s2)
+       vadd.i32        q2, q2, q10
+       veor            q2, q2, q6
+
+       // o3 = i3 ^ (x3 + s3)
+       vadd.i32        q3, q3, q11
+       veor            q3, q3, q7
+
+       add             ip, r1, #0x20
+       vst1.8          {q0-q1}, [r1]
+       vst1.8          {q2-q3}, [ip]
+
+       pop             {pc}
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+       // r0: Input state matrix, s
+       // r1: output (8 32-bit words)
+       // r2: nrounds
+       push            {lr}
+
+       vld1.32         {q0-q1}, [r0]!
+       vld1.32         {q2-q3}, [r0]
+
+       mov             r3, r2
+       bl              chacha_permute
+
+       vst1.32         {q0}, [r1]!
+       vst1.32         {q3}, [r1]
+
+       pop             {pc}
+ENDPROC(hchacha_block_neon)
+
+       .align          4
+.Lctrinc:      .word   0, 1, 2, 3
+.Lrol8_table:  .byte   3, 0, 1, 2, 7, 4, 5, 6
+
+       .align          5
+ENTRY(chacha_4block_xor_neon)
+       push            {r4, lr}
+       mov             r4, sp                  // preserve the stack pointer
+       sub             ip, sp, #0x20           // allocate a 32 byte buffer
+       bic             ip, ip, #0x1f           // aligned to 32 bytes
+       mov             sp, ip
+
+       // r0: Input state matrix, s
+       // r1: 4 data blocks output, o
+       // r2: 4 data blocks input, i
+       // r3: nrounds
+
+       //
+       // This function encrypts four consecutive ChaCha blocks by loading
+       // the state matrix in NEON registers four times. The algorithm performs
+       // each operation on the corresponding word of each state matrix, hence
+       // requires no word shuffling. The words are re-interleaved before the
+       // final addition of the original state and the XORing step.
+       //
+
+       // x0..15[0-3] = s0..15[0-3]
+       add             ip, r0, #0x20
+       vld1.32         {q0-q1}, [r0]
+       vld1.32         {q2-q3}, [ip]
+
+       adr             lr, .Lctrinc
+       vdup.32         q15, d7[1]
+       vdup.32         q14, d7[0]
+       vld1.32         {q4}, [lr, :128]
+       vdup.32         q13, d6[1]
+       vdup.32         q12, d6[0]
+       vdup.32         q11, d5[1]
+       vdup.32         q10, d5[0]
+       vadd.u32        q12, q12, q4            // x12 += counter values 0-3
+       vdup.32         q9, d4[1]
+       vdup.32         q8, d4[0]
+       vdup.32         q7, d3[1]
+       vdup.32         q6, d3[0]
+       vdup.32         q5, d2[1]
+       vdup.32         q4, d2[0]
+       vdup.32         q3, d1[1]
+       vdup.32         q2, d1[0]
+       vdup.32         q1, d0[1]
+       vdup.32         q0, d0[0]
+
+       adr             ip, .Lrol8_table
+       b               1f
+
+.Ldoubleround4:
+       vld1.32         {q8-q9}, [sp, :256]
+1:
+       // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+       // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+       // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+       // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+       vadd.i32        q0, q0, q4
+       vadd.i32        q1, q1, q5
+       vadd.i32        q2, q2, q6
+       vadd.i32        q3, q3, q7
+
+       veor            q12, q12, q0
+       veor            q13, q13, q1
+       veor            q14, q14, q2
+       veor            q15, q15, q3
+
+       vrev32.16       q12, q12
+       vrev32.16       q13, q13
+       vrev32.16       q14, q14
+       vrev32.16       q15, q15
+
+       // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+       // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+       // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+       // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+       vadd.i32        q8, q8, q12
+       vadd.i32        q9, q9, q13
+       vadd.i32        q10, q10, q14
+       vadd.i32        q11, q11, q15
+
+       vst1.32         {q8-q9}, [sp, :256]
+
+       veor            q8, q4, q8
+       veor            q9, q5, q9
+       vshl.u32        q4, q8, #12
+       vshl.u32        q5, q9, #12
+       vsri.u32        q4, q8, #20
+       vsri.u32        q5, q9, #20
+
+       veor            q8, q6, q10
+       veor            q9, q7, q11
+       vshl.u32        q6, q8, #12
+       vshl.u32        q7, q9, #12
+       vsri.u32        q6, q8, #20
+       vsri.u32        q7, q9, #20
+
+       // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+       // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+       // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+       // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+       vld1.8          {d16}, [ip, :64]
+       vadd.i32        q0, q0, q4
+       vadd.i32        q1, q1, q5
+       vadd.i32        q2, q2, q6
+       vadd.i32        q3, q3, q7
+
+       veor            q12, q12, q0
+       veor            q13, q13, q1
+       veor            q14, q14, q2
+       veor            q15, q15, q3
+
+       vtbl.8          d24, {d24}, d16
+       vtbl.8          d25, {d25}, d16
+       vtbl.8          d26, {d26}, d16
+       vtbl.8          d27, {d27}, d16
+       vtbl.8          d28, {d28}, d16
+       vtbl.8          d29, {d29}, d16
+       vtbl.8          d30, {d30}, d16
+       vtbl.8          d31, {d31}, d16
+
+       vld1.32         {q8-q9}, [sp, :256]
+
+       // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+       // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+       // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+       // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+       vadd.i32        q8, q8, q12
+       vadd.i32        q9, q9, q13
+       vadd.i32        q10, q10, q14
+       vadd.i32        q11, q11, q15
+
+       vst1.32         {q8-q9}, [sp, :256]
+
+       veor            q8, q4, q8
+       veor            q9, q5, q9
+       vshl.u32        q4, q8, #7
+       vshl.u32        q5, q9, #7
+       vsri.u32        q4, q8, #25
+       vsri.u32        q5, q9, #25
+
+       veor            q8, q6, q10
+       veor            q9, q7, q11
+       vshl.u32        q6, q8, #7
+       vshl.u32        q7, q9, #7
+       vsri.u32        q6, q8, #25
+       vsri.u32        q7, q9, #25
+
+       vld1.32         {q8-q9}, [sp, :256]
+
+       // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+       // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+       // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+       // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+       vadd.i32        q0, q0, q5
+       vadd.i32        q1, q1, q6
+       vadd.i32        q2, q2, q7
+       vadd.i32        q3, q3, q4
+
+       veor            q15, q15, q0
+       veor            q12, q12, q1
+       veor            q13, q13, q2
+       veor            q14, q14, q3
+
+       vrev32.16       q15, q15
+       vrev32.16       q12, q12
+       vrev32.16       q13, q13
+       vrev32.16       q14, q14
+
+       // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+       // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+       // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+       // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+       vadd.i32        q10, q10, q15
+       vadd.i32        q11, q11, q12
+       vadd.i32        q8, q8, q13
+       vadd.i32        q9, q9, q14
+
+       vst1.32         {q8-q9}, [sp, :256]
+
+       veor            q8, q7, q8
+       veor            q9, q4, q9
+       vshl.u32        q7, q8, #12
+       vshl.u32        q4, q9, #12
+       vsri.u32        q7, q8, #20
+       vsri.u32        q4, q9, #20
+
+       veor            q8, q5, q10
+       veor            q9, q6, q11
+       vshl.u32        q5, q8, #12
+       vshl.u32        q6, q9, #12
+       vsri.u32        q5, q8, #20
+       vsri.u32        q6, q9, #20
+
+       // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+       // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+       // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+       // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+       vld1.8          {d16}, [ip, :64]
+       vadd.i32        q0, q0, q5
+       vadd.i32        q1, q1, q6
+       vadd.i32        q2, q2, q7
+       vadd.i32        q3, q3, q4
+
+       veor            q15, q15, q0
+       veor            q12, q12, q1
+       veor            q13, q13, q2
+       veor            q14, q14, q3
+
+       vtbl.8          d30, {d30}, d16
+       vtbl.8          d31, {d31}, d16
+       vtbl.8          d24, {d24}, d16
+       vtbl.8          d25, {d25}, d16
+       vtbl.8          d26, {d26}, d16
+       vtbl.8          d27, {d27}, d16
+       vtbl.8          d28, {d28}, d16
+       vtbl.8          d29, {d29}, d16
+
+       vld1.32         {q8-q9}, [sp, :256]
+
+       // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+       // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+       // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+       // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+       vadd.i32        q10, q10, q15
+       vadd.i32        q11, q11, q12
+       vadd.i32        q8, q8, q13
+       vadd.i32        q9, q9, q14
+
+       vst1.32         {q8-q9}, [sp, :256]
+
+       veor            q8, q7, q8
+       veor            q9, q4, q9
+       vshl.u32        q7, q8, #7
+       vshl.u32        q4, q9, #7
+       vsri.u32        q7, q8, #25
+       vsri.u32        q4, q9, #25
+
+       veor            q8, q5, q10
+       veor            q9, q6, q11
+       vshl.u32        q5, q8, #7
+       vshl.u32        q6, q9, #7
+       vsri.u32        q5, q8, #25
+       vsri.u32        q6, q9, #25
+
+       subs            r3, r3, #2
+       bne             .Ldoubleround4
+
+       // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+       // x8..9[0-3] are on the stack.
+
+       // Re-interleave the words in the first two rows of each block (x0..7).
+       // Also add the counter values 0-3 to x12[0-3].
+         vld1.32       {q8}, [lr, :128]        // load counter values 0-3
+       vzip.32         q0, q1                  // => (0 1 0 1) (0 1 0 1)
+       vzip.32         q2, q3                  // => (2 3 2 3) (2 3 2 3)
+       vzip.32         q4, q5                  // => (4 5 4 5) (4 5 4 5)
+       vzip.32         q6, q7                  // => (6 7 6 7) (6 7 6 7)
+         vadd.u32      q12, q8                 // x12 += counter values 0-3
+       vswp            d1, d4
+       vswp            d3, d6
+         vld1.32       {q8-q9}, [r0]!          // load s0..7
+       vswp            d9, d12
+       vswp            d11, d14
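+       // The vzip/vswp sequence above transposes the 4x4 word groups: before
+       // it, each q register held one state word for all four blocks; after
+       // it, each of q0-q7 holds four consecutive words (one 16-byte row) of
+       // a single block, ready for the feed-forward addition and XOR below.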
+
+       // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+       // after XORing the first 32 bytes.
+       vswp            q1, q4
+
+       // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+       // x0..3[0-3] += s0..3[0-3]     (add orig state to 1st row of each block)
+       vadd.u32        q0, q0, q8
+       vadd.u32        q2, q2, q8
+       vadd.u32        q4, q4, q8
+       vadd.u32        q3, q3, q8
+
+       // x4..7[0-3] += s4..7[0-3]     (add orig state to 2nd row of each block)
+       vadd.u32        q1, q1, q9
+       vadd.u32        q6, q6, q9
+       vadd.u32        q5, q5, q9
+       vadd.u32        q7, q7, q9
+
+       // XOR first 32 bytes using keystream from first two rows of first block
+       vld1.8          {q8-q9}, [r2]!
+       veor            q8, q8, q0
+       veor            q9, q9, q1
+       vst1.8          {q8-q9}, [r1]!
+
+       // Re-interleave the words in the last two rows of each block (x8..15).
+       vld1.32         {q8-q9}, [sp, :256]
+         mov           sp, r4          // restore original stack pointer
+         ldr           r4, [r4, #8]    // load number of bytes
+       vzip.32         q12, q13        // => (12 13 12 13) (12 13 12 13)
+       vzip.32         q14, q15        // => (14 15 14 15) (14 15 14 15)
+       vzip.32         q8, q9          // => (8 9 8 9) (8 9 8 9)
+       vzip.32         q10, q11        // => (10 11 10 11) (10 11 10 11)
+         vld1.32       {q0-q1}, [r0]   // load s8..15
+       vswp            d25, d28
+       vswp            d27, d30
+       vswp            d17, d20
+       vswp            d19, d22
+
+       // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+       // x8..11[0-3] += s8..11[0-3]   (add orig state to 3rd row of each block)
+       vadd.u32        q8,  q8,  q0
+       vadd.u32        q10, q10, q0
+       vadd.u32        q9,  q9,  q0
+       vadd.u32        q11, q11, q0
+
+       // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+       vadd.u32        q12, q12, q1
+       vadd.u32        q14, q14, q1
+       vadd.u32        q13, q13, q1
+       vadd.u32        q15, q15, q1
+
+       // XOR the rest of the data with the keystream
+
+       vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #96
+       veor            q0, q0, q8
+       veor            q1, q1, q12
+       ble             .Lle96
+       vst1.8          {q0-q1}, [r1]!
+
+       vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #32
+       veor            q0, q0, q2
+       veor            q1, q1, q6
+       ble             .Lle128
+       vst1.8          {q0-q1}, [r1]!
+
+       vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #32
+       veor            q0, q0, q10
+       veor            q1, q1, q14
+       ble             .Lle160
+       vst1.8          {q0-q1}, [r1]!
+
+       vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #32
+       veor            q0, q0, q4
+       veor            q1, q1, q5
+       ble             .Lle192
+       vst1.8          {q0-q1}, [r1]!
+
+       vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #32
+       veor            q0, q0, q9
+       veor            q1, q1, q13
+       ble             .Lle224
+       vst1.8          {q0-q1}, [r1]!
+
+       vld1.8          {q0-q1}, [r2]!
+       subs            r4, r4, #32
+       veor            q0, q0, q3
+       veor            q1, q1, q7
+       blt             .Llt256
+.Lout:
+       vst1.8          {q0-q1}, [r1]!
+
+       vld1.8          {q0-q1}, [r2]
+       veor            q0, q0, q11
+       veor            q1, q1, q15
+       vst1.8          {q0-q1}, [r1]
+
+       pop             {r4, pc}
+
+.Lle192:
+       vmov            q4, q9
+       vmov            q5, q13
+
+.Lle160:
+       // nothing to do
+
+.Lfinalblock:
+       // Process the final block if processing less than 4 full blocks.
+       // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
+       // previous 32 byte output block that still needs to be written at
+       // [r1] in q0-q1.
+       beq             .Lfullblock
+
+.Lpartialblock:
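+       // Handle a partial final block without a byte-wise tail loop: rotate
+       // the 32 bytes of keystream in q4-q5 with vtbl, using indices taken
+       // from .Lpermute at an offset derived from the remaining byte count,
+       // so that they line up with the last 32 bytes of the message.  Then
+       // XOR and store those 32 bytes ending exactly at the final byte; the
+       // store overlaps the previous 32-byte block, which is rewritten with
+       // its correct contents from q0-q1 immediately afterwards.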
+       adr             lr, .Lpermute + 32
+       add             r2, r2, r4
+       add             lr, lr, r4
+       add             r4, r4, r1
+
+       vld1.8          {q2-q3}, [lr]
+       vld1.8          {q6-q7}, [r2]
+
+       add             r4, r4, #32
+
+       vtbl.8          d4, {q4-q5}, d4
+       vtbl.8          d5, {q4-q5}, d5
+       vtbl.8          d6, {q4-q5}, d6
+       vtbl.8          d7, {q4-q5}, d7
+
+       veor            q6, q6, q2
+       veor            q7, q7, q3
+
+       vst1.8          {q6-q7}, [r4]   // overlapping stores
+       vst1.8          {q0-q1}, [r1]
+       pop             {r4, pc}
+
+.Lfullblock:
+       vmov            q11, q4
+       vmov            q15, q5
+       b               .Lout
+.Lle96:
+       vmov            q4, q2
+       vmov            q5, q6
+       b               .Lfinalblock
+.Lle128:
+       vmov            q4, q10
+       vmov            q5, q14
+       b               .Lfinalblock
+.Lle224:
+       vmov            q4, q3
+       vmov            q5, q7
+       b               .Lfinalblock
+.Llt256:
+       vmov            q4, q11
+       vmov            q5, q15
+       b               .Lpartialblock
+ENDPROC(chacha_4block_xor_neon)
+
+       .align          L1_CACHE_SHIFT
+.Lpermute:
+       .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+       .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+       .byte           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+       .byte           0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+       .byte           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+       .byte           0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+       .byte           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+       .byte           0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
diff --git a/lib/crypto/arm/chacha-scalar-core.S b/lib/crypto/arm/chacha-scalar-core.S
new file mode 100644 (file)
index 0000000..4951df0
--- /dev/null
@@ -0,0 +1,444 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Google, Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Design notes:
+ *
+ * 16 registers would be needed to hold the state matrix, but only 14 are
+ * available because 'sp' and 'pc' cannot be used.  So we spill the elements
+ * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
+ * 'ldrd' and one 'strd' instruction per round.
+ *
+ * All rotates are performed using the implicit rotate operand accepted by the
+ * 'add' and 'eor' instructions.  This is faster than using explicit rotate
+ * instructions.  To make this work, we allow the values in the second and last
+ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
+ * wrong rotation amount.  The rotation amount is then fixed up just in time
+ * when the values are used.  'brot' is the number of bits the values in row 'b'
+ * need to be rotated right to arrive at the correct values, and 'drot'
+ * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
+ * that they end up as (25, 24) after every round.
+ */
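+
+/*
+ * Concretely, a quarterround step such as "a += b; d ^= a; d = rol(d, 16)"
+ * is issued as
+ *
+ *     add     a, a, b, ror #brot      // use b with its pending rotation applied
+ *     eor     d, a, d, ror #drot      // likewise for d
+ *
+ * and the rol(d, 16) itself is never executed; instead drot becomes
+ * 32 - 16 == 16, to be applied the next time the value in row 'd' is used.
+ */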
+
+       // ChaCha state registers
+       X0      .req    r0
+       X1      .req    r1
+       X2      .req    r2
+       X3      .req    r3
+       X4      .req    r4
+       X5      .req    r5
+       X6      .req    r6
+       X7      .req    r7
+       X8_X10  .req    r8      // shared by x8 and x10
+       X9_X11  .req    r9      // shared by x9 and x11
+       X12     .req    r10
+       X13     .req    r11
+       X14     .req    r12
+       X15     .req    r14
+
+.macro _le32_bswap_4x  a, b, c, d,  tmp
+#ifdef __ARMEB__
+       rev_l           \a,  \tmp
+       rev_l           \b,  \tmp
+       rev_l           \c,  \tmp
+       rev_l           \d,  \tmp
+#endif
+.endm
+
+.macro __ldrd          a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+       ldrd            \a, \b, [\src, #\offset]
+#else
+       ldr             \a, [\src, #\offset]
+       ldr             \b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd          a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+       strd            \a, \b, [\dst, #\offset]
+#else
+       str             \a, [\dst, #\offset]
+       str             \b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _halfround      a1, b1, c1, d1,  a2, b2, c2, d2
+
+       // a += b; d ^= a; d = rol(d, 16);
+       add             \a1, \a1, \b1, ror #brot
+       add             \a2, \a2, \b2, ror #brot
+       eor             \d1, \a1, \d1, ror #drot
+       eor             \d2, \a2, \d2, ror #drot
+       // drot == 32 - 16 == 16
+
+       // c += d; b ^= c; b = rol(b, 12);
+       add             \c1, \c1, \d1, ror #16
+       add             \c2, \c2, \d2, ror #16
+       eor             \b1, \c1, \b1, ror #brot
+       eor             \b2, \c2, \b2, ror #brot
+       // brot == 32 - 12 == 20
+
+       // a += b; d ^= a; d = rol(d, 8);
+       add             \a1, \a1, \b1, ror #20
+       add             \a2, \a2, \b2, ror #20
+       eor             \d1, \a1, \d1, ror #16
+       eor             \d2, \a2, \d2, ror #16
+       // drot == 32 - 8 == 24
+
+       // c += d; b ^= c; b = rol(b, 7);
+       add             \c1, \c1, \d1, ror #24
+       add             \c2, \c2, \d2, ror #24
+       eor             \b1, \c1, \b1, ror #20
+       eor             \b2, \c2, \b2, ror #20
+       // brot == 32 - 7 == 25
+.endm
+
+.macro _doubleround
+
+       // column round
+
+       // quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
+       _halfround      X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
+
+       // save (x8, x9); restore (x10, x11)
+       __strd          X8_X10, X9_X11, sp, 0
+       __ldrd          X8_X10, X9_X11, sp, 8
+
+       // quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
+       _halfround      X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
+
+       .set brot, 25
+       .set drot, 24
+
+       // diagonal round
+
+       // quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
+       _halfround      X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
+
+       // save (x10, x11); restore (x8, x9)
+       __strd          X8_X10, X9_X11, sp, 8
+       __ldrd          X8_X10, X9_X11, sp, 0
+
+       // quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
+       _halfround      X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
+.endm
+
+.macro _chacha_permute nrounds
+       .set brot, 0
+       .set drot, 0
+       .rept \nrounds / 2
+        _doubleround
+       .endr
+.endm
+
+.macro _chacha         nrounds
+
+.Lnext_block\@:
+       // Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
+       // Registers contain x0-x9,x12-x15.
+
+       // Do the core ChaCha permutation to update x0-x15.
+       _chacha_permute \nrounds
+
+       add             sp, #8
+       // Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
+       // Registers contain x0-x9,x12-x15.
+       // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+       // Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
+       push            {X8_X10, X9_X11, X12, X13, X14, X15}
+
+       // Load (OUT, IN, LEN).
+       ldr             r14, [sp, #96]
+       ldr             r12, [sp, #100]
+       ldr             r11, [sp, #104]
+
+       orr             r10, r14, r12
+
+       // Use slow path if fewer than 64 bytes remain.
+       cmp             r11, #64
+       blt             .Lxor_slowpath\@
+
+       // Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
+       // ARMv6+, since ldmia and stmia (used below) still require alignment.
+       tst             r10, #3
+       bne             .Lxor_slowpath\@
+
+       // Fast path: XOR 64 bytes of aligned data.
+
+       // Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+       // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
+       // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+       // x0-x3
+       __ldrd          r8, r9, sp, 32
+       __ldrd          r10, r11, sp, 40
+       add             X0, X0, r8
+       add             X1, X1, r9
+       add             X2, X2, r10
+       add             X3, X3, r11
+       _le32_bswap_4x  X0, X1, X2, X3,  r8
+       ldmia           r12!, {r8-r11}
+       eor             X0, X0, r8
+       eor             X1, X1, r9
+       eor             X2, X2, r10
+       eor             X3, X3, r11
+       stmia           r14!, {X0-X3}
+
+       // x4-x7
+       __ldrd          r8, r9, sp, 48
+       __ldrd          r10, r11, sp, 56
+       add             X4, r8, X4, ror #brot
+       add             X5, r9, X5, ror #brot
+       ldmia           r12!, {X0-X3}
+       add             X6, r10, X6, ror #brot
+       add             X7, r11, X7, ror #brot
+       _le32_bswap_4x  X4, X5, X6, X7,  r8
+       eor             X4, X4, X0
+       eor             X5, X5, X1
+       eor             X6, X6, X2
+       eor             X7, X7, X3
+       stmia           r14!, {X4-X7}
+
+       // x8-x15
+       pop             {r0-r7}                 // (x8-x9,x12-x15,x10-x11)
+       __ldrd          r8, r9, sp, 32
+       __ldrd          r10, r11, sp, 40
+       add             r0, r0, r8              // x8
+       add             r1, r1, r9              // x9
+       add             r6, r6, r10             // x10
+       add             r7, r7, r11             // x11
+       _le32_bswap_4x  r0, r1, r6, r7,  r8
+       ldmia           r12!, {r8-r11}
+       eor             r0, r0, r8              // x8
+       eor             r1, r1, r9              // x9
+       eor             r6, r6, r10             // x10
+       eor             r7, r7, r11             // x11
+       stmia           r14!, {r0,r1,r6,r7}
+       ldmia           r12!, {r0,r1,r6,r7}
+       __ldrd          r8, r9, sp, 48
+       __ldrd          r10, r11, sp, 56
+       add             r2, r8, r2, ror #drot   // x12
+       add             r3, r9, r3, ror #drot   // x13
+       add             r4, r10, r4, ror #drot  // x14
+       add             r5, r11, r5, ror #drot  // x15
+       _le32_bswap_4x  r2, r3, r4, r5,  r9
+         ldr           r9, [sp, #72]           // load LEN
+       eor             r2, r2, r0              // x12
+       eor             r3, r3, r1              // x13
+       eor             r4, r4, r6              // x14
+       eor             r5, r5, r7              // x15
+         subs          r9, #64                 // decrement and check LEN
+       stmia           r14!, {r2-r5}
+
+       beq             .Ldone\@
+
+.Lprepare_for_next_block\@:
+
+       // Stack: x0-x15 OUT IN LEN
+
+       // Increment block counter (x12)
+       add             r8, #1
+
+       // Store updated (OUT, IN, LEN)
+       str             r14, [sp, #64]
+       str             r12, [sp, #68]
+       str             r9, [sp, #72]
+
+         mov           r14, sp
+
+       // Store updated block counter (x12)
+       str             r8, [sp, #48]
+
+         sub           sp, #16
+
+       // Reload state and do next block
+       ldmia           r14!, {r0-r11}          // load x0-x11
+       __strd          r10, r11, sp, 8         // store x10-x11 before state
+       ldmia           r14, {r10-r12,r14}      // load x12-x15
+       b               .Lnext_block\@
+
+.Lxor_slowpath\@:
+       // Slow path: < 64 bytes remaining, or unaligned input or output buffer.
+       // We handle it by storing the 64 bytes of keystream to the stack, then
+       // XOR-ing the needed portion with the data.
+
+       // Allocate keystream buffer
+       sub             sp, #64
+       mov             r14, sp
+
+       // Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+       // Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
+       // x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+       // Save keystream for x0-x3
+       __ldrd          r8, r9, sp, 96
+       __ldrd          r10, r11, sp, 104
+       add             X0, X0, r8
+       add             X1, X1, r9
+       add             X2, X2, r10
+       add             X3, X3, r11
+       _le32_bswap_4x  X0, X1, X2, X3,  r8
+       stmia           r14!, {X0-X3}
+
+       // Save keystream for x4-x7
+       __ldrd          r8, r9, sp, 112
+       __ldrd          r10, r11, sp, 120
+       add             X4, r8, X4, ror #brot
+       add             X5, r9, X5, ror #brot
+       add             X6, r10, X6, ror #brot
+       add             X7, r11, X7, ror #brot
+       _le32_bswap_4x  X4, X5, X6, X7,  r8
+         add           r8, sp, #64
+       stmia           r14!, {X4-X7}
+
+       // Save keystream for x8-x15
+       ldm             r8, {r0-r7}             // (x8-x9,x12-x15,x10-x11)
+       __ldrd          r8, r9, sp, 128
+       __ldrd          r10, r11, sp, 136
+       add             r0, r0, r8              // x8
+       add             r1, r1, r9              // x9
+       add             r6, r6, r10             // x10
+       add             r7, r7, r11             // x11
+       _le32_bswap_4x  r0, r1, r6, r7,  r8
+       stmia           r14!, {r0,r1,r6,r7}
+       __ldrd          r8, r9, sp, 144
+       __ldrd          r10, r11, sp, 152
+       add             r2, r8, r2, ror #drot   // x12
+       add             r3, r9, r3, ror #drot   // x13
+       add             r4, r10, r4, ror #drot  // x14
+       add             r5, r11, r5, ror #drot  // x15
+       _le32_bswap_4x  r2, r3, r4, r5,  r9
+       stmia           r14, {r2-r5}
+
+       // Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
+       // Registers: r8 is block counter, r12 is IN.
+
+       ldr             r9, [sp, #168]          // LEN
+       ldr             r14, [sp, #160]         // OUT
+       cmp             r9, #64
+         mov           r0, sp
+       movle           r1, r9
+       movgt           r1, #64
+       // r1 is number of bytes to XOR, in range [1, 64]
+
+.if __LINUX_ARM_ARCH__ < 6
+       orr             r2, r12, r14
+       tst             r2, #3                  // IN or OUT misaligned?
+       bne             .Lxor_next_byte\@
+.endif
+
+       // XOR a word at a time
+.rept 16
+       subs            r1, #4
+       blt             .Lxor_words_done\@
+       ldr             r2, [r12], #4
+       ldr             r3, [r0], #4
+       eor             r2, r2, r3
+       str             r2, [r14], #4
+.endr
+       b               .Lxor_slowpath_done\@
+.Lxor_words_done\@:
+       ands            r1, r1, #3
+       beq             .Lxor_slowpath_done\@
+
+       // XOR a byte at a time
+.Lxor_next_byte\@:
+       ldrb            r2, [r12], #1
+       ldrb            r3, [r0], #1
+       eor             r2, r2, r3
+       strb            r2, [r14], #1
+       subs            r1, #1
+       bne             .Lxor_next_byte\@
+
+.Lxor_slowpath_done\@:
+       subs            r9, #64
+       add             sp, #96
+       bgt             .Lprepare_for_next_block\@
+
+.Ldone\@:
+.endm  // _chacha
+
+/*
+ * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ *                  const struct chacha_state *state, int nrounds);
+ */
+ENTRY(chacha_doarm)
+       cmp             r2, #0                  // len == 0?
+       reteq           lr
+
+       ldr             ip, [sp]
+       cmp             ip, #12
+
+       push            {r0-r2,r4-r11,lr}
+
+       // Push state x0-x15 onto stack.
+       // Also store an extra copy of x10-x11 just before the state.
+
+       add             X12, r3, #48
+       ldm             X12, {X12,X13,X14,X15}
+       push            {X12,X13,X14,X15}
+       sub             sp, sp, #64
+
+       __ldrd          X8_X10, X9_X11, r3, 40
+       __strd          X8_X10, X9_X11, sp, 8
+       __strd          X8_X10, X9_X11, sp, 56
+       ldm             r3, {X0-X9_X11}
+       __strd          X0, X1, sp, 16
+       __strd          X2, X3, sp, 24
+       __strd          X4, X5, sp, 32
+       __strd          X6, X7, sp, 40
+       __strd          X8_X10, X9_X11, sp, 48
+
+       beq             1f
+       _chacha         20
+
+0:     add             sp, #76
+       pop             {r4-r11, pc}
+
+1:     _chacha         12
+       b               0b
+ENDPROC(chacha_doarm)
+
+/*
+ * void hchacha_block_arm(const struct chacha_state *state,
+ *                       u32 out[HCHACHA_OUT_WORDS], int nrounds);
+ */
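+/*
+ * HChaCha returns words 0-3 and 12-15 of the permuted state and, unlike
+ * ChaCha, does not add the original state back in, so only the permutation
+ * itself is needed here.
+ */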
+ENTRY(hchacha_block_arm)
+       push            {r1,r4-r11,lr}
+
+       cmp             r2, #12                 // ChaCha12 ?
+
+       mov             r14, r0
+       ldmia           r14!, {r0-r11}          // load x0-x11
+       push            {r10-r11}               // store x10-x11 to stack
+       ldm             r14, {r10-r12,r14}      // load x12-x15
+       sub             sp, #8
+
+       beq             1f
+       _chacha_permute 20
+
+       // Skip over (unused0-unused1, x10-x11)
+0:     add             sp, #16
+
+       // Fix up rotations of x12-x15
+       ror             X12, X12, #drot
+       ror             X13, X13, #drot
+         pop           {r4}                    // load 'out'
+       ror             X14, X14, #drot
+       ror             X15, X15, #drot
+
+       // Store (x0-x3,x12-x15) to 'out'
+       stm             r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+       pop             {r4-r11,pc}
+
+1:     _chacha_permute 12
+       b               0b
+ENDPROC(hchacha_block_arm)
diff --git a/lib/crypto/arm/poly1305-armv4.pl b/lib/crypto/arm/poly1305-armv4.pl
new file mode 100644 (file)
index 0000000..d57c6e2
--- /dev/null
@@ -0,0 +1,1236 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+#                      IALU(*)/gcc-4.4         NEON
+#
+# ARM11xx(ARMv6)       7.78/+100%              -
+# Cortex-A5            6.35/+130%              3.00
+# Cortex-A8            6.25/+115%              2.36
+# Cortex-A9            5.10/+95%               2.55
+# Cortex-A15           3.85/+85%               1.25(**)
+# Snapdragon S4                5.70/+100%              1.48(**)
+#
+# (*)  this is for -march=armv6, i.e. with a bunch of ldrb loading data;
+# (**) these are trade-off results; they can be improved by ~8%, but at
+#      the cost of a 15/12% regression on Cortex-A5/A7.  It's even possible
+#      to improve the Cortex-A9 result, but then A5/A7 lose more than 20%;
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
+
+$code.=<<___;
+#ifndef        __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
+# define poly1305_init   poly1305_block_init_arch
+# define poly1305_blocks poly1305_blocks_arm
+# define poly1305_emit   poly1305_emit_arch
+.globl poly1305_blocks_neon
+#endif
+
+#if defined(__thumb2__)
+.syntax        unified
+.thumb
+#else
+.code  32
+#endif
+
+.text
+
+.globl poly1305_emit
+.globl poly1305_blocks
+.globl poly1305_init
+.type  poly1305_init,%function
+.align 5
+poly1305_init:
+.Lpoly1305_init:
+       stmdb   sp!,{r4-r11}
+
+       eor     r3,r3,r3
+       cmp     $inp,#0
+       str     r3,[$ctx,#0]            @ zero hash value
+       str     r3,[$ctx,#4]
+       str     r3,[$ctx,#8]
+       str     r3,[$ctx,#12]
+       str     r3,[$ctx,#16]
+       str     r3,[$ctx,#36]           @ clear is_base2_26
+       add     $ctx,$ctx,#20
+
+#ifdef __thumb2__
+       it      eq
+#endif
+       moveq   r0,#0
+       beq     .Lno_key
+
+#if    __ARM_MAX_ARCH__>=7
+       mov     r3,#-1
+       str     r3,[$ctx,#28]           @ impossible key power value
+# ifndef __KERNEL__
+       adr     r11,.Lpoly1305_init
+       ldr     r12,.LOPENSSL_armcap
+# endif
+#endif
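+       @ Load the 16-byte multiplier part 'r' of the key bytewise and clamp
+       @ it on the fly: the top 4 bits of each word and the low 2 bits of
+       @ r[1..3] are cleared (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff), as
+       @ the Poly1305 definition requires.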
+       ldrb    r4,[$inp,#0]
+       mov     r10,#0x0fffffff
+       ldrb    r5,[$inp,#1]
+       and     r3,r10,#-4              @ 0x0ffffffc
+       ldrb    r6,[$inp,#2]
+       ldrb    r7,[$inp,#3]
+       orr     r4,r4,r5,lsl#8
+       ldrb    r5,[$inp,#4]
+       orr     r4,r4,r6,lsl#16
+       ldrb    r6,[$inp,#5]
+       orr     r4,r4,r7,lsl#24
+       ldrb    r7,[$inp,#6]
+       and     r4,r4,r10
+
+#if    __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+# if !defined(_WIN32)
+       ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
+       ldr     r12,[r12]
+# endif
+#endif
+       ldrb    r8,[$inp,#7]
+       orr     r5,r5,r6,lsl#8
+       ldrb    r6,[$inp,#8]
+       orr     r5,r5,r7,lsl#16
+       ldrb    r7,[$inp,#9]
+       orr     r5,r5,r8,lsl#24
+       ldrb    r8,[$inp,#10]
+       and     r5,r5,r3
+
+#if    __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+       tst     r12,#ARMV7_NEON         @ check for NEON
+# ifdef        __thumb2__
+       adr     r9,.Lpoly1305_blocks_neon
+       adr     r11,.Lpoly1305_blocks
+       it      ne
+       movne   r11,r9
+       adr     r12,.Lpoly1305_emit
+       orr     r11,r11,#1              @ thumb-ify addresses
+       orr     r12,r12,#1
+# else
+       add     r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+       ite     eq
+       addeq   r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+       addne   r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
+# endif
+#endif
+       ldrb    r9,[$inp,#11]
+       orr     r6,r6,r7,lsl#8
+       ldrb    r7,[$inp,#12]
+       orr     r6,r6,r8,lsl#16
+       ldrb    r8,[$inp,#13]
+       orr     r6,r6,r9,lsl#24
+       ldrb    r9,[$inp,#14]
+       and     r6,r6,r3
+
+       ldrb    r10,[$inp,#15]
+       orr     r7,r7,r8,lsl#8
+       str     r4,[$ctx,#0]
+       orr     r7,r7,r9,lsl#16
+       str     r5,[$ctx,#4]
+       orr     r7,r7,r10,lsl#24
+       str     r6,[$ctx,#8]
+       and     r7,r7,r3
+       str     r7,[$ctx,#12]
+#if    __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+       stmia   r2,{r11,r12}            @ fill functions table
+       mov     r0,#1
+#else
+       mov     r0,#0
+#endif
+.Lno_key:
+       ldmia   sp!,{r4-r11}
+#if    __ARM_ARCH__>=5
+       ret                             @ bx    lr
+#else
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
+my ($s1,$s2,$s3)=($r1,$r2,$r3);
+
+$code.=<<___;
+.type  poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+       stmdb   sp!,{r3-r11,lr}
+
+       ands    $len,$len,#-16
+       beq     .Lno_data
+
+       add     $len,$len,$inp          @ end pointer
+       sub     sp,sp,#32
+
+#if __ARM_ARCH__<7
+       ldmia   $ctx,{$h0-$r3}          @ load context
+       add     $ctx,$ctx,#20
+       str     $len,[sp,#16]           @ offload stuff
+       str     $ctx,[sp,#12]
+#else
+       ldr     lr,[$ctx,#36]           @ is_base2_26
+       ldmia   $ctx!,{$h0-$h4}         @ load hash value
+       str     $len,[sp,#16]           @ offload stuff
+       str     $ctx,[sp,#12]
+
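+       @ If the NEON code left the hash in base 2^26 (five 26-bit limbs in
+       @ h0-h4), repack it into four 32-bit words plus a small carry word;
+       @ the is_base2_26 flag tested below selects which form to keep.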
+       adds    $r0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
+       mov     $r1,$h1,lsr#6
+       adcs    $r1,$r1,$h2,lsl#20
+       mov     $r2,$h2,lsr#12
+       adcs    $r2,$r2,$h3,lsl#14
+       mov     $r3,$h3,lsr#18
+       adcs    $r3,$r3,$h4,lsl#8
+       mov     $len,#0
+       teq     lr,#0
+       str     $len,[$ctx,#16]         @ clear is_base2_26
+       adc     $len,$len,$h4,lsr#24
+
+       itttt   ne
+       movne   $h0,$r0                 @ choose between radixes
+       movne   $h1,$r1
+       movne   $h2,$r2
+       movne   $h3,$r3
+       ldmia   $ctx,{$r0-$r3}          @ load key
+       it      ne
+       movne   $h4,$len
+#endif
+
+       mov     lr,$inp
+       cmp     $padbit,#0
+       str     $r1,[sp,#20]
+       str     $r2,[sp,#24]
+       str     $r3,[sp,#28]
+       b       .Loop
+
+.align 4
+.Loop:
+#if __ARM_ARCH__<7
+       ldrb    r0,[lr],#16             @ load input
+# ifdef        __thumb2__
+       it      hi
+# endif
+       addhi   $h4,$h4,#1              @ 1<<128
+       ldrb    r1,[lr,#-15]
+       ldrb    r2,[lr,#-14]
+       ldrb    r3,[lr,#-13]
+       orr     r1,r0,r1,lsl#8
+       ldrb    r0,[lr,#-12]
+       orr     r2,r1,r2,lsl#16
+       ldrb    r1,[lr,#-11]
+       orr     r3,r2,r3,lsl#24
+       ldrb    r2,[lr,#-10]
+       adds    $h0,$h0,r3              @ accumulate input
+
+       ldrb    r3,[lr,#-9]
+       orr     r1,r0,r1,lsl#8
+       ldrb    r0,[lr,#-8]
+       orr     r2,r1,r2,lsl#16
+       ldrb    r1,[lr,#-7]
+       orr     r3,r2,r3,lsl#24
+       ldrb    r2,[lr,#-6]
+       adcs    $h1,$h1,r3
+
+       ldrb    r3,[lr,#-5]
+       orr     r1,r0,r1,lsl#8
+       ldrb    r0,[lr,#-4]
+       orr     r2,r1,r2,lsl#16
+       ldrb    r1,[lr,#-3]
+       orr     r3,r2,r3,lsl#24
+       ldrb    r2,[lr,#-2]
+       adcs    $h2,$h2,r3
+
+       ldrb    r3,[lr,#-1]
+       orr     r1,r0,r1,lsl#8
+       str     lr,[sp,#8]              @ offload input pointer
+       orr     r2,r1,r2,lsl#16
+       add     $s1,$r1,$r1,lsr#2
+       orr     r3,r2,r3,lsl#24
+#else
+       ldr     r0,[lr],#16             @ load input
+       it      hi
+       addhi   $h4,$h4,#1              @ padbit
+       ldr     r1,[lr,#-12]
+       ldr     r2,[lr,#-8]
+       ldr     r3,[lr,#-4]
+# ifdef        __ARMEB__
+       rev     r0,r0
+       rev     r1,r1
+       rev     r2,r2
+       rev     r3,r3
+# endif
+       adds    $h0,$h0,r0              @ accumulate input
+       str     lr,[sp,#8]              @ offload input pointer
+       adcs    $h1,$h1,r1
+       add     $s1,$r1,$r1,lsr#2
+       adcs    $h2,$h2,r2
+#endif
+       add     $s2,$r2,$r2,lsr#2
+       adcs    $h3,$h3,r3
+       add     $s3,$r3,$r3,lsr#2
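+       @ s_i = r_i + (r_i>>2) = 5*(r_i>>2): because the clamp cleared the low
+       @ two bits of r1-r3 and 2^130 == 5 (mod 2^130-5), the high-order
+       @ partial products can be folded into the low words by multiplying
+       @ with s_i instead of r_i.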
+
+       umull   r2,r3,$h1,$r0
+        adc    $h4,$h4,#0
+       umull   r0,r1,$h0,$r0
+       umlal   r2,r3,$h4,$s1
+       umlal   r0,r1,$h3,$s1
+       ldr     $r1,[sp,#20]            @ reload $r1
+       umlal   r2,r3,$h2,$s3
+       umlal   r0,r1,$h1,$s3
+       umlal   r2,r3,$h3,$s2
+       umlal   r0,r1,$h2,$s2
+       umlal   r2,r3,$h0,$r1
+       str     r0,[sp,#0]              @ future $h0
+        mul    r0,$s2,$h4
+       ldr     $r2,[sp,#24]            @ reload $r2
+       adds    r2,r2,r1                @ d1+=d0>>32
+        eor    r1,r1,r1
+       adc     lr,r3,#0                @ future $h2
+       str     r2,[sp,#4]              @ future $h1
+
+       mul     r2,$s3,$h4
+       eor     r3,r3,r3
+       umlal   r0,r1,$h3,$s3
+       ldr     $r3,[sp,#28]            @ reload $r3
+       umlal   r2,r3,$h3,$r0
+       umlal   r0,r1,$h2,$r0
+       umlal   r2,r3,$h2,$r1
+       umlal   r0,r1,$h1,$r1
+       umlal   r2,r3,$h1,$r2
+       umlal   r0,r1,$h0,$r2
+       umlal   r2,r3,$h0,$r3
+       ldr     $h0,[sp,#0]
+       mul     $h4,$r0,$h4
+       ldr     $h1,[sp,#4]
+
+       adds    $h2,lr,r0               @ d2+=d1>>32
+       ldr     lr,[sp,#8]              @ reload input pointer
+       adc     r1,r1,#0
+       adds    $h3,r2,r1               @ d3+=d2>>32
+       ldr     r0,[sp,#16]             @ reload end pointer
+       adc     r3,r3,#0
+       add     $h4,$h4,r3              @ h4+=d3>>32
+
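+       @ Partial reduction: the bits of h4 above the low two have weight
+       @ 2^130 == 5 (mod 2^130-5), so fold them back into h0 as
+       @ 5*(h4>>2) = (h4 & ~3) + ((h4 & ~3)>>2), keeping only h4 & 3.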
+       and     r1,$h4,#-4
+       and     $h4,$h4,#3
+       add     r1,r1,r1,lsr#2          @ *=5
+       adds    $h0,$h0,r1
+       adcs    $h1,$h1,#0
+       adcs    $h2,$h2,#0
+       adcs    $h3,$h3,#0
+       adc     $h4,$h4,#0
+
+       cmp     r0,lr                   @ done yet?
+       bhi     .Loop
+
+       ldr     $ctx,[sp,#12]
+       add     sp,sp,#32
+       stmdb   $ctx,{$h0-$h4}          @ store the result
+
+.Lno_data:
+#if    __ARM_ARCH__>=5
+       ldmia   sp!,{r3-r11,pc}
+#else
+       ldmia   sp!,{r3-r11,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
+my $g4=$ctx;
+
+$code.=<<___;
+.type  poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+       stmdb   sp!,{r4-r11}
+
+       ldmia   $ctx,{$h0-$h4}
+
+#if __ARM_ARCH__>=7
+       ldr     ip,[$ctx,#36]           @ is_base2_26
+
+       adds    $g0,$h0,$h1,lsl#26      @ base 2^26 -> base 2^32
+       mov     $g1,$h1,lsr#6
+       adcs    $g1,$g1,$h2,lsl#20
+       mov     $g2,$h2,lsr#12
+       adcs    $g2,$g2,$h3,lsl#14
+       mov     $g3,$h3,lsr#18
+       adcs    $g3,$g3,$h4,lsl#8
+       mov     $g4,#0
+       adc     $g4,$g4,$h4,lsr#24
+
+       tst     ip,ip
+       itttt   ne
+       movne   $h0,$g0
+       movne   $h1,$g1
+       movne   $h2,$g2
+       movne   $h3,$g3
+       it      ne
+       movne   $h4,$g4
+#endif
+
+       adds    $g0,$h0,#5              @ compare to modulus
+       adcs    $g1,$h1,#0
+       adcs    $g2,$h2,#0
+       adcs    $g3,$h3,#0
+       adc     $g4,$h4,#0
+       tst     $g4,#4                  @ did it carry/borrow?
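+       @ g = h + 5: if h >= 2^130-5, bit 2 of $g4 (i.e. bit 130 of g) is set
+       @ and g - 2^130 = h - (2^130-5) is the fully reduced value, so g is
+       @ selected below; otherwise h is already fully reduced and is kept.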
+
+#ifdef __thumb2__
+       it      ne
+#endif
+       movne   $h0,$g0
+       ldr     $g0,[$nonce,#0]
+#ifdef __thumb2__
+       it      ne
+#endif
+       movne   $h1,$g1
+       ldr     $g1,[$nonce,#4]
+#ifdef __thumb2__
+       it      ne
+#endif
+       movne   $h2,$g2
+       ldr     $g2,[$nonce,#8]
+#ifdef __thumb2__
+       it      ne
+#endif
+       movne   $h3,$g3
+       ldr     $g3,[$nonce,#12]
+
+       adds    $h0,$h0,$g0
+       adcs    $h1,$h1,$g1
+       adcs    $h2,$h2,$g2
+       adc     $h3,$h3,$g3
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+       rev     $h0,$h0
+       rev     $h1,$h1
+       rev     $h2,$h2
+       rev     $h3,$h3
+# endif
+       str     $h0,[$mac,#0]
+       str     $h1,[$mac,#4]
+       str     $h2,[$mac,#8]
+       str     $h3,[$mac,#12]
+#else
+       strb    $h0,[$mac,#0]
+       mov     $h0,$h0,lsr#8
+       strb    $h1,[$mac,#4]
+       mov     $h1,$h1,lsr#8
+       strb    $h2,[$mac,#8]
+       mov     $h2,$h2,lsr#8
+       strb    $h3,[$mac,#12]
+       mov     $h3,$h3,lsr#8
+
+       strb    $h0,[$mac,#1]
+       mov     $h0,$h0,lsr#8
+       strb    $h1,[$mac,#5]
+       mov     $h1,$h1,lsr#8
+       strb    $h2,[$mac,#9]
+       mov     $h2,$h2,lsr#8
+       strb    $h3,[$mac,#13]
+       mov     $h3,$h3,lsr#8
+
+       strb    $h0,[$mac,#2]
+       mov     $h0,$h0,lsr#8
+       strb    $h1,[$mac,#6]
+       mov     $h1,$h1,lsr#8
+       strb    $h2,[$mac,#10]
+       mov     $h2,$h2,lsr#8
+       strb    $h3,[$mac,#14]
+       mov     $h3,$h3,lsr#8
+
+       strb    $h0,[$mac,#3]
+       strb    $h1,[$mac,#7]
+       strb    $h2,[$mac,#11]
+       strb    $h3,[$mac,#15]
+#endif
+       ldmia   sp!,{r4-r11}
+#if    __ARM_ARCH__>=5
+       ret                             @ bx    lr
+#else
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  poly1305_emit,.-poly1305_emit
+___
+{
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
+
+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
+
+$code.=<<___;
+#if    __ARM_MAX_ARCH__>=7
+.fpu   neon
+
+.type  poly1305_init_neon,%function
+.align 5
+poly1305_init_neon:
+.Lpoly1305_init_neon:
+       ldr     r3,[$ctx,#48]           @ first table element
+       cmp     r3,#-1                  @ is value impossible?
+       bne     .Lno_init_neon
+
+       ldr     r4,[$ctx,#20]           @ load key base 2^32
+       ldr     r5,[$ctx,#24]
+       ldr     r6,[$ctx,#28]
+       ldr     r7,[$ctx,#32]
+
+       and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
+       mov     r3,r4,lsr#26
+       mov     r4,r5,lsr#20
+       orr     r3,r3,r5,lsl#6
+       mov     r5,r6,lsr#14
+       orr     r4,r4,r6,lsl#12
+       mov     r6,r7,lsr#8
+       orr     r5,r5,r7,lsl#18
+       and     r3,r3,#0x03ffffff
+       and     r4,r4,#0x03ffffff
+       and     r5,r5,#0x03ffffff
+
+       vdup.32 $R0,r2                  @ r^1 in both lanes
+       add     r2,r3,r3,lsl#2          @ *5
+       vdup.32 $R1,r3
+       add     r3,r4,r4,lsl#2
+       vdup.32 $S1,r2
+       vdup.32 $R2,r4
+       add     r4,r5,r5,lsl#2
+       vdup.32 $S2,r3
+       vdup.32 $R3,r5
+       add     r5,r6,r6,lsl#2
+       vdup.32 $S3,r4
+       vdup.32 $R4,r6
+       vdup.32 $S4,r5
+
+       mov     $zeros,#2               @ counter
+
+.Lsquare_neon:
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+       @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
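+       @ (the 5*r_i factors implement the wraparound 2^130 == 5 mod 2^130-5
+       @ for partial products whose weight reaches or exceeds 2^130)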
+
+       vmull.u32       $D0,$R0,${R0}[1]
+       vmull.u32       $D1,$R1,${R0}[1]
+       vmull.u32       $D2,$R2,${R0}[1]
+       vmull.u32       $D3,$R3,${R0}[1]
+       vmull.u32       $D4,$R4,${R0}[1]
+
+       vmlal.u32       $D0,$R4,${S1}[1]
+       vmlal.u32       $D1,$R0,${R1}[1]
+       vmlal.u32       $D2,$R1,${R1}[1]
+       vmlal.u32       $D3,$R2,${R1}[1]
+       vmlal.u32       $D4,$R3,${R1}[1]
+
+       vmlal.u32       $D0,$R3,${S2}[1]
+       vmlal.u32       $D1,$R4,${S2}[1]
+       vmlal.u32       $D3,$R1,${R2}[1]
+       vmlal.u32       $D2,$R0,${R2}[1]
+       vmlal.u32       $D4,$R2,${R2}[1]
+
+       vmlal.u32       $D0,$R2,${S3}[1]
+       vmlal.u32       $D3,$R0,${R3}[1]
+       vmlal.u32       $D1,$R3,${S3}[1]
+       vmlal.u32       $D2,$R4,${S3}[1]
+       vmlal.u32       $D4,$R1,${R3}[1]
+
+       vmlal.u32       $D3,$R4,${S4}[1]
+       vmlal.u32       $D0,$R1,${S4}[1]
+       vmlal.u32       $D1,$R2,${S4}[1]
+       vmlal.u32       $D2,$R3,${S4}[1]
+       vmlal.u32       $D4,$R0,${R4}[1]
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+       @ and P. Schwabe
+       @
+       @ H0>>+H1>>+H2>>+H3>>+H4
+       @ H3>>+H4>>*5+H0>>+H1
+       @
+       @ Trivia.
+       @
+       @ Result of multiplication of n-bit number by m-bit number is
+       @ n+m bits wide. However! Even though 2^n is an n+1-bit number,
+       @ m-bit number multiplied by 2^n is still n+m bits wide.
+       @
+       @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+       @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+       @ one is n+1 bits wide.
+       @
+       @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+       @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+       @ can be 27. However! In cases when their width exceeds 26 bits
+       @ they are limited by 2^26+2^6. This in turn means that *sum*
+       @ of the products with these values can still be viewed as sum
+       @ of 52-bit numbers as long as the amount of addends is not a
+       @ power of 2. For example,
+       @
+       @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+       @
+       @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+       @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+       @ 8 * (2^52) or 2^55. However, the value is then multiplied by 5,
+       @ so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+       @ which is less than 32 * (2^52) or 2^57. And when processing
+       @ data we are looking at triple as many addends...
+       @
+       @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+       @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+       @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+       @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+       @ instruction accepts 2x32-bit input and writes 2x64-bit result.
+       @ This means that the result of reduction has to be compressed upon
+       @ loop wrap-around. This can be done in the process of reduction
+       @ to minimize amount of instructions [as well as amount of
+       @ 128-bit instructions, which benefits low-end processors], but
+       @ one has to watch for H2 (which is narrower than H0) and 5*H4
+       @ not being wider than 58 bits, so that result of right shift
+       @ by 26 bits fits in 32 bits. This is also useful on x86,
+       @ because it allows using paddd in place of paddq, which
+       @ benefits Atom, where paddq is ridiculously slow.
+
+       vshr.u64        $T0,$D3,#26
+       vmovn.i64       $D3#lo,$D3
+        vshr.u64       $T1,$D0,#26
+        vmovn.i64      $D0#lo,$D0
+       vadd.i64        $D4,$D4,$T0             @ h3 -> h4
+       vbic.i32        $D3#lo,#0xfc000000      @ &=0x03ffffff
+        vadd.i64       $D1,$D1,$T1             @ h0 -> h1
+        vbic.i32       $D0#lo,#0xfc000000
+
+       vshrn.u64       $T0#lo,$D4,#26
+       vmovn.i64       $D4#lo,$D4
+        vshr.u64       $T1,$D1,#26
+        vmovn.i64      $D1#lo,$D1
+        vadd.i64       $D2,$D2,$T1             @ h1 -> h2
+       vbic.i32        $D4#lo,#0xfc000000
+        vbic.i32       $D1#lo,#0xfc000000
+
+       vadd.i32        $D0#lo,$D0#lo,$T0#lo
+       vshl.u32        $T0#lo,$T0#lo,#2
+        vshrn.u64      $T1#lo,$D2,#26
+        vmovn.i64      $D2#lo,$D2
+       vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
+        vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
+        vbic.i32       $D2#lo,#0xfc000000
+
+       vshr.u32        $T0#lo,$D0#lo,#26
+       vbic.i32        $D0#lo,#0xfc000000
+        vshr.u32       $T1#lo,$D3#lo,#26
+        vbic.i32       $D3#lo,#0xfc000000
+       vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
+        vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
+
+       subs            $zeros,$zeros,#1
+       beq             .Lsquare_break_neon
+
+       add             $tbl0,$ctx,#(48+0*9*4)
+       add             $tbl1,$ctx,#(48+1*9*4)
+
+       vtrn.32         $R0,$D0#lo              @ r^2:r^1
+       vtrn.32         $R2,$D2#lo
+       vtrn.32         $R3,$D3#lo
+       vtrn.32         $R1,$D1#lo
+       vtrn.32         $R4,$D4#lo
+
+       vshl.u32        $S2,$R2,#2              @ *5
+       vshl.u32        $S3,$R3,#2
+       vshl.u32        $S1,$R1,#2
+       vshl.u32        $S4,$R4,#2
+       vadd.i32        $S2,$S2,$R2
+       vadd.i32        $S1,$S1,$R1
+       vadd.i32        $S3,$S3,$R3
+       vadd.i32        $S4,$S4,$R4
+
+       vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+       vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+       vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+       vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+       vst1.32         {${S4}[0]},[$tbl0,:32]
+       vst1.32         {${S4}[1]},[$tbl1,:32]
+
+       b               .Lsquare_neon
+
+.align 4
+.Lsquare_break_neon:
+       add             $tbl0,$ctx,#(48+2*4*9)
+       add             $tbl1,$ctx,#(48+3*4*9)
+
+       vmov            $R0,$D0#lo              @ r^4:r^3
+       vshl.u32        $S1,$D1#lo,#2           @ *5
+       vmov            $R1,$D1#lo
+       vshl.u32        $S2,$D2#lo,#2
+       vmov            $R2,$D2#lo
+       vshl.u32        $S3,$D3#lo,#2
+       vmov            $R3,$D3#lo
+       vshl.u32        $S4,$D4#lo,#2
+       vmov            $R4,$D4#lo
+       vadd.i32        $S1,$S1,$D1#lo
+       vadd.i32        $S2,$S2,$D2#lo
+       vadd.i32        $S3,$S3,$D3#lo
+       vadd.i32        $S4,$S4,$D4#lo
+
+       vst4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+       vst4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+       vst4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+       vst4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+       vst1.32         {${S4}[0]},[$tbl0]
+       vst1.32         {${S4}[1]},[$tbl1]
+
+.Lno_init_neon:
+       ret                             @ bx    lr
+.size  poly1305_init_neon,.-poly1305_init_neon
+
+.type  poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+       ldr     ip,[$ctx,#36]           @ is_base2_26
+
+       cmp     $len,#64
+       blo     .Lpoly1305_blocks
+
+       stmdb   sp!,{r4-r7}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+
+       tst     ip,ip                   @ is_base2_26?
+       bne     .Lbase2_26_neon
+
+       stmdb   sp!,{r1-r3,lr}
+       bl      .Lpoly1305_init_neon
+
+       ldr     r4,[$ctx,#0]            @ load hash value base 2^32
+       ldr     r5,[$ctx,#4]
+       ldr     r6,[$ctx,#8]
+       ldr     r7,[$ctx,#12]
+       ldr     ip,[$ctx,#16]
+
+       and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
+       mov     r3,r4,lsr#26
+        veor   $D0#lo,$D0#lo,$D0#lo
+       mov     r4,r5,lsr#20
+       orr     r3,r3,r5,lsl#6
+        veor   $D1#lo,$D1#lo,$D1#lo
+       mov     r5,r6,lsr#14
+       orr     r4,r4,r6,lsl#12
+        veor   $D2#lo,$D2#lo,$D2#lo
+       mov     r6,r7,lsr#8
+       orr     r5,r5,r7,lsl#18
+        veor   $D3#lo,$D3#lo,$D3#lo
+       and     r3,r3,#0x03ffffff
+       orr     r6,r6,ip,lsl#24
+        veor   $D4#lo,$D4#lo,$D4#lo
+       and     r4,r4,#0x03ffffff
+       mov     r1,#1
+       and     r5,r5,#0x03ffffff
+       str     r1,[$ctx,#36]           @ set is_base2_26
+
+       vmov.32 $D0#lo[0],r2
+       vmov.32 $D1#lo[0],r3
+       vmov.32 $D2#lo[0],r4
+       vmov.32 $D3#lo[0],r5
+       vmov.32 $D4#lo[0],r6
+       adr     $zeros,.Lzeros
+
+       ldmia   sp!,{r1-r3,lr}
+       b       .Lhash_loaded
+
+.align 4
+.Lbase2_26_neon:
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ load hash value
+
+       veor            $D0#lo,$D0#lo,$D0#lo
+       veor            $D1#lo,$D1#lo,$D1#lo
+       veor            $D2#lo,$D2#lo,$D2#lo
+       veor            $D3#lo,$D3#lo,$D3#lo
+       veor            $D4#lo,$D4#lo,$D4#lo
+       vld4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+       adr             $zeros,.Lzeros
+       vld1.32         {$D4#lo[0]},[$ctx]
+       sub             $ctx,$ctx,#16           @ rewind
+
+.Lhash_loaded:
+       add             $in2,$inp,#32
+       mov             $padbit,$padbit,lsl#24
+       tst             $len,#31
+       beq             .Leven
+
+       vld4.32         {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
+       vmov.32         $H4#lo[0],$padbit
+       sub             $len,$len,#16
+       add             $in2,$inp,#32
+
+# ifdef        __ARMEB__
+       vrev32.8        $H0,$H0
+       vrev32.8        $H3,$H3
+       vrev32.8        $H1,$H1
+       vrev32.8        $H2,$H2
+# endif
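+       @ Split the four 32-bit message words (plus the 2^128 padbit already
+       @ placed in $H4) into five 26-bit limbs, i.e. convert to the base 2^26
+       @ representation used by the NEON multiply below.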
+       vsri.u32        $H4#lo,$H3#lo,#8        @ base 2^32 -> base 2^26
+       vshl.u32        $H3#lo,$H3#lo,#18
+
+       vsri.u32        $H3#lo,$H2#lo,#14
+       vshl.u32        $H2#lo,$H2#lo,#12
+       vadd.i32        $H4#hi,$H4#lo,$D4#lo    @ add hash value and move to #hi
+
+       vbic.i32        $H3#lo,#0xfc000000
+       vsri.u32        $H2#lo,$H1#lo,#20
+       vshl.u32        $H1#lo,$H1#lo,#6
+
+       vbic.i32        $H2#lo,#0xfc000000
+       vsri.u32        $H1#lo,$H0#lo,#26
+       vadd.i32        $H3#hi,$H3#lo,$D3#lo
+
+       vbic.i32        $H0#lo,#0xfc000000
+       vbic.i32        $H1#lo,#0xfc000000
+       vadd.i32        $H2#hi,$H2#lo,$D2#lo
+
+       vadd.i32        $H0#hi,$H0#lo,$D0#lo
+       vadd.i32        $H1#hi,$H1#lo,$D1#lo
+
+       mov             $tbl1,$zeros
+       add             $tbl0,$ctx,#48
+
+       cmp             $len,$len
+       b               .Long_tail
+
+.align 4
+.Leven:
+       subs            $len,$len,#64
+       it              lo
+       movlo           $in2,$zeros
+
+       vmov.i32        $H4,#1<<24              @ padbit, yes, always
+       vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
+       add             $inp,$inp,#64
+       vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
+       add             $in2,$in2,#64
+       itt             hi
+       addhi           $tbl1,$ctx,#(48+1*9*4)
+       addhi           $tbl0,$ctx,#(48+3*9*4)
+
+# ifdef        __ARMEB__
+       vrev32.8        $H0,$H0
+       vrev32.8        $H3,$H3
+       vrev32.8        $H1,$H1
+       vrev32.8        $H2,$H2
+# endif
+       vsri.u32        $H4,$H3,#8              @ base 2^32 -> base 2^26
+       vshl.u32        $H3,$H3,#18
+
+       vsri.u32        $H3,$H2,#14
+       vshl.u32        $H2,$H2,#12
+
+       vbic.i32        $H3,#0xfc000000
+       vsri.u32        $H2,$H1,#20
+       vshl.u32        $H1,$H1,#6
+
+       vbic.i32        $H2,#0xfc000000
+       vsri.u32        $H1,$H0,#26
+
+       vbic.i32        $H0,#0xfc000000
+       vbic.i32        $H1,#0xfc000000
+
+       bls             .Lskip_loop
+
+       vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^2
+       vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
+       vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+       vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+       b               .Loop_neon
+
+.align 5
+.Loop_neon:
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+       @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+       @   \___________________/
+       @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+       @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+       @   \___________________/ \____________________/
+       @
+       @ Note that we start with inp[2:3]*r^2. This is because it
+       @ doesn't depend on reduction in previous iteration.
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+       @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+       @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+       @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+       @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ inp[2:3]*r^2
+
+       vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ accumulate inp[0:1]
+       vmull.u32       $D2,$H2#hi,${R0}[1]
+       vadd.i32        $H0#lo,$H0#lo,$D0#lo
+       vmull.u32       $D0,$H0#hi,${R0}[1]
+       vadd.i32        $H3#lo,$H3#lo,$D3#lo
+       vmull.u32       $D3,$H3#hi,${R0}[1]
+       vmlal.u32       $D2,$H1#hi,${R1}[1]
+       vadd.i32        $H1#lo,$H1#lo,$D1#lo
+       vmull.u32       $D1,$H1#hi,${R0}[1]
+
+       vadd.i32        $H4#lo,$H4#lo,$D4#lo
+       vmull.u32       $D4,$H4#hi,${R0}[1]
+       subs            $len,$len,#64
+       vmlal.u32       $D0,$H4#hi,${S1}[1]
+       it              lo
+       movlo           $in2,$zeros
+       vmlal.u32       $D3,$H2#hi,${R1}[1]
+       vld1.32         ${S4}[1],[$tbl1,:32]
+       vmlal.u32       $D1,$H0#hi,${R1}[1]
+       vmlal.u32       $D4,$H3#hi,${R1}[1]
+
+       vmlal.u32       $D0,$H3#hi,${S2}[1]
+       vmlal.u32       $D3,$H1#hi,${R2}[1]
+       vmlal.u32       $D4,$H2#hi,${R2}[1]
+       vmlal.u32       $D1,$H4#hi,${S2}[1]
+       vmlal.u32       $D2,$H0#hi,${R2}[1]
+
+       vmlal.u32       $D3,$H0#hi,${R3}[1]
+       vmlal.u32       $D0,$H2#hi,${S3}[1]
+       vmlal.u32       $D4,$H1#hi,${R3}[1]
+       vmlal.u32       $D1,$H3#hi,${S3}[1]
+       vmlal.u32       $D2,$H4#hi,${S3}[1]
+
+       vmlal.u32       $D3,$H4#hi,${S4}[1]
+       vmlal.u32       $D0,$H1#hi,${S4}[1]
+       vmlal.u32       $D4,$H0#hi,${R4}[1]
+       vmlal.u32       $D1,$H2#hi,${S4}[1]
+       vmlal.u32       $D2,$H3#hi,${S4}[1]
+
+       vld4.32         {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]    @ inp[2:3] (or 0)
+       add             $in2,$in2,#64
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ (hash+inp[0:1])*r^4 and accumulate
+
+       vmlal.u32       $D3,$H3#lo,${R0}[0]
+       vmlal.u32       $D0,$H0#lo,${R0}[0]
+       vmlal.u32       $D4,$H4#lo,${R0}[0]
+       vmlal.u32       $D1,$H1#lo,${R0}[0]
+       vmlal.u32       $D2,$H2#lo,${R0}[0]
+       vld1.32         ${S4}[0],[$tbl0,:32]
+
+       vmlal.u32       $D3,$H2#lo,${R1}[0]
+       vmlal.u32       $D0,$H4#lo,${S1}[0]
+       vmlal.u32       $D4,$H3#lo,${R1}[0]
+       vmlal.u32       $D1,$H0#lo,${R1}[0]
+       vmlal.u32       $D2,$H1#lo,${R1}[0]
+
+       vmlal.u32       $D3,$H1#lo,${R2}[0]
+       vmlal.u32       $D0,$H3#lo,${S2}[0]
+       vmlal.u32       $D4,$H2#lo,${R2}[0]
+       vmlal.u32       $D1,$H4#lo,${S2}[0]
+       vmlal.u32       $D2,$H0#lo,${R2}[0]
+
+       vmlal.u32       $D3,$H0#lo,${R3}[0]
+       vmlal.u32       $D0,$H2#lo,${S3}[0]
+       vmlal.u32       $D4,$H1#lo,${R3}[0]
+       vmlal.u32       $D1,$H3#lo,${S3}[0]
+       vmlal.u32       $D3,$H4#lo,${S4}[0]
+
+       vmlal.u32       $D2,$H4#lo,${S3}[0]
+       vmlal.u32       $D0,$H1#lo,${S4}[0]
+       vmlal.u32       $D4,$H0#lo,${R4}[0]
+       vmov.i32        $H4,#1<<24              @ padbit, yes, always
+       vmlal.u32       $D1,$H2#lo,${S4}[0]
+       vmlal.u32       $D2,$H3#lo,${S4}[0]
+
+       vld4.32         {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]    @ inp[0:1]
+       add             $inp,$inp,#64
+# ifdef        __ARMEB__
+       vrev32.8        $H0,$H0
+       vrev32.8        $H1,$H1
+       vrev32.8        $H2,$H2
+       vrev32.8        $H3,$H3
+# endif
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+       @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
+
+       vshr.u64        $T0,$D3,#26
+       vmovn.i64       $D3#lo,$D3
+        vshr.u64       $T1,$D0,#26
+        vmovn.i64      $D0#lo,$D0
+       vadd.i64        $D4,$D4,$T0             @ h3 -> h4
+       vbic.i32        $D3#lo,#0xfc000000
+         vsri.u32      $H4,$H3,#8              @ base 2^32 -> base 2^26
+        vadd.i64       $D1,$D1,$T1             @ h0 -> h1
+         vshl.u32      $H3,$H3,#18
+        vbic.i32       $D0#lo,#0xfc000000
+
+       vshrn.u64       $T0#lo,$D4,#26
+       vmovn.i64       $D4#lo,$D4
+        vshr.u64       $T1,$D1,#26
+        vmovn.i64      $D1#lo,$D1
+        vadd.i64       $D2,$D2,$T1             @ h1 -> h2
+         vsri.u32      $H3,$H2,#14
+       vbic.i32        $D4#lo,#0xfc000000
+         vshl.u32      $H2,$H2,#12
+        vbic.i32       $D1#lo,#0xfc000000
+
+       vadd.i32        $D0#lo,$D0#lo,$T0#lo
+       vshl.u32        $T0#lo,$T0#lo,#2
+         vbic.i32      $H3,#0xfc000000
+        vshrn.u64      $T1#lo,$D2,#26
+        vmovn.i64      $D2#lo,$D2
+       vaddl.u32       $D0,$D0#lo,$T0#lo       @ h4 -> h0 [widen for a sec]
+         vsri.u32      $H2,$H1,#20
+        vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
+         vshl.u32      $H1,$H1,#6
+        vbic.i32       $D2#lo,#0xfc000000
+         vbic.i32      $H2,#0xfc000000
+
+       vshrn.u64       $T0#lo,$D0,#26          @ re-narrow
+       vmovn.i64       $D0#lo,$D0
+         vsri.u32      $H1,$H0,#26
+         vbic.i32      $H0,#0xfc000000
+        vshr.u32       $T1#lo,$D3#lo,#26
+        vbic.i32       $D3#lo,#0xfc000000
+       vbic.i32        $D0#lo,#0xfc000000
+       vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
+        vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
+         vbic.i32      $H1,#0xfc000000
+
+       bhi             .Loop_neon
+
+.Lskip_loop:
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+       add             $tbl1,$ctx,#(48+0*9*4)
+       add             $tbl0,$ctx,#(48+1*9*4)
+       adds            $len,$len,#32
+       it              ne
+       movne           $len,#0
+       bne             .Long_tail
+
+       vadd.i32        $H2#hi,$H2#lo,$D2#lo    @ add hash value and move to #hi
+       vadd.i32        $H0#hi,$H0#lo,$D0#lo
+       vadd.i32        $H3#hi,$H3#lo,$D3#lo
+       vadd.i32        $H1#hi,$H1#lo,$D1#lo
+       vadd.i32        $H4#hi,$H4#lo,$D4#lo
+
+.Long_tail:
+       vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^1
+       vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^2
+
+       vadd.i32        $H2#lo,$H2#lo,$D2#lo    @ can be redundant
+       vmull.u32       $D2,$H2#hi,$R0
+       vadd.i32        $H0#lo,$H0#lo,$D0#lo
+       vmull.u32       $D0,$H0#hi,$R0
+       vadd.i32        $H3#lo,$H3#lo,$D3#lo
+       vmull.u32       $D3,$H3#hi,$R0
+       vadd.i32        $H1#lo,$H1#lo,$D1#lo
+       vmull.u32       $D1,$H1#hi,$R0
+       vadd.i32        $H4#lo,$H4#lo,$D4#lo
+       vmull.u32       $D4,$H4#hi,$R0
+
+       vmlal.u32       $D0,$H4#hi,$S1
+       vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+       vmlal.u32       $D3,$H2#hi,$R1
+       vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+       vmlal.u32       $D1,$H0#hi,$R1
+       vmlal.u32       $D4,$H3#hi,$R1
+       vmlal.u32       $D2,$H1#hi,$R1
+
+       vmlal.u32       $D3,$H1#hi,$R2
+       vld1.32         ${S4}[1],[$tbl1,:32]
+       vmlal.u32       $D0,$H3#hi,$S2
+       vld1.32         ${S4}[0],[$tbl0,:32]
+       vmlal.u32       $D4,$H2#hi,$R2
+       vmlal.u32       $D1,$H4#hi,$S2
+       vmlal.u32       $D2,$H0#hi,$R2
+
+       vmlal.u32       $D3,$H0#hi,$R3
+        it             ne
+        addne          $tbl1,$ctx,#(48+2*9*4)
+       vmlal.u32       $D0,$H2#hi,$S3
+        it             ne
+        addne          $tbl0,$ctx,#(48+3*9*4)
+       vmlal.u32       $D4,$H1#hi,$R3
+       vmlal.u32       $D1,$H3#hi,$S3
+       vmlal.u32       $D2,$H4#hi,$S3
+
+       vmlal.u32       $D3,$H4#hi,$S4
+        vorn           $MASK,$MASK,$MASK       @ all-ones, can be redundant
+       vmlal.u32       $D0,$H1#hi,$S4
+        vshr.u64       $MASK,$MASK,#38
+       vmlal.u32       $D4,$H0#hi,$R4
+       vmlal.u32       $D1,$H2#hi,$S4
+       vmlal.u32       $D2,$H3#hi,$S4
+
+       beq             .Lshort_tail
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+       vld4.32         {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!  @ load r^3
+       vld4.32         {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!  @ load r^4
+
+       vmlal.u32       $D2,$H2#lo,$R0
+       vmlal.u32       $D0,$H0#lo,$R0
+       vmlal.u32       $D3,$H3#lo,$R0
+       vmlal.u32       $D1,$H1#lo,$R0
+       vmlal.u32       $D4,$H4#lo,$R0
+
+       vmlal.u32       $D0,$H4#lo,$S1
+       vld4.32         {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+       vmlal.u32       $D3,$H2#lo,$R1
+       vld4.32         {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+       vmlal.u32       $D1,$H0#lo,$R1
+       vmlal.u32       $D4,$H3#lo,$R1
+       vmlal.u32       $D2,$H1#lo,$R1
+
+       vmlal.u32       $D3,$H1#lo,$R2
+       vld1.32         ${S4}[1],[$tbl1,:32]
+       vmlal.u32       $D0,$H3#lo,$S2
+       vld1.32         ${S4}[0],[$tbl0,:32]
+       vmlal.u32       $D4,$H2#lo,$R2
+       vmlal.u32       $D1,$H4#lo,$S2
+       vmlal.u32       $D2,$H0#lo,$R2
+
+       vmlal.u32       $D3,$H0#lo,$R3
+       vmlal.u32       $D0,$H2#lo,$S3
+       vmlal.u32       $D4,$H1#lo,$R3
+       vmlal.u32       $D1,$H3#lo,$S3
+       vmlal.u32       $D2,$H4#lo,$S3
+
+       vmlal.u32       $D3,$H4#lo,$S4
+        vorn           $MASK,$MASK,$MASK       @ all-ones
+       vmlal.u32       $D0,$H1#lo,$S4
+        vshr.u64       $MASK,$MASK,#38
+       vmlal.u32       $D4,$H0#lo,$R4
+       vmlal.u32       $D1,$H2#lo,$S4
+       vmlal.u32       $D2,$H3#lo,$S4
+
+.Lshort_tail:
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ horizontal addition
+
+       vadd.i64        $D3#lo,$D3#lo,$D3#hi
+       vadd.i64        $D0#lo,$D0#lo,$D0#hi
+       vadd.i64        $D4#lo,$D4#lo,$D4#hi
+       vadd.i64        $D1#lo,$D1#lo,$D1#hi
+       vadd.i64        $D2#lo,$D2#lo,$D2#hi
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ lazy reduction, but without narrowing
+
+       vshr.u64        $T0,$D3,#26
+       vand.i64        $D3,$D3,$MASK
+        vshr.u64       $T1,$D0,#26
+        vand.i64       $D0,$D0,$MASK
+       vadd.i64        $D4,$D4,$T0             @ h3 -> h4
+        vadd.i64       $D1,$D1,$T1             @ h0 -> h1
+
+       vshr.u64        $T0,$D4,#26
+       vand.i64        $D4,$D4,$MASK
+        vshr.u64       $T1,$D1,#26
+        vand.i64       $D1,$D1,$MASK
+        vadd.i64       $D2,$D2,$T1             @ h1 -> h2
+
+       vadd.i64        $D0,$D0,$T0
+       vshl.u64        $T0,$T0,#2
+        vshr.u64       $T1,$D2,#26
+        vand.i64       $D2,$D2,$MASK
+       vadd.i64        $D0,$D0,$T0             @ h4 -> h0
+        vadd.i64       $D3,$D3,$T1             @ h2 -> h3
+
+       vshr.u64        $T0,$D0,#26
+       vand.i64        $D0,$D0,$MASK
+        vshr.u64       $T1,$D3,#26
+        vand.i64       $D3,$D3,$MASK
+       vadd.i64        $D1,$D1,$T0             @ h0 -> h1
+        vadd.i64       $D4,$D4,$T1             @ h3 -> h4
+
+       cmp             $len,#0
+       bne             .Leven
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ store hash value
+
+       vst4.32         {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+       vst1.32         {$D4#lo[0]},[$ctx]
+
+       vldmia  sp!,{d8-d15}                    @ epilogue
+       ldmia   sp!,{r4-r7}
+       ret                                     @ bx    lr
+.size  poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align 5
+.Lzeros:
+.long  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+#ifndef        __KERNEL__
+.LOPENSSL_armcap:
+# ifdef        _WIN32
+.word  OPENSSL_armcap_P
+# else
+.word  OPENSSL_armcap_P-.Lpoly1305_init
+# endif
+.comm  OPENSSL_armcap_P,4,4
+.hidden        OPENSSL_armcap_P
+#endif
+#endif
+___
+}      }
+$code.=<<___;
+.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
+.align 2
+___
+
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
+       s/\bret\b/bx    lr/go                                           or
+       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
+
+       print $_,"\n";
+}
+close STDOUT; # enforce flush
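
For readers following the "base 2^32 -> base 2^26" conversion and the d0..d4
comment block above, the scalar C sketch below spells out the same arithmetic
in plain form. It is illustrative only and not part of the moved file: the
helper names are made up, the carry chain is a serial version of the lazy
reduction rather than the exact NEON interleaving, and the block load assumes
a little-endian host as on ARM LE.

/* Illustrative sketch only, not kernel code. */
#include <stdint.h>
#include <string.h>

/* Split one 16-byte block (plus the pad bit) into five 26-bit limbs,
 * mirroring the vsri/vshl "base 2^32 -> base 2^26" sequence above. */
static void poly1305_limbs_from_block(uint32_t h[5], const uint8_t m[16],
				      uint32_t padbit)
{
	uint32_t m0, m1, m2, m3;

	memcpy(&m0, m +  0, 4);		/* assumes a little-endian host */
	memcpy(&m1, m +  4, 4);
	memcpy(&m2, m +  8, 4);
	memcpy(&m3, m + 12, 4);

	h[0] = m0 & 0x3ffffff;
	h[1] = ((m0 >> 26) | (m1 << 6)) & 0x3ffffff;
	h[2] = ((m1 >> 20) | (m2 << 12)) & 0x3ffffff;
	h[3] = ((m2 >> 14) | (m3 << 18)) & 0x3ffffff;
	h[4] = (m3 >> 8) | (padbit << 24);	/* the vmov.i32 #1<<24 pad bit */
}

/* h *= r (mod 2^130 - 5), with s[i] = 5*r[i], following the d0..d4 comment
 * block, then a serial version of the lazy carry chain. */
static void poly1305_mul_mod_p(uint32_t h[5], const uint32_t r[5],
			       const uint32_t s[5])
{
	uint64_t d[5], c;
	int i;

	d[0] = (uint64_t)h[0] * r[0] + (uint64_t)h[4] * s[1] +
	       (uint64_t)h[3] * s[2] + (uint64_t)h[2] * s[3] +
	       (uint64_t)h[1] * s[4];
	d[1] = (uint64_t)h[1] * r[0] + (uint64_t)h[0] * r[1] +
	       (uint64_t)h[4] * s[2] + (uint64_t)h[3] * s[3] +
	       (uint64_t)h[2] * s[4];
	d[2] = (uint64_t)h[2] * r[0] + (uint64_t)h[1] * r[1] +
	       (uint64_t)h[0] * r[2] + (uint64_t)h[4] * s[3] +
	       (uint64_t)h[3] * s[4];
	d[3] = (uint64_t)h[3] * r[0] + (uint64_t)h[2] * r[1] +
	       (uint64_t)h[1] * r[2] + (uint64_t)h[0] * r[3] +
	       (uint64_t)h[4] * s[4];
	d[4] = (uint64_t)h[4] * r[0] + (uint64_t)h[3] * r[1] +
	       (uint64_t)h[2] * r[2] + (uint64_t)h[1] * r[3] +
	       (uint64_t)h[0] * r[4];

	/* 2^26 carries; the h4 -> h0 carry is multiplied by 5 because
	 * 2^130 == 5 (mod 2^130 - 5). */
	c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;
	c = d[4] >> 26; d[4] &= 0x3ffffff; d[0] += 5 * c;
	c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;
	c = d[1] >> 26; d[1] &= 0x3ffffff; d[2] += c;
	c = d[2] >> 26; d[2] &= 0x3ffffff; d[3] += c;
	c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;

	for (i = 0; i < 5; i++)
		h[i] = (uint32_t)d[i];
}

Per block, the code adds the message limbs into the accumulator and then
multiplies by a power of r (r^4/r^2/r in the two-way NEON interleaving), which
is what the vadd.i32 "accumulate inp[0:1]" instructions followed by the
vmull/vmlal chains implement.
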
diff --git a/lib/crypto/arm/poly1305-glue.c b/lib/crypto/arm/poly1305-glue.c
new file mode 100644 (file)
index 0000000..2603b07
--- /dev/null
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <crypto/internal/poly1305.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+
+asmlinkage void poly1305_block_init_arch(
+       struct poly1305_block_state *state,
+       const u8 raw_key[POLY1305_BLOCK_SIZE]);
+EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
+asmlinkage void poly1305_blocks_arm(struct poly1305_block_state *state,
+                                   const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+                                    const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
+                                  u8 digest[POLY1305_DIGEST_SIZE],
+                                  const u32 nonce[4]);
+EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+
+void __weak poly1305_blocks_neon(struct poly1305_block_state *state,
+                                const u8 *src, u32 len, u32 hibit)
+{
+}
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
+                         unsigned int len, u32 padbit)
+{
+       len = round_down(len, POLY1305_BLOCK_SIZE);
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+           static_branch_likely(&have_neon)) {
+               do {
+                       unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+                       kernel_neon_begin();
+                       poly1305_blocks_neon(state, src, todo, padbit);
+                       kernel_neon_end();
+
+                       len -= todo;
+                       src += todo;
+               } while (len);
+       } else
+               poly1305_blocks_arm(state, src, len, padbit);
+}
+EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
+
+bool poly1305_is_arch_optimized(void)
+{
+       /* We can always use at least the ARM scalar implementation. */
+       return true;
+}
+EXPORT_SYMBOL(poly1305_is_arch_optimized);
+
+static int __init arm_poly1305_mod_init(void)
+{
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+           (elf_hwcap & HWCAP_NEON))
+               static_branch_enable(&have_neon);
+       return 0;
+}
+subsys_initcall(arm_poly1305_mod_init);
+
+static void __exit arm_poly1305_mod_exit(void)
+{
+}
+module_exit(arm_poly1305_mod_exit);
+
+MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
+MODULE_LICENSE("GPL v2");
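
The glue above only exports the arch hooks; in-kernel users normally reach
them through the generic Poly1305 library interface. Below is a hedged usage
sketch, assuming the poly1305_init/poly1305_update/poly1305_final helpers
declared in <crypto/poly1305.h>; the example function itself is hypothetical.

/* Hedged sketch, not part of the patch: one-shot MAC over a buffer.
 * The block processing ends up in poly1305_blocks_arch()/poly1305_emit_arch()
 * above. */
#include <crypto/poly1305.h>
#include <linux/string.h>
#include <linux/types.h>

static void example_poly1305_mac(const u8 key[POLY1305_KEY_SIZE],
				 const u8 *data, unsigned int len,
				 u8 mac[POLY1305_DIGEST_SIZE])
{
	struct poly1305_desc_ctx desc;

	poly1305_init(&desc, key);
	poly1305_update(&desc, data, len);
	poly1305_final(&desc, mac);
	memzero_explicit(&desc, sizeof(desc));
}

Note the SZ_4K chunking in poly1305_blocks_arch(): it keeps each
kernel_neon_begin()/kernel_neon_end() section to at most 4 KiB of input, so
large updates do not hold the non-preemptible NEON section for the whole
buffer.
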
diff --git a/lib/crypto/arm/sha256-armv4.pl b/lib/crypto/arm/sha256-armv4.pl
new file mode 100644 (file)
index 0000000..8122db7
--- /dev/null
@@ -0,0 +1,724 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see https://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256 block procedure for ARMv4. May 2007.
+
+# Performance is ~2x better than gcc 3.4 generated code and in "abso-
+# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+# byte [on single-issue Xscale PXA250 core].
+
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 22% improvement on
+# Cortex A8 core and ~20 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
+
+# May 2014.
+#
+# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";     $t0="r0";
+$inp="r1";     $t4="r1";
+$len="r2";     $t1="r2";
+$T1="r3";      $t3="r3";
+$A="r4";
+$B="r5";
+$C="r6";
+$D="r7";
+$E="r8";
+$F="r9";
+$G="r10";
+$H="r11";
+@V=($A,$B,$C,$D,$E,$F,$G,$H);
+$t2="r12";
+$Ktbl="r14";
+
+@Sigma0=( 2,13,22);
+@Sigma1=( 6,11,25);
+@sigma0=( 7,18, 3);
+@sigma1=(17,19,10);
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+       @ ldr   $t1,[$inp],#4                   @ $i
+# if $i==15
+       str     $inp,[sp,#17*4]                 @ make room for $t4
+# endif
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     $t1,$t1
+# endif
+#else
+       @ ldrb  $t1,[$inp,#3]                   @ $i
+       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
+       ldrb    $t2,[$inp,#2]
+       ldrb    $t0,[$inp,#1]
+       orr     $t1,$t1,$t2,lsl#8
+       ldrb    $t2,[$inp],#4
+       orr     $t1,$t1,$t0,lsl#16
+# if $i==15
+       str     $inp,[sp,#17*4]                 @ make room for $t4
+# endif
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
+       orr     $t1,$t1,$t2,lsl#24
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
+#endif
+___
+$code.=<<___;
+       ldr     $t2,[$Ktbl],#4                  @ *K256++
+       add     $h,$h,$t1                       @ h+=X[i]
+       str     $t1,[sp,#`$i%16`*4]
+       eor     $t1,$f,$g
+       add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
+       and     $t1,$t1,$e
+       add     $h,$h,$t2                       @ h+=K256[i]
+       eor     $t1,$t1,$g                      @ Ch(e,f,g)
+       eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
+       add     $h,$h,$t1                       @ h+=Ch(e,f,g)
+#if $i==31
+       and     $t2,$t2,#0xff
+       cmp     $t2,#0xf2                       @ done?
+#endif
+#if $i<15
+# if __ARM_ARCH__>=7
+       ldr     $t1,[$inp],#4                   @ prefetch
+# else
+       ldrb    $t1,[$inp,#3]
+# endif
+       eor     $t2,$a,$b                       @ a^b, b^c in next round
+#else
+       ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
+       eor     $t2,$a,$b                       @ a^b, b^c in next round
+       ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
+#endif
+       eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
+       and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
+       add     $d,$d,$h                        @ d+=h
+       eor     $t3,$t3,$b                      @ Maj(a,b,c)
+       add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
+       @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
+___
+       ($t2,$t3)=($t3,$t2);
+}
+
+sub BODY_16_XX {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
+
+$code.=<<___;
+       @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
+       @ ldr   $t4,[sp,#`($i+14)%16`*4]
+       mov     $t0,$t1,ror#$sigma0[0]
+       add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
+       mov     $t2,$t4,ror#$sigma1[0]
+       eor     $t0,$t0,$t1,ror#$sigma0[1]
+       eor     $t2,$t2,$t4,ror#$sigma1[1]
+       eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
+       ldr     $t1,[sp,#`($i+0)%16`*4]
+       eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
+       ldr     $t4,[sp,#`($i+9)%16`*4]
+
+       add     $t2,$t2,$t0
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
+       add     $t1,$t1,$t2
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
+       add     $t1,$t1,$t4                     @ X[i]
+___
+       &BODY_00_15(@_);
+}
+
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code  32
+#else
+.syntax unified
+# ifdef __thumb2__
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type  K256,%object
+.align 5
+K256:
+.word  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size  K256,.-K256
+.word  0                               @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word  OPENSSL_armcap_P-sha256_blocks_arch
+#endif
+.align 5
+
+.global        sha256_blocks_arch
+.type  sha256_blocks_arch,%function
+sha256_blocks_arch:
+.Lsha256_blocks_arch:
+#if __ARM_ARCH__<7
+       sub     r3,pc,#8                @ sha256_blocks_arch
+#else
+       adr     r3,.Lsha256_blocks_arch
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+       ldr     r12,.LOPENSSL_armcap
+       ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
+       tst     r12,#ARMV8_SHA256
+       bne     .LARMv8
+       tst     r12,#ARMV7_NEON
+       bne     .LNEON
+#endif
+       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
+       stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
+       ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
+       sub     $Ktbl,r3,#256+32        @ K256
+       sub     sp,sp,#16*4             @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+       ldr     $t1,[$inp],#4
+# else
+       ldrb    $t1,[$inp,#3]
+# endif
+       eor     $t3,$B,$C               @ magic
+       eor     $t2,$t2,$t2
+___
+for($i=0;$i<16;$i++)   { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=".Lrounds_16_xx:\n";
+for (;$i<32;$i++)      { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+#if __ARM_ARCH__>=7
+       ite     eq                      @ Thumb2 thing, sanity check in ARM
+#endif
+       ldreq   $t3,[sp,#16*4]          @ pull ctx
+       bne     .Lrounds_16_xx
+
+       add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
+       ldr     $t0,[$t3,#0]
+       ldr     $t1,[$t3,#4]
+       ldr     $t2,[$t3,#8]
+       add     $A,$A,$t0
+       ldr     $t0,[$t3,#12]
+       add     $B,$B,$t1
+       ldr     $t1,[$t3,#16]
+       add     $C,$C,$t2
+       ldr     $t2,[$t3,#20]
+       add     $D,$D,$t0
+       ldr     $t0,[$t3,#24]
+       add     $E,$E,$t1
+       ldr     $t1,[$t3,#28]
+       add     $F,$F,$t2
+       ldr     $inp,[sp,#17*4]         @ pull inp
+       ldr     $t2,[sp,#18*4]          @ pull inp+len
+       add     $G,$G,$t0
+       add     $H,$H,$t1
+       stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
+       cmp     $inp,$t2
+       sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
+       bne     .Loop
+
+       add     sp,sp,#`16+3`*4 @ destroy frame
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r11,pc}
+#else
+       ldmia   sp!,{r4-r11,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+.size  sha256_blocks_arch,.-sha256_blocks_arch
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+       &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T2,$T0,$sigma0[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T1,$T0,$sigma0[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vsli_32        ($T2,$T0,32-$sigma0[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T3,$T0,$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &veor           ($T1,$T1,$T2);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vsli_32        ($T3,$T0,32-$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       ($T0,$T0,@X[0]);
+        while($#insns>=2) { eval(shift(@insns)); }
+       &vst1_32        ("{$T0}","[$Xfer,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vrev32_8       (@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       ($T0,$T0,@X[0]);
+        foreach (@insns) { eval; }     # remaining instructions
+       &vst1_32        ("{$T0}","[$Xfer,:128]!");
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub body_00_15 () {
+       (
+       '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+       '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
+       '&eor   ($t1,$f,$g)',
+       '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+       '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
+       '&and   ($t1,$t1,$e)',
+       '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
+       '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+       '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
+       '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
+       '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
+       '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
+       '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
+       '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
+       '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
+       '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
+       '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
+       '&add   ($d,$d,$h)',                    # d+=h
+       '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+       '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
+       '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+       )
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch  armv7-a
+.fpu   neon
+
+.global        sha256_block_data_order_neon
+.type  sha256_block_data_order_neon,%function
+.align 4
+sha256_block_data_order_neon:
+.LNEON:
+       stmdb   sp!,{r4-r12,lr}
+
+       sub     $H,sp,#16*4+16
+       adr     $Ktbl,.Lsha256_blocks_arch
+       sub     $Ktbl,$Ktbl,#.Lsha256_blocks_arch-K256
+       bic     $H,$H,#15               @ align for 128-bit stores
+       mov     $t2,sp
+       mov     sp,$H                   @ alloca
+       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
+
+       vld1.8          {@X[0]},[$inp]!
+       vld1.8          {@X[1]},[$inp]!
+       vld1.8          {@X[2]},[$inp]!
+       vld1.8          {@X[3]},[$inp]!
+       vld1.32         {$T0},[$Ktbl,:128]!
+       vld1.32         {$T1},[$Ktbl,:128]!
+       vld1.32         {$T2},[$Ktbl,:128]!
+       vld1.32         {$T3},[$Ktbl,:128]!
+       vrev32.8        @X[0],@X[0]             @ yes, even on
+       str             $ctx,[sp,#64]
+       vrev32.8        @X[1],@X[1]             @ big-endian
+       str             $inp,[sp,#68]
+       mov             $Xfer,sp
+       vrev32.8        @X[2],@X[2]
+       str             $len,[sp,#72]
+       vrev32.8        @X[3],@X[3]
+       str             $t2,[sp,#76]            @ save original sp
+       vadd.i32        $T0,$T0,@X[0]
+       vadd.i32        $T1,$T1,@X[1]
+       vst1.32         {$T0},[$Xfer,:128]!
+       vadd.i32        $T2,$T2,@X[2]
+       vst1.32         {$T1},[$Xfer,:128]!
+       vadd.i32        $T3,$T3,@X[3]
+       vst1.32         {$T2},[$Xfer,:128]!
+       vst1.32         {$T3},[$Xfer,:128]!
+
+       ldmia           $ctx,{$A-$H}
+       sub             $Xfer,$Xfer,#64
+       ldr             $t1,[sp,#0]
+       eor             $t2,$t2,$t2
+       eor             $t3,$B,$C
+       b               .L_00_48
+
+.align 4
+.L_00_48:
+___
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+$code.=<<___;
+       teq     $t1,#0                          @ check for K256 terminator
+       ldr     $t1,[sp,#0]
+       sub     $Xfer,$Xfer,#64
+       bne     .L_00_48
+
+       ldr             $inp,[sp,#68]
+       ldr             $t0,[sp,#72]
+       sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
+       teq             $inp,$t0
+       it              eq
+       subeq           $inp,$inp,#64           @ avoid SEGV
+       vld1.8          {@X[0]},[$inp]!         @ load next input block
+       vld1.8          {@X[1]},[$inp]!
+       vld1.8          {@X[2]},[$inp]!
+       vld1.8          {@X[3]},[$inp]!
+       it              ne
+       strne           $inp,[sp,#68]
+       mov             $Xfer,sp
+___
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+$code.=<<___;
+       ldr     $t0,[$t1,#0]
+       add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
+       ldr     $t2,[$t1,#4]
+       ldr     $t3,[$t1,#8]
+       ldr     $t4,[$t1,#12]
+       add     $A,$A,$t0                       @ accumulate
+       ldr     $t0,[$t1,#16]
+       add     $B,$B,$t2
+       ldr     $t2,[$t1,#20]
+       add     $C,$C,$t3
+       ldr     $t3,[$t1,#24]
+       add     $D,$D,$t4
+       ldr     $t4,[$t1,#28]
+       add     $E,$E,$t0
+       str     $A,[$t1],#4
+       add     $F,$F,$t2
+       str     $B,[$t1],#4
+       add     $G,$G,$t3
+       str     $C,[$t1],#4
+       add     $H,$H,$t4
+       str     $D,[$t1],#4
+       stmia   $t1,{$E-$H}
+
+       ittte   ne
+       movne   $Xfer,sp
+       ldrne   $t1,[sp,#0]
+       eorne   $t2,$t2,$t2
+       ldreq   sp,[sp,#76]                     @ restore original sp
+       itt     ne
+       eorne   $t3,$B,$C
+       bne     .L_00_48
+
+       ldmia   sp!,{r4-r12,pc}
+.size  sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+___
+}}}
+######################################################################
+# ARMv8 stuff
+#
+{{{
+my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
+my @MSG=map("q$_",(8..11));
+my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
+my $Ktbl="r3";
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# ifdef __thumb2__
+#  define INST(a,b,c,d)        .byte   c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)        .byte   a,b,c,d
+# endif
+
+.type  sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+       vld1.32 {$ABCD,$EFGH},[$ctx]
+# ifdef __thumb2__
+       adr     $Ktbl,.LARMv8
+       sub     $Ktbl,$Ktbl,#.LARMv8-K256
+# else
+       adrl    $Ktbl,K256
+# endif
+       add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
+
+.Loop_v8:
+       vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
+       vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
+       vld1.32         {$W0},[$Ktbl]!
+       vrev32.8        @MSG[0],@MSG[0]
+       vrev32.8        @MSG[1],@MSG[1]
+       vrev32.8        @MSG[2],@MSG[2]
+       vrev32.8        @MSG[3],@MSG[3]
+       vmov            $ABCD_SAVE,$ABCD        @ offload
+       vmov            $EFGH_SAVE,$EFGH
+       teq             $inp,$len
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+       vld1.32         {$W1},[$Ktbl]!
+       vadd.i32        $W0,$W0,@MSG[0]
+       sha256su0       @MSG[0],@MSG[1]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+       sha256su1       @MSG[0],@MSG[2],@MSG[3]
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+       vld1.32         {$W1},[$Ktbl]!
+       vadd.i32        $W0,$W0,@MSG[0]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+
+       vld1.32         {$W0},[$Ktbl]!
+       vadd.i32        $W1,$W1,@MSG[1]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W1
+       sha256h2        $EFGH,$abcd,$W1
+
+       vld1.32         {$W1},[$Ktbl]
+       vadd.i32        $W0,$W0,@MSG[2]
+       sub             $Ktbl,$Ktbl,#256-16     @ rewind
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+
+       vadd.i32        $W1,$W1,@MSG[3]
+       vmov            $abcd,$ABCD
+       sha256h         $ABCD,$EFGH,$W1
+       sha256h2        $EFGH,$abcd,$W1
+
+       vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
+       vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
+       it              ne
+       bne             .Loop_v8
+
+       vst1.32         {$ABCD,$EFGH},[$ctx]
+
+       ret             @ bx lr
+.size  sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+___
+}}}
+$code.=<<___;
+.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm   OPENSSL_armcap_P,4,4
+#endif
+___
+
+open SELF,$0;
+while(<SELF>) {
+       next if (/^#!/);
+       last if (!s/^#/@/ and !/^$/);
+       print;
+}
+close SELF;
+
+{   my  %opcode = (
+       "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
+       "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );
+
+    sub unsha256 {
+       my ($mnemonic,$arg)=@_;
+
+       if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+           my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+                                        |(($2&7)<<17)|(($2&8)<<4)
+                                        |(($3&7)<<1) |(($3&8)<<2);
+           # Bytes are emitted in this order since ARMv7 instructions are
+           # always encoded little-endian.  The correct solution is to use
+           # the .inst directive, but older assemblers don't implement it :-(
+           sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+                       $word&0xff,($word>>8)&0xff,
+                       ($word>>16)&0xff,($word>>24)&0xff,
+                       $mnemonic,$arg;
+       }
+    }
+}
+
+foreach (split($/,$code)) {
+
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
+
+       s/\bret\b/bx    lr/go           or
+       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
+
+       print $_,"\n";
+}
+
+close STDOUT; # enforce flush
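
As a plain-C companion to the BODY_00_15/BODY_16_XX rounds above (illustrative
only, not part of the moved file), this is the standard FIPS 180-4 round with
the rotation counts taken from @Sigma0/@Sigma1/@sigma0/@sigma1. Ch and Maj are
written in the XOR forms the assembly uses (the "a^b, b^c in next round"
trick), which are algebraically identical to the textbook definitions.

/* Reference sketch only, not kernel code. */
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)
{
	return (x >> n) | (x << (32 - n));
}

static inline uint32_t Sigma0(uint32_t x) { return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); }
static inline uint32_t Sigma1(uint32_t x) { return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); }
static inline uint32_t sigma0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
static inline uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

/* XOR forms used by the assembly; equal to the usual Ch/Maj definitions. */
static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)
{
	return ((f ^ g) & e) ^ g;
}

static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c)
{
	return ((b ^ c) & (a ^ b)) ^ b;
}

/* One round; s[] holds {a,b,c,d,e,f,g,h}, ki is K256[i], wi is the schedule
 * word W[i].  For i >= 16 the schedule (BODY_16_XX) is
 * W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]. */
static void sha256_round(uint32_t s[8], uint32_t ki, uint32_t wi)
{
	uint32_t t1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + ki + wi;
	uint32_t t2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);

	s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + t2;
}
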
diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S
new file mode 100644 (file)
index 0000000..ac2c9b0
--- /dev/null
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
+ *
+ * Copyright (C) 2015 Linaro Ltd.
+ * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       .text
+       .arch           armv8-a
+       .fpu            crypto-neon-fp-armv8
+
+       k0              .req    q7
+       k1              .req    q8
+       rk              .req    r3
+
+       ta0             .req    q9
+       ta1             .req    q10
+       tb0             .req    q10
+       tb1             .req    q9
+
+       dga             .req    q11
+       dgb             .req    q12
+
+       dg0             .req    q13
+       dg1             .req    q14
+       dg2             .req    q15
+
+       .macro          add_only, ev, s0
+       vmov            dg2, dg0
+       .ifnb           \s0
+       vld1.32         {k\ev}, [rk, :128]!
+       .endif
+       sha256h.32      dg0, dg1, tb\ev
+       sha256h2.32     dg1, dg2, tb\ev
+       .ifnb           \s0
+       vadd.u32        ta\ev, q\s0, k\ev
+       .endif
+       .endm
+
+       .macro          add_update, ev, s0, s1, s2, s3
+       sha256su0.32    q\s0, q\s1
+       add_only        \ev, \s1
+       sha256su1.32    q\s0, q\s2, q\s3
+       .endm
+
+       .align          6
+.Lsha256_rcon:
+       .word           0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+       .word           0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+       .word           0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+       .word           0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+       .word           0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+       .word           0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+       .word           0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+       .word           0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+       .word           0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+       .word           0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+       .word           0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+       .word           0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+       .word           0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+       .word           0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+       .word           0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+       .word           0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+       /*
+        * void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+        *                          const u8 *data, size_t nblocks);
+        */
+ENTRY(sha256_ce_transform)
+       /* load state */
+       vld1.32         {dga-dgb}, [r0]
+
+       /* load input */
+0:     vld1.32         {q0-q1}, [r1]!
+       vld1.32         {q2-q3}, [r1]!
+       subs            r2, r2, #1
+
+#ifndef CONFIG_CPU_BIG_ENDIAN
+       vrev32.8        q0, q0
+       vrev32.8        q1, q1
+       vrev32.8        q2, q2
+       vrev32.8        q3, q3
+#endif
+
+       /* load first round constant */
+       adr             rk, .Lsha256_rcon
+       vld1.32         {k0}, [rk, :128]!
+
+       vadd.u32        ta0, q0, k0
+       vmov            dg0, dga
+       vmov            dg1, dgb
+
+       add_update      1, 0, 1, 2, 3
+       add_update      0, 1, 2, 3, 0
+       add_update      1, 2, 3, 0, 1
+       add_update      0, 3, 0, 1, 2
+       add_update      1, 0, 1, 2, 3
+       add_update      0, 1, 2, 3, 0
+       add_update      1, 2, 3, 0, 1
+       add_update      0, 3, 0, 1, 2
+       add_update      1, 0, 1, 2, 3
+       add_update      0, 1, 2, 3, 0
+       add_update      1, 2, 3, 0, 1
+       add_update      0, 3, 0, 1, 2
+
+       add_only        1, 1
+       add_only        0, 2
+       add_only        1, 3
+       add_only        0
+
+       /* update state */
+       vadd.u32        dga, dga, dg0
+       vadd.u32        dgb, dgb, dg1
+       bne             0b
+
+       /* store new state */
+       vst1.32         {dga-dgb}, [r0]
+       bx              lr
+ENDPROC(sha256_ce_transform)
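
The add_update/add_only macros above express the Crypto Extensions quad-round
step directly in assembly. For reference, the same step can be written with
the ACLE intrinsics; the sketch below is illustrative only (it assumes
arm_neon.h with the ARMv8 Crypto Extensions enabled, e.g.
-mfpu=crypto-neon-fp-armv8, and is not how the kernel builds this file).

/* Illustrative sketch only, not kernel code. */
#include <arm_neon.h>

/* Four rounds of the compression function plus one message-schedule update,
 * mirroring one add_update invocation.  wk holds W[i..i+3] + K[i..i+3];
 * w0..w3 are the next four schedule quadwords. */
static void sha256_ce_quad_round(uint32x4_t *abcd, uint32x4_t *efgh,
				 uint32x4_t wk, uint32x4_t *w0,
				 uint32x4_t w1, uint32x4_t w2, uint32x4_t w3)
{
	uint32x4_t abcd_prev = *abcd;

	*abcd = vsha256hq_u32(*abcd, *efgh, wk);	/* sha256h.32  */
	*efgh = vsha256h2q_u32(*efgh, abcd_prev, wk);	/* sha256h2.32 */
	*w0   = vsha256su1q_u32(vsha256su0q_u32(*w0, w1),	/* sha256su0.32 */
				w2, w3);			/* sha256su1.32 */
}
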
diff --git a/lib/crypto/arm/sha256.c b/lib/crypto/arm/sha256.c
new file mode 100644 (file)
index 0000000..109192e
--- /dev/null
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 optimized for ARM
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+                                  const u8 *data, size_t nblocks);
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+asmlinkage void sha256_block_data_order_neon(u32 state[SHA256_STATE_WORDS],
+                                            const u8 *data, size_t nblocks);
+asmlinkage void sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+                                   const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
+                       const u8 *data, size_t nblocks)
+{
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+           static_branch_likely(&have_neon)) {
+               kernel_neon_begin();
+               if (static_branch_likely(&have_ce))
+                       sha256_ce_transform(state, data, nblocks);
+               else
+                       sha256_block_data_order_neon(state, data, nblocks);
+               kernel_neon_end();
+       } else {
+               sha256_blocks_arch(state, data, nblocks);
+       }
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_simd);
+
+bool sha256_is_arch_optimized(void)
+{
+       /* We can always use at least the ARM scalar implementation. */
+       return true;
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+static int __init sha256_arm_mod_init(void)
+{
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+               static_branch_enable(&have_neon);
+               if (elf_hwcap2 & HWCAP2_SHA2)
+                       static_branch_enable(&have_ce);
+       }
+       return 0;
+}
+subsys_initcall(sha256_arm_mod_init);
+
+static void __exit sha256_arm_mod_exit(void)
+{
+}
+module_exit(sha256_arm_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 optimized for ARM");
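
Nothing calls these transforms directly; kernel users go through the generic
SHA-256 library interface, which ends up in the block functions exported
above depending on the CPU features probed at init time. A hedged usage
sketch, assuming the one-shot sha256() helper declared in <crypto/sha2.h>;
the example function itself is hypothetical.

/* Hedged sketch, not part of the patch. */
#include <crypto/sha2.h>
#include <linux/types.h>

static void example_sha256(const u8 *data, unsigned int len,
			   u8 digest[SHA256_DIGEST_SIZE])
{
	sha256(data, len, digest);	/* incremental sha256_init/_update/
					 * _final variants exist as well */
}
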