lib/crypto: arm64: Move arch/arm64/lib/crypto/ into lib/crypto/
author	Eric Biggers <ebiggers@kernel.org>
Thu, 19 Jun 2025 19:19:01 +0000 (12:19 -0700)
committer	Eric Biggers <ebiggers@kernel.org>
Mon, 30 Jun 2025 16:26:20 +0000 (09:26 -0700)
Move the contents of arch/arm64/lib/crypto/ into lib/crypto/arm64/.

The new code organization makes a lot more sense for how this code
actually works and is developed.  In particular, it makes it possible to
build each algorithm as a single module, with better inlining and dead
code elimination.  For a more detailed explanation, see the patchset
which did this for the CRC library code:
https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/.
Also see the patchset which did this for SHA-512:
https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/

This is just a preparatory commit, which does the move to get the files
into their new location but keeps them building the same way as before.
Later commits will make the actual improvements to the way the
arch-optimized code is integrated for each algorithm.
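[For context only, not part of the recorded commit message: the lib/crypto/Kconfig
and lib/crypto/Makefile hunks are not reproduced in full on this page, but to keep
the algorithms "building the same way as before" the top-level wiring presumably
ends up looking roughly like the sketch below, with lib/crypto/Kconfig sourcing the
relocated per-arch Kconfig and lib/crypto/Makefile descending into the new arm64/
directory. This is a hedged illustration; the exact guard conditions and line
placement in the real hunks may differ.

    # lib/crypto/Kconfig (sketch; real guard/condition may differ)
    if ARM64
    source "lib/crypto/arm64/Kconfig"
    endif

    # lib/crypto/Makefile (sketch)
    obj-$(CONFIG_ARM64) += arm64/
]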

Add a gitignore entry for the removed directory arch/arm64/lib/crypto/
so that people don't accidentally commit leftover generated files.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Sohil Mehta <sohil.mehta@intel.com>
Link: https://lore.kernel.org/r/20250619191908.134235-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
24 files changed:
arch/arm64/lib/.gitignore [new file with mode: 0644]
arch/arm64/lib/Makefile
arch/arm64/lib/crypto/.gitignore [deleted file]
arch/arm64/lib/crypto/Kconfig [deleted file]
arch/arm64/lib/crypto/Makefile [deleted file]
arch/arm64/lib/crypto/chacha-neon-core.S [deleted file]
arch/arm64/lib/crypto/chacha-neon-glue.c [deleted file]
arch/arm64/lib/crypto/poly1305-armv8.pl [deleted file]
arch/arm64/lib/crypto/poly1305-glue.c [deleted file]
arch/arm64/lib/crypto/sha2-armv8.pl [deleted file]
arch/arm64/lib/crypto/sha256-ce.S [deleted file]
arch/arm64/lib/crypto/sha256.c [deleted file]
lib/crypto/Kconfig
lib/crypto/Makefile
lib/crypto/arm64/.gitignore
lib/crypto/arm64/Kconfig [new file with mode: 0644]
lib/crypto/arm64/Makefile [new file with mode: 0644]
lib/crypto/arm64/chacha-neon-core.S [new file with mode: 0644]
lib/crypto/arm64/chacha-neon-glue.c [new file with mode: 0644]
lib/crypto/arm64/poly1305-armv8.pl [new file with mode: 0644]
lib/crypto/arm64/poly1305-glue.c [new file with mode: 0644]
lib/crypto/arm64/sha2-armv8.pl [new file with mode: 0644]
lib/crypto/arm64/sha256-ce.S [new file with mode: 0644]
lib/crypto/arm64/sha256.c [new file with mode: 0644]

diff --git a/arch/arm64/lib/.gitignore b/arch/arm64/lib/.gitignore
new file mode 100644 (file)
index 0000000..647d7a9
--- /dev/null
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 027bfa9689c6a2b99e8fb5c2b6e239b808ff74f2..d97e290619bc5517403b99f3e3e2742bb771880b 100644 (file)
@@ -1,7 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-
-obj-y += crypto/
-
 lib-y          := clear_user.o delay.o copy_from_user.o                \
                   copy_to_user.o copy_page.o                           \
                   clear_page.o csum.o insn.o memchr.o memcpy.o         \
diff --git a/arch/arm64/lib/crypto/.gitignore b/arch/arm64/lib/crypto/.gitignore
deleted file mode 100644 (file)
index 12d74d8..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-poly1305-core.S
-sha256-core.S
diff --git a/arch/arm64/lib/crypto/Kconfig b/arch/arm64/lib/crypto/Kconfig
deleted file mode 100644 (file)
index 129a768..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config CRYPTO_CHACHA20_NEON
-       tristate
-       depends on KERNEL_MODE_NEON
-       default CRYPTO_LIB_CHACHA
-       select CRYPTO_LIB_CHACHA_GENERIC
-       select CRYPTO_ARCH_HAVE_LIB_CHACHA
-
-config CRYPTO_POLY1305_NEON
-       tristate
-       depends on KERNEL_MODE_NEON
-       default CRYPTO_LIB_POLY1305
-       select CRYPTO_ARCH_HAVE_LIB_POLY1305
-
-config CRYPTO_SHA256_ARM64
-       tristate
-       default CRYPTO_LIB_SHA256
-       select CRYPTO_ARCH_HAVE_LIB_SHA256
-       select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
diff --git a/arch/arm64/lib/crypto/Makefile b/arch/arm64/lib/crypto/Makefile
deleted file mode 100644 (file)
index 946c099..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
-
-obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
-poly1305-neon-y := poly1305-core.o poly1305-glue.o
-AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch
-AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch
-
-obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o
-sha256-arm64-y := sha256.o sha256-core.o
-sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
-
-quiet_cmd_perlasm = PERLASM $@
-      cmd_perlasm = $(PERL) $(<) void $(@)
-
-$(obj)/%-core.S: $(src)/%-armv8.pl
-       $(call cmd,perlasm)
-
-$(obj)/sha256-core.S: $(src)/sha2-armv8.pl
-       $(call cmd,perlasm)
-
-clean-files += poly1305-core.S sha256-core.S
diff --git a/arch/arm64/lib/crypto/chacha-neon-core.S b/arch/arm64/lib/crypto/chacha-neon-core.S
deleted file mode 100644 (file)
index 8007958..0000000
+++ /dev/null
@@ -1,805 +0,0 @@
-/*
- * ChaCha/HChaCha NEON helper functions
- *
- * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Originally based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/cache.h>
-
-       .text
-       .align          6
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers v0-v3.  It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * The round count is given in w3.
- *
- * Clobbers: w3, x10, v4, v12
- */
-SYM_FUNC_START_LOCAL(chacha_permute)
-
-       adr_l           x10, ROT8
-       ld1             {v12.4s}, [x10]
-
-.Ldoubleround:
-       // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       add             v0.4s, v0.4s, v1.4s
-       eor             v3.16b, v3.16b, v0.16b
-       rev32           v3.8h, v3.8h
-
-       // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       add             v2.4s, v2.4s, v3.4s
-       eor             v4.16b, v1.16b, v2.16b
-       shl             v1.4s, v4.4s, #12
-       sri             v1.4s, v4.4s, #20
-
-       // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       add             v0.4s, v0.4s, v1.4s
-       eor             v3.16b, v3.16b, v0.16b
-       tbl             v3.16b, {v3.16b}, v12.16b
-
-       // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       add             v2.4s, v2.4s, v3.4s
-       eor             v4.16b, v1.16b, v2.16b
-       shl             v1.4s, v4.4s, #7
-       sri             v1.4s, v4.4s, #25
-
-       // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-       ext             v1.16b, v1.16b, v1.16b, #4
-       // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       ext             v2.16b, v2.16b, v2.16b, #8
-       // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-       ext             v3.16b, v3.16b, v3.16b, #12
-
-       // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-       add             v0.4s, v0.4s, v1.4s
-       eor             v3.16b, v3.16b, v0.16b
-       rev32           v3.8h, v3.8h
-
-       // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-       add             v2.4s, v2.4s, v3.4s
-       eor             v4.16b, v1.16b, v2.16b
-       shl             v1.4s, v4.4s, #12
-       sri             v1.4s, v4.4s, #20
-
-       // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-       add             v0.4s, v0.4s, v1.4s
-       eor             v3.16b, v3.16b, v0.16b
-       tbl             v3.16b, {v3.16b}, v12.16b
-
-       // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-       add             v2.4s, v2.4s, v3.4s
-       eor             v4.16b, v1.16b, v2.16b
-       shl             v1.4s, v4.4s, #7
-       sri             v1.4s, v4.4s, #25
-
-       // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-       ext             v1.16b, v1.16b, v1.16b, #12
-       // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-       ext             v2.16b, v2.16b, v2.16b, #8
-       // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-       ext             v3.16b, v3.16b, v3.16b, #4
-
-       subs            w3, w3, #2
-       b.ne            .Ldoubleround
-
-       ret
-SYM_FUNC_END(chacha_permute)
-
-SYM_FUNC_START(chacha_block_xor_neon)
-       // x0: Input state matrix, s
-       // x1: 1 data block output, o
-       // x2: 1 data block input, i
-       // w3: nrounds
-
-       stp             x29, x30, [sp, #-16]!
-       mov             x29, sp
-
-       // x0..3 = s0..3
-       ld1             {v0.4s-v3.4s}, [x0]
-       ld1             {v8.4s-v11.4s}, [x0]
-
-       bl              chacha_permute
-
-       ld1             {v4.16b-v7.16b}, [x2]
-
-       // o0 = i0 ^ (x0 + s0)
-       add             v0.4s, v0.4s, v8.4s
-       eor             v0.16b, v0.16b, v4.16b
-
-       // o1 = i1 ^ (x1 + s1)
-       add             v1.4s, v1.4s, v9.4s
-       eor             v1.16b, v1.16b, v5.16b
-
-       // o2 = i2 ^ (x2 + s2)
-       add             v2.4s, v2.4s, v10.4s
-       eor             v2.16b, v2.16b, v6.16b
-
-       // o3 = i3 ^ (x3 + s3)
-       add             v3.4s, v3.4s, v11.4s
-       eor             v3.16b, v3.16b, v7.16b
-
-       st1             {v0.16b-v3.16b}, [x1]
-
-       ldp             x29, x30, [sp], #16
-       ret
-SYM_FUNC_END(chacha_block_xor_neon)
-
-SYM_FUNC_START(hchacha_block_neon)
-       // x0: Input state matrix, s
-       // x1: output (8 32-bit words)
-       // w2: nrounds
-
-       stp             x29, x30, [sp, #-16]!
-       mov             x29, sp
-
-       ld1             {v0.4s-v3.4s}, [x0]
-
-       mov             w3, w2
-       bl              chacha_permute
-
-       st1             {v0.4s}, [x1], #16
-       st1             {v3.4s}, [x1]
-
-       ldp             x29, x30, [sp], #16
-       ret
-SYM_FUNC_END(hchacha_block_neon)
-
-       a0              .req    w12
-       a1              .req    w13
-       a2              .req    w14
-       a3              .req    w15
-       a4              .req    w16
-       a5              .req    w17
-       a6              .req    w19
-       a7              .req    w20
-       a8              .req    w21
-       a9              .req    w22
-       a10             .req    w23
-       a11             .req    w24
-       a12             .req    w25
-       a13             .req    w26
-       a14             .req    w27
-       a15             .req    w28
-
-       .align          6
-SYM_FUNC_START(chacha_4block_xor_neon)
-       frame_push      10
-
-       // x0: Input state matrix, s
-       // x1: 4 data blocks output, o
-       // x2: 4 data blocks input, i
-       // w3: nrounds
-       // x4: byte count
-
-       adr_l           x10, .Lpermute
-       and             x5, x4, #63
-       add             x10, x10, x5
-
-       //
-       // This function encrypts four consecutive ChaCha blocks by loading
-       // the state matrix in NEON registers four times. The algorithm performs
-       // each operation on the corresponding word of each state matrix, hence
-       // requires no word shuffling. For final XORing step we transpose the
-       // matrix by interleaving 32- and then 64-bit words, which allows us to
-       // do XOR in NEON registers.
-       //
-       // At the same time, a fifth block is encrypted in parallel using
-       // scalar registers
-       //
-       adr_l           x9, CTRINC              // ... and ROT8
-       ld1             {v30.4s-v31.4s}, [x9]
-
-       // x0..15[0-3] = s0..3[0..3]
-       add             x8, x0, #16
-       ld4r            { v0.4s- v3.4s}, [x0]
-       ld4r            { v4.4s- v7.4s}, [x8], #16
-       ld4r            { v8.4s-v11.4s}, [x8], #16
-       ld4r            {v12.4s-v15.4s}, [x8]
-
-       mov             a0, v0.s[0]
-       mov             a1, v1.s[0]
-       mov             a2, v2.s[0]
-       mov             a3, v3.s[0]
-       mov             a4, v4.s[0]
-       mov             a5, v5.s[0]
-       mov             a6, v6.s[0]
-       mov             a7, v7.s[0]
-       mov             a8, v8.s[0]
-       mov             a9, v9.s[0]
-       mov             a10, v10.s[0]
-       mov             a11, v11.s[0]
-       mov             a12, v12.s[0]
-       mov             a13, v13.s[0]
-       mov             a14, v14.s[0]
-       mov             a15, v15.s[0]
-
-       // x12 += counter values 1-4
-       add             v12.4s, v12.4s, v30.4s
-
-.Ldoubleround4:
-       // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-       // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-       // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-       // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-       add             v0.4s, v0.4s, v4.4s
-         add           a0, a0, a4
-       add             v1.4s, v1.4s, v5.4s
-         add           a1, a1, a5
-       add             v2.4s, v2.4s, v6.4s
-         add           a2, a2, a6
-       add             v3.4s, v3.4s, v7.4s
-         add           a3, a3, a7
-
-       eor             v12.16b, v12.16b, v0.16b
-         eor           a12, a12, a0
-       eor             v13.16b, v13.16b, v1.16b
-         eor           a13, a13, a1
-       eor             v14.16b, v14.16b, v2.16b
-         eor           a14, a14, a2
-       eor             v15.16b, v15.16b, v3.16b
-         eor           a15, a15, a3
-
-       rev32           v12.8h, v12.8h
-         ror           a12, a12, #16
-       rev32           v13.8h, v13.8h
-         ror           a13, a13, #16
-       rev32           v14.8h, v14.8h
-         ror           a14, a14, #16
-       rev32           v15.8h, v15.8h
-         ror           a15, a15, #16
-
-       // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-       // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-       // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-       // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-       add             v8.4s, v8.4s, v12.4s
-         add           a8, a8, a12
-       add             v9.4s, v9.4s, v13.4s
-         add           a9, a9, a13
-       add             v10.4s, v10.4s, v14.4s
-         add           a10, a10, a14
-       add             v11.4s, v11.4s, v15.4s
-         add           a11, a11, a15
-
-       eor             v16.16b, v4.16b, v8.16b
-         eor           a4, a4, a8
-       eor             v17.16b, v5.16b, v9.16b
-         eor           a5, a5, a9
-       eor             v18.16b, v6.16b, v10.16b
-         eor           a6, a6, a10
-       eor             v19.16b, v7.16b, v11.16b
-         eor           a7, a7, a11
-
-       shl             v4.4s, v16.4s, #12
-       shl             v5.4s, v17.4s, #12
-       shl             v6.4s, v18.4s, #12
-       shl             v7.4s, v19.4s, #12
-
-       sri             v4.4s, v16.4s, #20
-         ror           a4, a4, #20
-       sri             v5.4s, v17.4s, #20
-         ror           a5, a5, #20
-       sri             v6.4s, v18.4s, #20
-         ror           a6, a6, #20
-       sri             v7.4s, v19.4s, #20
-         ror           a7, a7, #20
-
-       // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-       // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-       // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-       // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-       add             v0.4s, v0.4s, v4.4s
-         add           a0, a0, a4
-       add             v1.4s, v1.4s, v5.4s
-         add           a1, a1, a5
-       add             v2.4s, v2.4s, v6.4s
-         add           a2, a2, a6
-       add             v3.4s, v3.4s, v7.4s
-         add           a3, a3, a7
-
-       eor             v12.16b, v12.16b, v0.16b
-         eor           a12, a12, a0
-       eor             v13.16b, v13.16b, v1.16b
-         eor           a13, a13, a1
-       eor             v14.16b, v14.16b, v2.16b
-         eor           a14, a14, a2
-       eor             v15.16b, v15.16b, v3.16b
-         eor           a15, a15, a3
-
-       tbl             v12.16b, {v12.16b}, v31.16b
-         ror           a12, a12, #24
-       tbl             v13.16b, {v13.16b}, v31.16b
-         ror           a13, a13, #24
-       tbl             v14.16b, {v14.16b}, v31.16b
-         ror           a14, a14, #24
-       tbl             v15.16b, {v15.16b}, v31.16b
-         ror           a15, a15, #24
-
-       // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-       // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-       // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-       // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-       add             v8.4s, v8.4s, v12.4s
-         add           a8, a8, a12
-       add             v9.4s, v9.4s, v13.4s
-         add           a9, a9, a13
-       add             v10.4s, v10.4s, v14.4s
-         add           a10, a10, a14
-       add             v11.4s, v11.4s, v15.4s
-         add           a11, a11, a15
-
-       eor             v16.16b, v4.16b, v8.16b
-         eor           a4, a4, a8
-       eor             v17.16b, v5.16b, v9.16b
-         eor           a5, a5, a9
-       eor             v18.16b, v6.16b, v10.16b
-         eor           a6, a6, a10
-       eor             v19.16b, v7.16b, v11.16b
-         eor           a7, a7, a11
-
-       shl             v4.4s, v16.4s, #7
-       shl             v5.4s, v17.4s, #7
-       shl             v6.4s, v18.4s, #7
-       shl             v7.4s, v19.4s, #7
-
-       sri             v4.4s, v16.4s, #25
-         ror           a4, a4, #25
-       sri             v5.4s, v17.4s, #25
-         ror           a5, a5, #25
-       sri             v6.4s, v18.4s, #25
-        ror            a6, a6, #25
-       sri             v7.4s, v19.4s, #25
-         ror           a7, a7, #25
-
-       // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-       // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
-       // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-       // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-       add             v0.4s, v0.4s, v5.4s
-         add           a0, a0, a5
-       add             v1.4s, v1.4s, v6.4s
-         add           a1, a1, a6
-       add             v2.4s, v2.4s, v7.4s
-         add           a2, a2, a7
-       add             v3.4s, v3.4s, v4.4s
-         add           a3, a3, a4
-
-       eor             v15.16b, v15.16b, v0.16b
-         eor           a15, a15, a0
-       eor             v12.16b, v12.16b, v1.16b
-         eor           a12, a12, a1
-       eor             v13.16b, v13.16b, v2.16b
-         eor           a13, a13, a2
-       eor             v14.16b, v14.16b, v3.16b
-         eor           a14, a14, a3
-
-       rev32           v15.8h, v15.8h
-         ror           a15, a15, #16
-       rev32           v12.8h, v12.8h
-         ror           a12, a12, #16
-       rev32           v13.8h, v13.8h
-         ror           a13, a13, #16
-       rev32           v14.8h, v14.8h
-         ror           a14, a14, #16
-
-       // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-       // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-       // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-       // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-       add             v10.4s, v10.4s, v15.4s
-         add           a10, a10, a15
-       add             v11.4s, v11.4s, v12.4s
-         add           a11, a11, a12
-       add             v8.4s, v8.4s, v13.4s
-         add           a8, a8, a13
-       add             v9.4s, v9.4s, v14.4s
-         add           a9, a9, a14
-
-       eor             v16.16b, v5.16b, v10.16b
-         eor           a5, a5, a10
-       eor             v17.16b, v6.16b, v11.16b
-         eor           a6, a6, a11
-       eor             v18.16b, v7.16b, v8.16b
-         eor           a7, a7, a8
-       eor             v19.16b, v4.16b, v9.16b
-         eor           a4, a4, a9
-
-       shl             v5.4s, v16.4s, #12
-       shl             v6.4s, v17.4s, #12
-       shl             v7.4s, v18.4s, #12
-       shl             v4.4s, v19.4s, #12
-
-       sri             v5.4s, v16.4s, #20
-         ror           a5, a5, #20
-       sri             v6.4s, v17.4s, #20
-         ror           a6, a6, #20
-       sri             v7.4s, v18.4s, #20
-         ror           a7, a7, #20
-       sri             v4.4s, v19.4s, #20
-         ror           a4, a4, #20
-
-       // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-       // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-       // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-       // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-       add             v0.4s, v0.4s, v5.4s
-         add           a0, a0, a5
-       add             v1.4s, v1.4s, v6.4s
-         add           a1, a1, a6
-       add             v2.4s, v2.4s, v7.4s
-         add           a2, a2, a7
-       add             v3.4s, v3.4s, v4.4s
-         add           a3, a3, a4
-
-       eor             v15.16b, v15.16b, v0.16b
-         eor           a15, a15, a0
-       eor             v12.16b, v12.16b, v1.16b
-         eor           a12, a12, a1
-       eor             v13.16b, v13.16b, v2.16b
-         eor           a13, a13, a2
-       eor             v14.16b, v14.16b, v3.16b
-         eor           a14, a14, a3
-
-       tbl             v15.16b, {v15.16b}, v31.16b
-         ror           a15, a15, #24
-       tbl             v12.16b, {v12.16b}, v31.16b
-         ror           a12, a12, #24
-       tbl             v13.16b, {v13.16b}, v31.16b
-         ror           a13, a13, #24
-       tbl             v14.16b, {v14.16b}, v31.16b
-         ror           a14, a14, #24
-
-       // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-       // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-       // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-       // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-       add             v10.4s, v10.4s, v15.4s
-         add           a10, a10, a15
-       add             v11.4s, v11.4s, v12.4s
-         add           a11, a11, a12
-       add             v8.4s, v8.4s, v13.4s
-         add           a8, a8, a13
-       add             v9.4s, v9.4s, v14.4s
-         add           a9, a9, a14
-
-       eor             v16.16b, v5.16b, v10.16b
-         eor           a5, a5, a10
-       eor             v17.16b, v6.16b, v11.16b
-         eor           a6, a6, a11
-       eor             v18.16b, v7.16b, v8.16b
-         eor           a7, a7, a8
-       eor             v19.16b, v4.16b, v9.16b
-         eor           a4, a4, a9
-
-       shl             v5.4s, v16.4s, #7
-       shl             v6.4s, v17.4s, #7
-       shl             v7.4s, v18.4s, #7
-       shl             v4.4s, v19.4s, #7
-
-       sri             v5.4s, v16.4s, #25
-         ror           a5, a5, #25
-       sri             v6.4s, v17.4s, #25
-         ror           a6, a6, #25
-       sri             v7.4s, v18.4s, #25
-         ror           a7, a7, #25
-       sri             v4.4s, v19.4s, #25
-         ror           a4, a4, #25
-
-       subs            w3, w3, #2
-       b.ne            .Ldoubleround4
-
-       ld4r            {v16.4s-v19.4s}, [x0], #16
-       ld4r            {v20.4s-v23.4s}, [x0], #16
-
-       // x12 += counter values 0-3
-       add             v12.4s, v12.4s, v30.4s
-
-       // x0[0-3] += s0[0]
-       // x1[0-3] += s0[1]
-       // x2[0-3] += s0[2]
-       // x3[0-3] += s0[3]
-       add             v0.4s, v0.4s, v16.4s
-         mov           w6, v16.s[0]
-         mov           w7, v17.s[0]
-       add             v1.4s, v1.4s, v17.4s
-         mov           w8, v18.s[0]
-         mov           w9, v19.s[0]
-       add             v2.4s, v2.4s, v18.4s
-         add           a0, a0, w6
-         add           a1, a1, w7
-       add             v3.4s, v3.4s, v19.4s
-         add           a2, a2, w8
-         add           a3, a3, w9
-CPU_BE(          rev           a0, a0          )
-CPU_BE(          rev           a1, a1          )
-CPU_BE(          rev           a2, a2          )
-CPU_BE(          rev           a3, a3          )
-
-       ld4r            {v24.4s-v27.4s}, [x0], #16
-       ld4r            {v28.4s-v31.4s}, [x0]
-
-       // x4[0-3] += s1[0]
-       // x5[0-3] += s1[1]
-       // x6[0-3] += s1[2]
-       // x7[0-3] += s1[3]
-       add             v4.4s, v4.4s, v20.4s
-         mov           w6, v20.s[0]
-         mov           w7, v21.s[0]
-       add             v5.4s, v5.4s, v21.4s
-         mov           w8, v22.s[0]
-         mov           w9, v23.s[0]
-       add             v6.4s, v6.4s, v22.4s
-         add           a4, a4, w6
-         add           a5, a5, w7
-       add             v7.4s, v7.4s, v23.4s
-         add           a6, a6, w8
-         add           a7, a7, w9
-CPU_BE(          rev           a4, a4          )
-CPU_BE(          rev           a5, a5          )
-CPU_BE(          rev           a6, a6          )
-CPU_BE(          rev           a7, a7          )
-
-       // x8[0-3] += s2[0]
-       // x9[0-3] += s2[1]
-       // x10[0-3] += s2[2]
-       // x11[0-3] += s2[3]
-       add             v8.4s, v8.4s, v24.4s
-         mov           w6, v24.s[0]
-         mov           w7, v25.s[0]
-       add             v9.4s, v9.4s, v25.4s
-         mov           w8, v26.s[0]
-         mov           w9, v27.s[0]
-       add             v10.4s, v10.4s, v26.4s
-         add           a8, a8, w6
-         add           a9, a9, w7
-       add             v11.4s, v11.4s, v27.4s
-         add           a10, a10, w8
-         add           a11, a11, w9
-CPU_BE(          rev           a8, a8          )
-CPU_BE(          rev           a9, a9          )
-CPU_BE(          rev           a10, a10        )
-CPU_BE(          rev           a11, a11        )
-
-       // x12[0-3] += s3[0]
-       // x13[0-3] += s3[1]
-       // x14[0-3] += s3[2]
-       // x15[0-3] += s3[3]
-       add             v12.4s, v12.4s, v28.4s
-         mov           w6, v28.s[0]
-         mov           w7, v29.s[0]
-       add             v13.4s, v13.4s, v29.4s
-         mov           w8, v30.s[0]
-         mov           w9, v31.s[0]
-       add             v14.4s, v14.4s, v30.4s
-         add           a12, a12, w6
-         add           a13, a13, w7
-       add             v15.4s, v15.4s, v31.4s
-         add           a14, a14, w8
-         add           a15, a15, w9
-CPU_BE(          rev           a12, a12        )
-CPU_BE(          rev           a13, a13        )
-CPU_BE(          rev           a14, a14        )
-CPU_BE(          rev           a15, a15        )
-
-       // interleave 32-bit words in state n, n+1
-         ldp           w6, w7, [x2], #64
-       zip1            v16.4s, v0.4s, v1.4s
-         ldp           w8, w9, [x2, #-56]
-         eor           a0, a0, w6
-       zip2            v17.4s, v0.4s, v1.4s
-         eor           a1, a1, w7
-       zip1            v18.4s, v2.4s, v3.4s
-         eor           a2, a2, w8
-       zip2            v19.4s, v2.4s, v3.4s
-         eor           a3, a3, w9
-         ldp           w6, w7, [x2, #-48]
-       zip1            v20.4s, v4.4s, v5.4s
-         ldp           w8, w9, [x2, #-40]
-         eor           a4, a4, w6
-       zip2            v21.4s, v4.4s, v5.4s
-         eor           a5, a5, w7
-       zip1            v22.4s, v6.4s, v7.4s
-         eor           a6, a6, w8
-       zip2            v23.4s, v6.4s, v7.4s
-         eor           a7, a7, w9
-         ldp           w6, w7, [x2, #-32]
-       zip1            v24.4s, v8.4s, v9.4s
-         ldp           w8, w9, [x2, #-24]
-         eor           a8, a8, w6
-       zip2            v25.4s, v8.4s, v9.4s
-         eor           a9, a9, w7
-       zip1            v26.4s, v10.4s, v11.4s
-         eor           a10, a10, w8
-       zip2            v27.4s, v10.4s, v11.4s
-         eor           a11, a11, w9
-         ldp           w6, w7, [x2, #-16]
-       zip1            v28.4s, v12.4s, v13.4s
-         ldp           w8, w9, [x2, #-8]
-         eor           a12, a12, w6
-       zip2            v29.4s, v12.4s, v13.4s
-         eor           a13, a13, w7
-       zip1            v30.4s, v14.4s, v15.4s
-         eor           a14, a14, w8
-       zip2            v31.4s, v14.4s, v15.4s
-         eor           a15, a15, w9
-
-       add             x3, x2, x4
-       sub             x3, x3, #128            // start of last block
-
-       subs            x5, x4, #128
-       csel            x2, x2, x3, ge
-
-       // interleave 64-bit words in state n, n+2
-       zip1            v0.2d, v16.2d, v18.2d
-       zip2            v4.2d, v16.2d, v18.2d
-         stp           a0, a1, [x1], #64
-       zip1            v8.2d, v17.2d, v19.2d
-       zip2            v12.2d, v17.2d, v19.2d
-         stp           a2, a3, [x1, #-56]
-
-       subs            x6, x4, #192
-       ld1             {v16.16b-v19.16b}, [x2], #64
-       csel            x2, x2, x3, ge
-
-       zip1            v1.2d, v20.2d, v22.2d
-       zip2            v5.2d, v20.2d, v22.2d
-         stp           a4, a5, [x1, #-48]
-       zip1            v9.2d, v21.2d, v23.2d
-       zip2            v13.2d, v21.2d, v23.2d
-         stp           a6, a7, [x1, #-40]
-
-       subs            x7, x4, #256
-       ld1             {v20.16b-v23.16b}, [x2], #64
-       csel            x2, x2, x3, ge
-
-       zip1            v2.2d, v24.2d, v26.2d
-       zip2            v6.2d, v24.2d, v26.2d
-         stp           a8, a9, [x1, #-32]
-       zip1            v10.2d, v25.2d, v27.2d
-       zip2            v14.2d, v25.2d, v27.2d
-         stp           a10, a11, [x1, #-24]
-
-       subs            x8, x4, #320
-       ld1             {v24.16b-v27.16b}, [x2], #64
-       csel            x2, x2, x3, ge
-
-       zip1            v3.2d, v28.2d, v30.2d
-       zip2            v7.2d, v28.2d, v30.2d
-         stp           a12, a13, [x1, #-16]
-       zip1            v11.2d, v29.2d, v31.2d
-       zip2            v15.2d, v29.2d, v31.2d
-         stp           a14, a15, [x1, #-8]
-
-       tbnz            x5, #63, .Lt128
-       ld1             {v28.16b-v31.16b}, [x2]
-
-       // xor with corresponding input, write to output
-       eor             v16.16b, v16.16b, v0.16b
-       eor             v17.16b, v17.16b, v1.16b
-       eor             v18.16b, v18.16b, v2.16b
-       eor             v19.16b, v19.16b, v3.16b
-
-       tbnz            x6, #63, .Lt192
-
-       eor             v20.16b, v20.16b, v4.16b
-       eor             v21.16b, v21.16b, v5.16b
-       eor             v22.16b, v22.16b, v6.16b
-       eor             v23.16b, v23.16b, v7.16b
-
-       st1             {v16.16b-v19.16b}, [x1], #64
-       tbnz            x7, #63, .Lt256
-
-       eor             v24.16b, v24.16b, v8.16b
-       eor             v25.16b, v25.16b, v9.16b
-       eor             v26.16b, v26.16b, v10.16b
-       eor             v27.16b, v27.16b, v11.16b
-
-       st1             {v20.16b-v23.16b}, [x1], #64
-       tbnz            x8, #63, .Lt320
-
-       eor             v28.16b, v28.16b, v12.16b
-       eor             v29.16b, v29.16b, v13.16b
-       eor             v30.16b, v30.16b, v14.16b
-       eor             v31.16b, v31.16b, v15.16b
-
-       st1             {v24.16b-v27.16b}, [x1], #64
-       st1             {v28.16b-v31.16b}, [x1]
-
-.Lout: frame_pop
-       ret
-
-       // fewer than 192 bytes of in/output
-.Lt192:        cbz             x5, 1f                          // exactly 128 bytes?
-       ld1             {v28.16b-v31.16b}, [x10]
-       add             x5, x5, x1
-       tbl             v28.16b, {v4.16b-v7.16b}, v28.16b
-       tbl             v29.16b, {v4.16b-v7.16b}, v29.16b
-       tbl             v30.16b, {v4.16b-v7.16b}, v30.16b
-       tbl             v31.16b, {v4.16b-v7.16b}, v31.16b
-
-0:     eor             v20.16b, v20.16b, v28.16b
-       eor             v21.16b, v21.16b, v29.16b
-       eor             v22.16b, v22.16b, v30.16b
-       eor             v23.16b, v23.16b, v31.16b
-       st1             {v20.16b-v23.16b}, [x5]         // overlapping stores
-1:     st1             {v16.16b-v19.16b}, [x1]
-       b               .Lout
-
-       // fewer than 128 bytes of in/output
-.Lt128:        ld1             {v28.16b-v31.16b}, [x10]
-       add             x5, x5, x1
-       sub             x1, x1, #64
-       tbl             v28.16b, {v0.16b-v3.16b}, v28.16b
-       tbl             v29.16b, {v0.16b-v3.16b}, v29.16b
-       tbl             v30.16b, {v0.16b-v3.16b}, v30.16b
-       tbl             v31.16b, {v0.16b-v3.16b}, v31.16b
-       ld1             {v16.16b-v19.16b}, [x1]         // reload first output block
-       b               0b
-
-       // fewer than 256 bytes of in/output
-.Lt256:        cbz             x6, 2f                          // exactly 192 bytes?
-       ld1             {v4.16b-v7.16b}, [x10]
-       add             x6, x6, x1
-       tbl             v0.16b, {v8.16b-v11.16b}, v4.16b
-       tbl             v1.16b, {v8.16b-v11.16b}, v5.16b
-       tbl             v2.16b, {v8.16b-v11.16b}, v6.16b
-       tbl             v3.16b, {v8.16b-v11.16b}, v7.16b
-
-       eor             v28.16b, v28.16b, v0.16b
-       eor             v29.16b, v29.16b, v1.16b
-       eor             v30.16b, v30.16b, v2.16b
-       eor             v31.16b, v31.16b, v3.16b
-       st1             {v28.16b-v31.16b}, [x6]         // overlapping stores
-2:     st1             {v20.16b-v23.16b}, [x1]
-       b               .Lout
-
-       // fewer than 320 bytes of in/output
-.Lt320:        cbz             x7, 3f                          // exactly 256 bytes?
-       ld1             {v4.16b-v7.16b}, [x10]
-       add             x7, x7, x1
-       tbl             v0.16b, {v12.16b-v15.16b}, v4.16b
-       tbl             v1.16b, {v12.16b-v15.16b}, v5.16b
-       tbl             v2.16b, {v12.16b-v15.16b}, v6.16b
-       tbl             v3.16b, {v12.16b-v15.16b}, v7.16b
-
-       eor             v28.16b, v28.16b, v0.16b
-       eor             v29.16b, v29.16b, v1.16b
-       eor             v30.16b, v30.16b, v2.16b
-       eor             v31.16b, v31.16b, v3.16b
-       st1             {v28.16b-v31.16b}, [x7]         // overlapping stores
-3:     st1             {v24.16b-v27.16b}, [x1]
-       b               .Lout
-SYM_FUNC_END(chacha_4block_xor_neon)
-
-       .section        ".rodata", "a", %progbits
-       .align          L1_CACHE_SHIFT
-.Lpermute:
-       .set            .Li, 0
-       .rept           128
-       .byte           (.Li - 64)
-       .set            .Li, .Li + 1
-       .endr
-
-CTRINC:        .word           1, 2, 3, 4
-ROT8:  .word           0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/arch/arm64/lib/crypto/chacha-neon-glue.c b/arch/arm64/lib/crypto/chacha-neon-glue.c
deleted file mode 100644 (file)
index d0188f9..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * ChaCha and HChaCha functions (ARM64 optimized)
- *
- * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
-                                     u8 *dst, const u8 *src, int nrounds);
-asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
-                                      u8 *dst, const u8 *src,
-                                      int nrounds, int bytes);
-asmlinkage void hchacha_block_neon(const struct chacha_state *state,
-                                  u32 out[HCHACHA_OUT_WORDS], int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
-                         int bytes, int nrounds)
-{
-       while (bytes > 0) {
-               int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
-
-               if (l <= CHACHA_BLOCK_SIZE) {
-                       u8 buf[CHACHA_BLOCK_SIZE];
-
-                       memcpy(buf, src, l);
-                       chacha_block_xor_neon(state, buf, buf, nrounds);
-                       memcpy(dst, buf, l);
-                       state->x[12] += 1;
-                       break;
-               }
-               chacha_4block_xor_neon(state, dst, src, nrounds, l);
-               bytes -= l;
-               src += l;
-               dst += l;
-               state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
-       }
-}
-
-void hchacha_block_arch(const struct chacha_state *state,
-                       u32 out[HCHACHA_OUT_WORDS], int nrounds)
-{
-       if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
-               hchacha_block_generic(state, out, nrounds);
-       } else {
-               kernel_neon_begin();
-               hchacha_block_neon(state, out, nrounds);
-               kernel_neon_end();
-       }
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
-                      unsigned int bytes, int nrounds)
-{
-       if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
-           !crypto_simd_usable())
-               return chacha_crypt_generic(state, dst, src, bytes, nrounds);
-
-       do {
-               unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
-               kernel_neon_begin();
-               chacha_doneon(state, dst, src, todo, nrounds);
-               kernel_neon_end();
-
-               bytes -= todo;
-               src += todo;
-               dst += todo;
-       } while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
-       return static_key_enabled(&have_neon);
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_simd_mod_init(void)
-{
-       if (cpu_have_named_feature(ASIMD))
-               static_branch_enable(&have_neon);
-       return 0;
-}
-subsys_initcall(chacha_simd_mod_init);
-
-static void __exit chacha_simd_mod_exit(void)
-{
-}
-module_exit(chacha_simd_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm64/lib/crypto/poly1305-armv8.pl b/arch/arm64/lib/crypto/poly1305-armv8.pl
deleted file mode 100644 (file)
index 22c9069..0000000
+++ /dev/null
@@ -1,917 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-#
-# ====================================================================
-# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-# project.
-# ====================================================================
-#
-# This module implements Poly1305 hash for ARMv8.
-#
-# June 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone.
-#
-#              IALU/gcc-4.9    NEON
-#
-# Apple A7     1.86/+5%        0.72
-# Cortex-A53   2.69/+58%       1.47
-# Cortex-A57   2.70/+7%        1.14
-# Denver       1.64/+50%       1.18(*)
-# X-Gene       2.13/+68%       2.27
-# Mongoose     1.77/+75%       1.12
-# Kryo         2.70/+55%       1.13
-# ThunderX2    1.17/+95%       1.36
-#
-# (*)  estimate based on resources availability is less than 1.0,
-#      i.e. measured result is worse than expected, presumably binary
-#      translator is not almighty;
-
-$flavour=shift;
-$output=shift;
-
-if ($flavour && $flavour ne "void") {
-    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
-    die "can't locate arm-xlate.pl";
-
-    open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
-    open STDOUT,">$output";
-}
-
-my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
-my ($mac,$nonce)=($inp,$len);
-
-my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
-
-$code.=<<___;
-#ifndef __KERNEL__
-# include "arm_arch.h"
-.extern        OPENSSL_armcap_P
-#endif
-
-.text
-
-// forward "declarations" are required for Apple
-.globl poly1305_blocks
-.globl poly1305_emit
-
-.globl poly1305_init
-.type  poly1305_init,%function
-.align 5
-poly1305_init:
-       cmp     $inp,xzr
-       stp     xzr,xzr,[$ctx]          // zero hash value
-       stp     xzr,xzr,[$ctx,#16]      // [along with is_base2_26]
-
-       csel    x0,xzr,x0,eq
-       b.eq    .Lno_key
-
-#ifndef        __KERNEL__
-       adrp    x17,OPENSSL_armcap_P
-       ldr     w17,[x17,#:lo12:OPENSSL_armcap_P]
-#endif
-
-       ldp     $r0,$r1,[$inp]          // load key
-       mov     $s1,#0xfffffffc0fffffff
-       movk    $s1,#0x0fff,lsl#48
-#ifdef __AARCH64EB__
-       rev     $r0,$r0                 // flip bytes
-       rev     $r1,$r1
-#endif
-       and     $r0,$r0,$s1             // &=0ffffffc0fffffff
-       and     $s1,$s1,#-4
-       and     $r1,$r1,$s1             // &=0ffffffc0ffffffc
-       mov     w#$s1,#-1
-       stp     $r0,$r1,[$ctx,#32]      // save key value
-       str     w#$s1,[$ctx,#48]        // impossible key power value
-
-#ifndef        __KERNEL__
-       tst     w17,#ARMV7_NEON
-
-       adr     $d0,.Lpoly1305_blocks
-       adr     $r0,.Lpoly1305_blocks_neon
-       adr     $d1,.Lpoly1305_emit
-
-       csel    $d0,$d0,$r0,eq
-
-# ifdef        __ILP32__
-       stp     w#$d0,w#$d1,[$len]
-# else
-       stp     $d0,$d1,[$len]
-# endif
-#endif
-       mov     x0,#1
-.Lno_key:
-       ret
-.size  poly1305_init,.-poly1305_init
-
-.type  poly1305_blocks,%function
-.align 5
-poly1305_blocks:
-.Lpoly1305_blocks:
-       ands    $len,$len,#-16
-       b.eq    .Lno_data
-
-       ldp     $h0,$h1,[$ctx]          // load hash value
-       ldp     $h2,x17,[$ctx,#16]      // [along with is_base2_26]
-       ldp     $r0,$r1,[$ctx,#32]      // load key value
-
-#ifdef __AARCH64EB__
-       lsr     $d0,$h0,#32
-       mov     w#$d1,w#$h0
-       lsr     $d2,$h1,#32
-       mov     w15,w#$h1
-       lsr     x16,$h2,#32
-#else
-       mov     w#$d0,w#$h0
-       lsr     $d1,$h0,#32
-       mov     w#$d2,w#$h1
-       lsr     x15,$h1,#32
-       mov     w16,w#$h2
-#endif
-
-       add     $d0,$d0,$d1,lsl#26      // base 2^26 -> base 2^64
-       lsr     $d1,$d2,#12
-       adds    $d0,$d0,$d2,lsl#52
-       add     $d1,$d1,x15,lsl#14
-       adc     $d1,$d1,xzr
-       lsr     $d2,x16,#24
-       adds    $d1,$d1,x16,lsl#40
-       adc     $d2,$d2,xzr
-
-       cmp     x17,#0                  // is_base2_26?
-       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
-       csel    $h0,$h0,$d0,eq          // choose between radixes
-       csel    $h1,$h1,$d1,eq
-       csel    $h2,$h2,$d2,eq
-
-.Loop:
-       ldp     $t0,$t1,[$inp],#16      // load input
-       sub     $len,$len,#16
-#ifdef __AARCH64EB__
-       rev     $t0,$t0
-       rev     $t1,$t1
-#endif
-       adds    $h0,$h0,$t0             // accumulate input
-       adcs    $h1,$h1,$t1
-
-       mul     $d0,$h0,$r0             // h0*r0
-       adc     $h2,$h2,$padbit
-       umulh   $d1,$h0,$r0
-
-       mul     $t0,$h1,$s1             // h1*5*r1
-       umulh   $t1,$h1,$s1
-
-       adds    $d0,$d0,$t0
-       mul     $t0,$h0,$r1             // h0*r1
-       adc     $d1,$d1,$t1
-       umulh   $d2,$h0,$r1
-
-       adds    $d1,$d1,$t0
-       mul     $t0,$h1,$r0             // h1*r0
-       adc     $d2,$d2,xzr
-       umulh   $t1,$h1,$r0
-
-       adds    $d1,$d1,$t0
-       mul     $t0,$h2,$s1             // h2*5*r1
-       adc     $d2,$d2,$t1
-       mul     $t1,$h2,$r0             // h2*r0
-
-       adds    $d1,$d1,$t0
-       adc     $d2,$d2,$t1
-
-       and     $t0,$d2,#-4             // final reduction
-       and     $h2,$d2,#3
-       add     $t0,$t0,$d2,lsr#2
-       adds    $h0,$d0,$t0
-       adcs    $h1,$d1,xzr
-       adc     $h2,$h2,xzr
-
-       cbnz    $len,.Loop
-
-       stp     $h0,$h1,[$ctx]          // store hash value
-       stp     $h2,xzr,[$ctx,#16]      // [and clear is_base2_26]
-
-.Lno_data:
-       ret
-.size  poly1305_blocks,.-poly1305_blocks
-
-.type  poly1305_emit,%function
-.align 5
-poly1305_emit:
-.Lpoly1305_emit:
-       ldp     $h0,$h1,[$ctx]          // load hash base 2^64
-       ldp     $h2,$r0,[$ctx,#16]      // [along with is_base2_26]
-       ldp     $t0,$t1,[$nonce]        // load nonce
-
-#ifdef __AARCH64EB__
-       lsr     $d0,$h0,#32
-       mov     w#$d1,w#$h0
-       lsr     $d2,$h1,#32
-       mov     w15,w#$h1
-       lsr     x16,$h2,#32
-#else
-       mov     w#$d0,w#$h0
-       lsr     $d1,$h0,#32
-       mov     w#$d2,w#$h1
-       lsr     x15,$h1,#32
-       mov     w16,w#$h2
-#endif
-
-       add     $d0,$d0,$d1,lsl#26      // base 2^26 -> base 2^64
-       lsr     $d1,$d2,#12
-       adds    $d0,$d0,$d2,lsl#52
-       add     $d1,$d1,x15,lsl#14
-       adc     $d1,$d1,xzr
-       lsr     $d2,x16,#24
-       adds    $d1,$d1,x16,lsl#40
-       adc     $d2,$d2,xzr
-
-       cmp     $r0,#0                  // is_base2_26?
-       csel    $h0,$h0,$d0,eq          // choose between radixes
-       csel    $h1,$h1,$d1,eq
-       csel    $h2,$h2,$d2,eq
-
-       adds    $d0,$h0,#5              // compare to modulus
-       adcs    $d1,$h1,xzr
-       adc     $d2,$h2,xzr
-
-       tst     $d2,#-4                 // see if it's carried/borrowed
-
-       csel    $h0,$h0,$d0,eq
-       csel    $h1,$h1,$d1,eq
-
-#ifdef __AARCH64EB__
-       ror     $t0,$t0,#32             // flip nonce words
-       ror     $t1,$t1,#32
-#endif
-       adds    $h0,$h0,$t0             // accumulate nonce
-       adc     $h1,$h1,$t1
-#ifdef __AARCH64EB__
-       rev     $h0,$h0                 // flip output bytes
-       rev     $h1,$h1
-#endif
-       stp     $h0,$h1,[$mac]          // write result
-
-       ret
-.size  poly1305_emit,.-poly1305_emit
-___
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
-my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
-my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
-my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
-my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
-my ($T0,$T1,$MASK) = map("v$_",(29..31));
-
-my ($in2,$zeros)=("x16","x17");
-my $is_base2_26 = $zeros;              # borrow
-
-$code.=<<___;
-.type  poly1305_mult,%function
-.align 5
-poly1305_mult:
-       mul     $d0,$h0,$r0             // h0*r0
-       umulh   $d1,$h0,$r0
-
-       mul     $t0,$h1,$s1             // h1*5*r1
-       umulh   $t1,$h1,$s1
-
-       adds    $d0,$d0,$t0
-       mul     $t0,$h0,$r1             // h0*r1
-       adc     $d1,$d1,$t1
-       umulh   $d2,$h0,$r1
-
-       adds    $d1,$d1,$t0
-       mul     $t0,$h1,$r0             // h1*r0
-       adc     $d2,$d2,xzr
-       umulh   $t1,$h1,$r0
-
-       adds    $d1,$d1,$t0
-       mul     $t0,$h2,$s1             // h2*5*r1
-       adc     $d2,$d2,$t1
-       mul     $t1,$h2,$r0             // h2*r0
-
-       adds    $d1,$d1,$t0
-       adc     $d2,$d2,$t1
-
-       and     $t0,$d2,#-4             // final reduction
-       and     $h2,$d2,#3
-       add     $t0,$t0,$d2,lsr#2
-       adds    $h0,$d0,$t0
-       adcs    $h1,$d1,xzr
-       adc     $h2,$h2,xzr
-
-       ret
-.size  poly1305_mult,.-poly1305_mult
-
-.type  poly1305_splat,%function
-.align 4
-poly1305_splat:
-       and     x12,$h0,#0x03ffffff     // base 2^64 -> base 2^26
-       ubfx    x13,$h0,#26,#26
-       extr    x14,$h1,$h0,#52
-       and     x14,x14,#0x03ffffff
-       ubfx    x15,$h1,#14,#26
-       extr    x16,$h2,$h1,#40
-
-       str     w12,[$ctx,#16*0]        // r0
-       add     w12,w13,w13,lsl#2       // r1*5
-       str     w13,[$ctx,#16*1]        // r1
-       add     w13,w14,w14,lsl#2       // r2*5
-       str     w12,[$ctx,#16*2]        // s1
-       str     w14,[$ctx,#16*3]        // r2
-       add     w14,w15,w15,lsl#2       // r3*5
-       str     w13,[$ctx,#16*4]        // s2
-       str     w15,[$ctx,#16*5]        // r3
-       add     w15,w16,w16,lsl#2       // r4*5
-       str     w14,[$ctx,#16*6]        // s3
-       str     w16,[$ctx,#16*7]        // r4
-       str     w15,[$ctx,#16*8]        // s4
-
-       ret
-.size  poly1305_splat,.-poly1305_splat
-
-#ifdef __KERNEL__
-.globl poly1305_blocks_neon
-#endif
-.type  poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
-       ldr     $is_base2_26,[$ctx,#24]
-       cmp     $len,#128
-       b.lo    .Lpoly1305_blocks
-
-       .inst   0xd503233f              // paciasp
-       stp     x29,x30,[sp,#-80]!
-       add     x29,sp,#0
-
-       stp     d8,d9,[sp,#16]          // meet ABI requirements
-       stp     d10,d11,[sp,#32]
-       stp     d12,d13,[sp,#48]
-       stp     d14,d15,[sp,#64]
-
-       cbz     $is_base2_26,.Lbase2_64_neon
-
-       ldp     w10,w11,[$ctx]          // load hash value base 2^26
-       ldp     w12,w13,[$ctx,#8]
-       ldr     w14,[$ctx,#16]
-
-       tst     $len,#31
-       b.eq    .Leven_neon
-
-       ldp     $r0,$r1,[$ctx,#32]      // load key value
-
-       add     $h0,x10,x11,lsl#26      // base 2^26 -> base 2^64
-       lsr     $h1,x12,#12
-       adds    $h0,$h0,x12,lsl#52
-       add     $h1,$h1,x13,lsl#14
-       adc     $h1,$h1,xzr
-       lsr     $h2,x14,#24
-       adds    $h1,$h1,x14,lsl#40
-       adc     $d2,$h2,xzr             // can be partially reduced...
-
-       ldp     $d0,$d1,[$inp],#16      // load input
-       sub     $len,$len,#16
-       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
-
-#ifdef __AARCH64EB__
-       rev     $d0,$d0
-       rev     $d1,$d1
-#endif
-       adds    $h0,$h0,$d0             // accumulate input
-       adcs    $h1,$h1,$d1
-       adc     $h2,$h2,$padbit
-
-       bl      poly1305_mult
-
-       and     x10,$h0,#0x03ffffff     // base 2^64 -> base 2^26
-       ubfx    x11,$h0,#26,#26
-       extr    x12,$h1,$h0,#52
-       and     x12,x12,#0x03ffffff
-       ubfx    x13,$h1,#14,#26
-       extr    x14,$h2,$h1,#40
-
-       b       .Leven_neon
-
-.align 4
-.Lbase2_64_neon:
-       ldp     $r0,$r1,[$ctx,#32]      // load key value
-
-       ldp     $h0,$h1,[$ctx]          // load hash value base 2^64
-       ldr     $h2,[$ctx,#16]
-
-       tst     $len,#31
-       b.eq    .Linit_neon
-
-       ldp     $d0,$d1,[$inp],#16      // load input
-       sub     $len,$len,#16
-       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
-#ifdef __AARCH64EB__
-       rev     $d0,$d0
-       rev     $d1,$d1
-#endif
-       adds    $h0,$h0,$d0             // accumulate input
-       adcs    $h1,$h1,$d1
-       adc     $h2,$h2,$padbit
-
-       bl      poly1305_mult
-
-.Linit_neon:
-       ldr     w17,[$ctx,#48]          // first table element
-       and     x10,$h0,#0x03ffffff     // base 2^64 -> base 2^26
-       ubfx    x11,$h0,#26,#26
-       extr    x12,$h1,$h0,#52
-       and     x12,x12,#0x03ffffff
-       ubfx    x13,$h1,#14,#26
-       extr    x14,$h2,$h1,#40
-
-       cmp     w17,#-1                 // is value impossible?
-       b.ne    .Leven_neon
-
-       fmov    ${H0},x10
-       fmov    ${H1},x11
-       fmov    ${H2},x12
-       fmov    ${H3},x13
-       fmov    ${H4},x14
-
-       ////////////////////////////////// initialize r^n table
-       mov     $h0,$r0                 // r^1
-       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
-       mov     $h1,$r1
-       mov     $h2,xzr
-       add     $ctx,$ctx,#48+12
-       bl      poly1305_splat
-
-       bl      poly1305_mult           // r^2
-       sub     $ctx,$ctx,#4
-       bl      poly1305_splat
-
-       bl      poly1305_mult           // r^3
-       sub     $ctx,$ctx,#4
-       bl      poly1305_splat
-
-       bl      poly1305_mult           // r^4
-       sub     $ctx,$ctx,#4
-       bl      poly1305_splat
-       sub     $ctx,$ctx,#48           // restore original $ctx
-       b       .Ldo_neon
-
-.align 4
-.Leven_neon:
-       fmov    ${H0},x10
-       fmov    ${H1},x11
-       fmov    ${H2},x12
-       fmov    ${H3},x13
-       fmov    ${H4},x14
-
-.Ldo_neon:
-       ldp     x8,x12,[$inp,#32]       // inp[2:3]
-       subs    $len,$len,#64
-       ldp     x9,x13,[$inp,#48]
-       add     $in2,$inp,#96
-       adrp    $zeros,.Lzeros
-       add     $zeros,$zeros,#:lo12:.Lzeros
-
-       lsl     $padbit,$padbit,#24
-       add     x15,$ctx,#48
-
-#ifdef __AARCH64EB__
-       rev     x8,x8
-       rev     x12,x12
-       rev     x9,x9
-       rev     x13,x13
-#endif
-       and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
-       and     x5,x9,#0x03ffffff
-       ubfx    x6,x8,#26,#26
-       ubfx    x7,x9,#26,#26
-       add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
-       extr    x8,x12,x8,#52
-       extr    x9,x13,x9,#52
-       add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
-       fmov    $IN23_0,x4
-       and     x8,x8,#0x03ffffff
-       and     x9,x9,#0x03ffffff
-       ubfx    x10,x12,#14,#26
-       ubfx    x11,x13,#14,#26
-       add     x12,$padbit,x12,lsr#40
-       add     x13,$padbit,x13,lsr#40
-       add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
-       fmov    $IN23_1,x6
-       add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
-       add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
-       fmov    $IN23_2,x8
-       fmov    $IN23_3,x10
-       fmov    $IN23_4,x12
-
-       ldp     x8,x12,[$inp],#16       // inp[0:1]
-       ldp     x9,x13,[$inp],#48
-
-       ld1     {$R0,$R1,$S1,$R2},[x15],#64
-       ld1     {$S2,$R3,$S3,$R4},[x15],#64
-       ld1     {$S4},[x15]
-
-#ifdef __AARCH64EB__
-       rev     x8,x8
-       rev     x12,x12
-       rev     x9,x9
-       rev     x13,x13
-#endif
-       and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
-       and     x5,x9,#0x03ffffff
-       ubfx    x6,x8,#26,#26
-       ubfx    x7,x9,#26,#26
-       add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
-       extr    x8,x12,x8,#52
-       extr    x9,x13,x9,#52
-       add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
-       fmov    $IN01_0,x4
-       and     x8,x8,#0x03ffffff
-       and     x9,x9,#0x03ffffff
-       ubfx    x10,x12,#14,#26
-       ubfx    x11,x13,#14,#26
-       add     x12,$padbit,x12,lsr#40
-       add     x13,$padbit,x13,lsr#40
-       add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
-       fmov    $IN01_1,x6
-       add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
-       add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
-       movi    $MASK.2d,#-1
-       fmov    $IN01_2,x8
-       fmov    $IN01_3,x10
-       fmov    $IN01_4,x12
-       ushr    $MASK.2d,$MASK.2d,#38
-
-       b.ls    .Lskip_loop
-
-.align 4
-.Loop_neon:
-       ////////////////////////////////////////////////////////////////
-       // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
-       // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
-       //   \___________________/
-       // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
-       // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
-       //   \___________________/ \____________________/
-       //
-       // Note that we start with inp[2:3]*r^2. This is because it
-       // doesn't depend on reduction in previous iteration.
-       ////////////////////////////////////////////////////////////////
-       // d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
-       // d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
-       // d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
-       // d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
-       // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
-       subs    $len,$len,#64
-       umull   $ACC4,$IN23_0,${R4}[2]
-       csel    $in2,$zeros,$in2,lo
-       umull   $ACC3,$IN23_0,${R3}[2]
-       umull   $ACC2,$IN23_0,${R2}[2]
-        ldp    x8,x12,[$in2],#16       // inp[2:3] (or zero)
-       umull   $ACC1,$IN23_0,${R1}[2]
-        ldp    x9,x13,[$in2],#48
-       umull   $ACC0,$IN23_0,${R0}[2]
-#ifdef __AARCH64EB__
-        rev    x8,x8
-        rev    x12,x12
-        rev    x9,x9
-        rev    x13,x13
-#endif
-
-       umlal   $ACC4,$IN23_1,${R3}[2]
-        and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
-       umlal   $ACC3,$IN23_1,${R2}[2]
-        and    x5,x9,#0x03ffffff
-       umlal   $ACC2,$IN23_1,${R1}[2]
-        ubfx   x6,x8,#26,#26
-       umlal   $ACC1,$IN23_1,${R0}[2]
-        ubfx   x7,x9,#26,#26
-       umlal   $ACC0,$IN23_1,${S4}[2]
-        add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
-
-       umlal   $ACC4,$IN23_2,${R2}[2]
-        extr   x8,x12,x8,#52
-       umlal   $ACC3,$IN23_2,${R1}[2]
-        extr   x9,x13,x9,#52
-       umlal   $ACC2,$IN23_2,${R0}[2]
-        add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
-       umlal   $ACC1,$IN23_2,${S4}[2]
-        fmov   $IN23_0,x4
-       umlal   $ACC0,$IN23_2,${S3}[2]
-        and    x8,x8,#0x03ffffff
-
-       umlal   $ACC4,$IN23_3,${R1}[2]
-        and    x9,x9,#0x03ffffff
-       umlal   $ACC3,$IN23_3,${R0}[2]
-        ubfx   x10,x12,#14,#26
-       umlal   $ACC2,$IN23_3,${S4}[2]
-        ubfx   x11,x13,#14,#26
-       umlal   $ACC1,$IN23_3,${S3}[2]
-        add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
-       umlal   $ACC0,$IN23_3,${S2}[2]
-        fmov   $IN23_1,x6
-
-       add     $IN01_2,$IN01_2,$H2
-        add    x12,$padbit,x12,lsr#40
-       umlal   $ACC4,$IN23_4,${R0}[2]
-        add    x13,$padbit,x13,lsr#40
-       umlal   $ACC3,$IN23_4,${S4}[2]
-        add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
-       umlal   $ACC2,$IN23_4,${S3}[2]
-        add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
-       umlal   $ACC1,$IN23_4,${S2}[2]
-        fmov   $IN23_2,x8
-       umlal   $ACC0,$IN23_4,${S1}[2]
-        fmov   $IN23_3,x10
-
-       ////////////////////////////////////////////////////////////////
-       // (hash+inp[0:1])*r^4 and accumulate
-
-       add     $IN01_0,$IN01_0,$H0
-        fmov   $IN23_4,x12
-       umlal   $ACC3,$IN01_2,${R1}[0]
-        ldp    x8,x12,[$inp],#16       // inp[0:1]
-       umlal   $ACC0,$IN01_2,${S3}[0]
-        ldp    x9,x13,[$inp],#48
-       umlal   $ACC4,$IN01_2,${R2}[0]
-       umlal   $ACC1,$IN01_2,${S4}[0]
-       umlal   $ACC2,$IN01_2,${R0}[0]
-#ifdef __AARCH64EB__
-        rev    x8,x8
-        rev    x12,x12
-        rev    x9,x9
-        rev    x13,x13
-#endif
-
-       add     $IN01_1,$IN01_1,$H1
-       umlal   $ACC3,$IN01_0,${R3}[0]
-       umlal   $ACC4,$IN01_0,${R4}[0]
-        and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
-       umlal   $ACC2,$IN01_0,${R2}[0]
-        and    x5,x9,#0x03ffffff
-       umlal   $ACC0,$IN01_0,${R0}[0]
-        ubfx   x6,x8,#26,#26
-       umlal   $ACC1,$IN01_0,${R1}[0]
-        ubfx   x7,x9,#26,#26
-
-       add     $IN01_3,$IN01_3,$H3
-        add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
-       umlal   $ACC3,$IN01_1,${R2}[0]
-        extr   x8,x12,x8,#52
-       umlal   $ACC4,$IN01_1,${R3}[0]
-        extr   x9,x13,x9,#52
-       umlal   $ACC0,$IN01_1,${S4}[0]
-        add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
-       umlal   $ACC2,$IN01_1,${R1}[0]
-        fmov   $IN01_0,x4
-       umlal   $ACC1,$IN01_1,${R0}[0]
-        and    x8,x8,#0x03ffffff
-
-       add     $IN01_4,$IN01_4,$H4
-        and    x9,x9,#0x03ffffff
-       umlal   $ACC3,$IN01_3,${R0}[0]
-        ubfx   x10,x12,#14,#26
-       umlal   $ACC0,$IN01_3,${S2}[0]
-        ubfx   x11,x13,#14,#26
-       umlal   $ACC4,$IN01_3,${R1}[0]
-        add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
-       umlal   $ACC1,$IN01_3,${S3}[0]
-        fmov   $IN01_1,x6
-       umlal   $ACC2,$IN01_3,${S4}[0]
-        add    x12,$padbit,x12,lsr#40
-
-       umlal   $ACC3,$IN01_4,${S4}[0]
-        add    x13,$padbit,x13,lsr#40
-       umlal   $ACC0,$IN01_4,${S1}[0]
-        add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
-       umlal   $ACC4,$IN01_4,${R0}[0]
-        add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
-       umlal   $ACC1,$IN01_4,${S2}[0]
-        fmov   $IN01_2,x8
-       umlal   $ACC2,$IN01_4,${S3}[0]
-        fmov   $IN01_3,x10
-        fmov   $IN01_4,x12
-
-       /////////////////////////////////////////////////////////////////
-       // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-       // and P. Schwabe
-       //
-       // [see discussion in poly1305-armv4 module]
-
-       ushr    $T0.2d,$ACC3,#26
-       xtn     $H3,$ACC3
-        ushr   $T1.2d,$ACC0,#26
-        and    $ACC0,$ACC0,$MASK.2d
-       add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
-       bic     $H3,#0xfc,lsl#24        // &=0x03ffffff
-        add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
-
-       ushr    $T0.2d,$ACC4,#26
-       xtn     $H4,$ACC4
-        ushr   $T1.2d,$ACC1,#26
-        xtn    $H1,$ACC1
-       bic     $H4,#0xfc,lsl#24
-        add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
-
-       add     $ACC0,$ACC0,$T0.2d
-       shl     $T0.2d,$T0.2d,#2
-        shrn   $T1.2s,$ACC2,#26
-        xtn    $H2,$ACC2
-       add     $ACC0,$ACC0,$T0.2d      // h4 -> h0
-        bic    $H1,#0xfc,lsl#24
-        add    $H3,$H3,$T1.2s          // h2 -> h3
-        bic    $H2,#0xfc,lsl#24
-
-       shrn    $T0.2s,$ACC0,#26
-       xtn     $H0,$ACC0
-        ushr   $T1.2s,$H3,#26
-        bic    $H3,#0xfc,lsl#24
-        bic    $H0,#0xfc,lsl#24
-       add     $H1,$H1,$T0.2s          // h0 -> h1
-        add    $H4,$H4,$T1.2s          // h3 -> h4
-
-       b.hi    .Loop_neon
-
-.Lskip_loop:
-       dup     $IN23_2,${IN23_2}[0]
-       add     $IN01_2,$IN01_2,$H2
-
-       ////////////////////////////////////////////////////////////////
-       // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
-       adds    $len,$len,#32
-       b.ne    .Long_tail
-
-       dup     $IN23_2,${IN01_2}[0]
-       add     $IN23_0,$IN01_0,$H0
-       add     $IN23_3,$IN01_3,$H3
-       add     $IN23_1,$IN01_1,$H1
-       add     $IN23_4,$IN01_4,$H4
-
-.Long_tail:
-       dup     $IN23_0,${IN23_0}[0]
-       umull2  $ACC0,$IN23_2,${S3}
-       umull2  $ACC3,$IN23_2,${R1}
-       umull2  $ACC4,$IN23_2,${R2}
-       umull2  $ACC2,$IN23_2,${R0}
-       umull2  $ACC1,$IN23_2,${S4}
-
-       dup     $IN23_1,${IN23_1}[0]
-       umlal2  $ACC0,$IN23_0,${R0}
-       umlal2  $ACC2,$IN23_0,${R2}
-       umlal2  $ACC3,$IN23_0,${R3}
-       umlal2  $ACC4,$IN23_0,${R4}
-       umlal2  $ACC1,$IN23_0,${R1}
-
-       dup     $IN23_3,${IN23_3}[0]
-       umlal2  $ACC0,$IN23_1,${S4}
-       umlal2  $ACC3,$IN23_1,${R2}
-       umlal2  $ACC2,$IN23_1,${R1}
-       umlal2  $ACC4,$IN23_1,${R3}
-       umlal2  $ACC1,$IN23_1,${R0}
-
-       dup     $IN23_4,${IN23_4}[0]
-       umlal2  $ACC3,$IN23_3,${R0}
-       umlal2  $ACC4,$IN23_3,${R1}
-       umlal2  $ACC0,$IN23_3,${S2}
-       umlal2  $ACC1,$IN23_3,${S3}
-       umlal2  $ACC2,$IN23_3,${S4}
-
-       umlal2  $ACC3,$IN23_4,${S4}
-       umlal2  $ACC0,$IN23_4,${S1}
-       umlal2  $ACC4,$IN23_4,${R0}
-       umlal2  $ACC1,$IN23_4,${S2}
-       umlal2  $ACC2,$IN23_4,${S3}
-
-       b.eq    .Lshort_tail
-
-       ////////////////////////////////////////////////////////////////
-       // (hash+inp[0:1])*r^4:r^3 and accumulate
-
-       add     $IN01_0,$IN01_0,$H0
-       umlal   $ACC3,$IN01_2,${R1}
-       umlal   $ACC0,$IN01_2,${S3}
-       umlal   $ACC4,$IN01_2,${R2}
-       umlal   $ACC1,$IN01_2,${S4}
-       umlal   $ACC2,$IN01_2,${R0}
-
-       add     $IN01_1,$IN01_1,$H1
-       umlal   $ACC3,$IN01_0,${R3}
-       umlal   $ACC0,$IN01_0,${R0}
-       umlal   $ACC4,$IN01_0,${R4}
-       umlal   $ACC1,$IN01_0,${R1}
-       umlal   $ACC2,$IN01_0,${R2}
-
-       add     $IN01_3,$IN01_3,$H3
-       umlal   $ACC3,$IN01_1,${R2}
-       umlal   $ACC0,$IN01_1,${S4}
-       umlal   $ACC4,$IN01_1,${R3}
-       umlal   $ACC1,$IN01_1,${R0}
-       umlal   $ACC2,$IN01_1,${R1}
-
-       add     $IN01_4,$IN01_4,$H4
-       umlal   $ACC3,$IN01_3,${R0}
-       umlal   $ACC0,$IN01_3,${S2}
-       umlal   $ACC4,$IN01_3,${R1}
-       umlal   $ACC1,$IN01_3,${S3}
-       umlal   $ACC2,$IN01_3,${S4}
-
-       umlal   $ACC3,$IN01_4,${S4}
-       umlal   $ACC0,$IN01_4,${S1}
-       umlal   $ACC4,$IN01_4,${R0}
-       umlal   $ACC1,$IN01_4,${S2}
-       umlal   $ACC2,$IN01_4,${S3}
-
-.Lshort_tail:
-       ////////////////////////////////////////////////////////////////
-       // horizontal add
-
-       addp    $ACC3,$ACC3,$ACC3
-        ldp    d8,d9,[sp,#16]          // meet ABI requirements
-       addp    $ACC0,$ACC0,$ACC0
-        ldp    d10,d11,[sp,#32]
-       addp    $ACC4,$ACC4,$ACC4
-        ldp    d12,d13,[sp,#48]
-       addp    $ACC1,$ACC1,$ACC1
-        ldp    d14,d15,[sp,#64]
-       addp    $ACC2,$ACC2,$ACC2
-        ldr    x30,[sp,#8]
-
-       ////////////////////////////////////////////////////////////////
-       // lazy reduction, but without narrowing
-
-       ushr    $T0.2d,$ACC3,#26
-       and     $ACC3,$ACC3,$MASK.2d
-        ushr   $T1.2d,$ACC0,#26
-        and    $ACC0,$ACC0,$MASK.2d
-
-       add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
-        add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
-
-       ushr    $T0.2d,$ACC4,#26
-       and     $ACC4,$ACC4,$MASK.2d
-        ushr   $T1.2d,$ACC1,#26
-        and    $ACC1,$ACC1,$MASK.2d
-        add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
-
-       add     $ACC0,$ACC0,$T0.2d
-       shl     $T0.2d,$T0.2d,#2
-        ushr   $T1.2d,$ACC2,#26
-        and    $ACC2,$ACC2,$MASK.2d
-       add     $ACC0,$ACC0,$T0.2d      // h4 -> h0
-        add    $ACC3,$ACC3,$T1.2d      // h2 -> h3
-
-       ushr    $T0.2d,$ACC0,#26
-       and     $ACC0,$ACC0,$MASK.2d
-        ushr   $T1.2d,$ACC3,#26
-        and    $ACC3,$ACC3,$MASK.2d
-       add     $ACC1,$ACC1,$T0.2d      // h0 -> h1
-        add    $ACC4,$ACC4,$T1.2d      // h3 -> h4
-
-       ////////////////////////////////////////////////////////////////
-       // write the result, can be partially reduced
-
-       st4     {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
-       mov     x4,#1
-       st1     {$ACC4}[0],[$ctx]
-       str     x4,[$ctx,#8]            // set is_base2_26
-
-       ldr     x29,[sp],#80
-        .inst  0xd50323bf              // autiasp
-       ret
-.size  poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.pushsection .rodata
-.align 5
-.Lzeros:
-.long  0,0,0,0,0,0,0,0
-.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
-.popsection
-
-.align 2
-#if !defined(__KERNEL__) && !defined(_WIN64)
-.comm  OPENSSL_armcap_P,4,4
-.hidden        OPENSSL_armcap_P
-#endif
-___
-
-foreach (split("\n",$code)) {
-       s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/                      or
-       s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/     or
-       (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))                 or
-       (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))       or
-       (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))             or
-       (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))            or
-       (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
-
-       s/\.[124]([sd])\[/.$1\[/;
-       s/w#x([0-9]+)/w$1/g;
-
-       print $_,"\n";
-}
-close STDOUT;
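
[Note on the NEON Poly1305 code above: the ubfx/extr sequences split each pair of 64-bit message words into five 26-bit limbs and pack the limbs of two blocks side by side for the two-way NEON interleave, and the "lazy reduction" passes only partially propagate carries between iterations. As a reference point only, the final carry pass at the end of poly1305_blocks_neon corresponds to the scalar sketch below; the limb names follow the assembly comments, and the helper is illustrative rather than kernel code.]

        #include <stdint.h>

        /*
         * Illustrative scalar model of the carry pass above.  Since
         * 2^130 == 5 (mod 2^130 - 5), the carry out of h4 folds back into
         * h0 multiplied by 5 (the "add; shl #2; add" sequence).  The limbs
         * may still exceed 26 bits slightly afterwards, which is why the
         * store is commented "can be partially reduced" and is_base2_26
         * is set for the next call.
         */
        static void poly1305_carry_pass(uint64_t h[5])
        {
                uint64_t c;

                c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;     /* h3 -> h4 */
                c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;     /* h0 -> h1 */
                c = h[4] >> 26; h[4] &= 0x3ffffff; h[0] += c * 5; /* h4 -> h0 */
                c = h[1] >> 26; h[1] &= 0x3ffffff; h[2] += c;     /* h1 -> h2 */
                c = h[2] >> 26; h[2] &= 0x3ffffff; h[3] += c;     /* h2 -> h3 */
                c = h[0] >> 26; h[0] &= 0x3ffffff; h[1] += c;     /* h0 -> h1 */
                c = h[3] >> 26; h[3] &= 0x3ffffff; h[4] += c;     /* h3 -> h4 */
        }
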
diff --git a/arch/arm64/lib/crypto/poly1305-glue.c b/arch/arm64/lib/crypto/poly1305-glue.c
deleted file mode 100644 (file)
index c9a7476..0000000
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <crypto/internal/poly1305.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-asmlinkage void poly1305_block_init_arch(
-       struct poly1305_block_state *state,
-       const u8 raw_key[POLY1305_BLOCK_SIZE]);
-EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
-asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
-                               const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
-                                    const u8 *src, u32 len, u32 hibit);
-asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
-                                  u8 digest[POLY1305_DIGEST_SIZE],
-                                  const u32 nonce[4]);
-EXPORT_SYMBOL_GPL(poly1305_emit_arch);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
-                         unsigned int len, u32 padbit)
-{
-       len = round_down(len, POLY1305_BLOCK_SIZE);
-       if (static_branch_likely(&have_neon)) {
-               do {
-                       unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
-                       kernel_neon_begin();
-                       poly1305_blocks_neon(state, src, todo, padbit);
-                       kernel_neon_end();
-
-                       len -= todo;
-                       src += todo;
-               } while (len);
-       } else
-               poly1305_blocks(state, src, len, padbit);
-}
-EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
-       /* We always can use at least the ARM64 scalar implementation. */
-       return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init neon_poly1305_mod_init(void)
-{
-       if (cpu_have_named_feature(ASIMD))
-               static_branch_enable(&have_neon);
-       return 0;
-}
-subsys_initcall(neon_poly1305_mod_init);
-
-static void __exit neon_poly1305_mod_exit(void)
-{
-}
-module_exit(neon_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)");
-MODULE_LICENSE("GPL v2");
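
[Note: the _arch hooks exported by this glue file are not called by users directly; they are the backend of the generic Poly1305 library in lib/crypto/. A minimal usage sketch through that interface is shown below, assuming the declarations in <crypto/poly1305.h> keep their current shape; treat the exact prototypes as an assumption, not part of this patch.]

        #include <crypto/poly1305.h>

        static void mac_one_buffer(const u8 key[POLY1305_KEY_SIZE],
                                   const u8 *data, unsigned int len,
                                   u8 mac[POLY1305_DIGEST_SIZE])
        {
                struct poly1305_desc_ctx desc;

                /* When CRYPTO_POLY1305_NEON is enabled, these calls route
                 * to the *_arch hooks exported above. */
                poly1305_init(&desc, key);
                poly1305_update(&desc, data, len);
                poly1305_final(&desc, mac);
        }
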
diff --git a/arch/arm64/lib/crypto/sha2-armv8.pl b/arch/arm64/lib/crypto/sha2-armv8.pl
deleted file mode 100644 (file)
index 4aebd20..0000000
+++ /dev/null
@@ -1,786 +0,0 @@
-#! /usr/bin/env perl
-# SPDX-License-Identifier: GPL-2.0
-
-# This code is taken from the OpenSSL project but the author (Andy Polyakov)
-# has relicensed it under the GPLv2. Therefore this program is free software;
-# you can redistribute it and/or modify it under the terms of the GNU General
-# Public License version 2 as published by the Free Software Foundation.
-#
-# The original headers, including the original license headers, are
-# included below for completeness.
-
-# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
-#
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# SHA256/512 for ARMv8.
-#
-# Performance in cycles per processed byte and improvement coefficient
-# over code generated with "default" compiler:
-#
-#              SHA256-hw       SHA256(*)       SHA512
-# Apple A7     1.97            10.5 (+33%)     6.73 (-1%(**))
-# Cortex-A53   2.38            15.5 (+115%)    10.0 (+150%(***))
-# Cortex-A57   2.31            11.6 (+86%)     7.51 (+260%(***))
-# Denver       2.01            10.5 (+26%)     6.70 (+8%)
-# X-Gene                       20.0 (+100%)    12.8 (+300%(***))
-# Mongoose     2.36            13.0 (+50%)     8.36 (+33%)
-#
-# (*)  Software SHA256 results are of lesser relevance, presented
-#      mostly for informational purposes.
-# (**) The result is a trade-off: it's possible to improve it by
-#      10% (or by 1 cycle per round), but at the cost of 20% loss
-#      on Cortex-A53 (or by 4 cycles per round).
-# (***)        Super-impressive coefficients over gcc-generated code are
-#      indication of some compiler "pathology", most notably code
-#      generated with -mgeneral-regs-only is significantly faster
-#      and the gap is only 40-90%.
-#
-# October 2016.
-#
-# Originally it was reckoned that it makes no sense to implement NEON
-# version of SHA256 for 64-bit processors. This is because performance
-# improvement on most wide-spread Cortex-A5x processors was observed
-# to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
-# observed that 32-bit NEON SHA256 performs significantly better than
-# 64-bit scalar version on *some* of the more recent processors. As
-# result 64-bit NEON version of SHA256 was added to provide best
-# all-round performance. For example it executes ~30% faster on X-Gene
-# and Mongoose. [For reference, NEON version of SHA512 is bound to
-# deliver much less improvement, likely *negative* on Cortex-A5x.
-# Which is why NEON support is limited to SHA256.]
-
-$output=pop;
-$flavour=pop;
-
-if ($flavour && $flavour ne "void") {
-    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
-    die "can't locate arm-xlate.pl";
-
-    open OUT,"| \"$^X\" $xlate $flavour $output";
-    *STDOUT=*OUT;
-} else {
-    open STDOUT,">$output";
-}
-
-if ($output =~ /512/) {
-       $BITS=512;
-       $SZ=8;
-       @Sigma0=(28,34,39);
-       @Sigma1=(14,18,41);
-       @sigma0=(1,  8, 7);
-       @sigma1=(19,61, 6);
-       $rounds=80;
-       $reg_t="x";
-} else {
-       $BITS=256;
-       $SZ=4;
-       @Sigma0=( 2,13,22);
-       @Sigma1=( 6,11,25);
-       @sigma0=( 7,18, 3);
-       @sigma1=(17,19,10);
-       $rounds=64;
-       $reg_t="w";
-}
-
-$func="sha${BITS}_blocks_arch";
-
-($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
-
-@X=map("$reg_t$_",(3..15,0..2));
-@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
-($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
-
-sub BODY_00_xx {
-my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
-my $j=($i+1)&15;
-my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
-   $T0=@X[$i+3] if ($i<11);
-
-$code.=<<___   if ($i<16);
-#ifndef        __AARCH64EB__
-       rev     @X[$i],@X[$i]                   // $i
-#endif
-___
-$code.=<<___   if ($i<13 && ($i&1));
-       ldp     @X[$i+1],@X[$i+2],[$inp],#2*$SZ
-___
-$code.=<<___   if ($i==13);
-       ldp     @X[14],@X[15],[$inp]
-___
-$code.=<<___   if ($i>=14);
-       ldr     @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
-___
-$code.=<<___   if ($i>0 && $i<16);
-       add     $a,$a,$t1                       // h+=Sigma0(a)
-___
-$code.=<<___   if ($i>=11);
-       str     @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
-___
-# While ARMv8 specifies merged rotate-n-logical operation such as
-# 'eor x,y,z,ror#n', it was found to negatively affect performance
-# on Apple A7. The reason seems to be that it requires even 'y' to
-# be available earlier. This means that such merged instruction is
-# not necessarily best choice on critical path... On the other hand
-# Cortex-A5x handles merged instructions much better than disjoint
-# rotate and logical... See (**) footnote above.
-$code.=<<___   if ($i<15);
-       ror     $t0,$e,#$Sigma1[0]
-       add     $h,$h,$t2                       // h+=K[i]
-       eor     $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
-       and     $t1,$f,$e
-       bic     $t2,$g,$e
-       add     $h,$h,@X[$i&15]                 // h+=X[i]
-       orr     $t1,$t1,$t2                     // Ch(e,f,g)
-       eor     $t2,$a,$b                       // a^b, b^c in next round
-       eor     $t0,$t0,$T0,ror#$Sigma1[1]      // Sigma1(e)
-       ror     $T0,$a,#$Sigma0[0]
-       add     $h,$h,$t1                       // h+=Ch(e,f,g)
-       eor     $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
-       add     $h,$h,$t0                       // h+=Sigma1(e)
-       and     $t3,$t3,$t2                     // (b^c)&=(a^b)
-       add     $d,$d,$h                        // d+=h
-       eor     $t3,$t3,$b                      // Maj(a,b,c)
-       eor     $t1,$T0,$t1,ror#$Sigma0[1]      // Sigma0(a)
-       add     $h,$h,$t3                       // h+=Maj(a,b,c)
-       ldr     $t3,[$Ktbl],#$SZ                // *K++, $t2 in next round
-       //add   $h,$h,$t1                       // h+=Sigma0(a)
-___
-$code.=<<___   if ($i>=15);
-       ror     $t0,$e,#$Sigma1[0]
-       add     $h,$h,$t2                       // h+=K[i]
-       ror     $T1,@X[($j+1)&15],#$sigma0[0]
-       and     $t1,$f,$e
-       ror     $T2,@X[($j+14)&15],#$sigma1[0]
-       bic     $t2,$g,$e
-       ror     $T0,$a,#$Sigma0[0]
-       add     $h,$h,@X[$i&15]                 // h+=X[i]
-       eor     $t0,$t0,$e,ror#$Sigma1[1]
-       eor     $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
-       orr     $t1,$t1,$t2                     // Ch(e,f,g)
-       eor     $t2,$a,$b                       // a^b, b^c in next round
-       eor     $t0,$t0,$e,ror#$Sigma1[2]       // Sigma1(e)
-       eor     $T0,$T0,$a,ror#$Sigma0[1]
-       add     $h,$h,$t1                       // h+=Ch(e,f,g)
-       and     $t3,$t3,$t2                     // (b^c)&=(a^b)
-       eor     $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
-       eor     $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]    // sigma0(X[i+1])
-       add     $h,$h,$t0                       // h+=Sigma1(e)
-       eor     $t3,$t3,$b                      // Maj(a,b,c)
-       eor     $t1,$T0,$a,ror#$Sigma0[2]       // Sigma0(a)
-       eor     $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]   // sigma1(X[i+14])
-       add     @X[$j],@X[$j],@X[($j+9)&15]
-       add     $d,$d,$h                        // d+=h
-       add     $h,$h,$t3                       // h+=Maj(a,b,c)
-       ldr     $t3,[$Ktbl],#$SZ                // *K++, $t2 in next round
-       add     @X[$j],@X[$j],$T1
-       add     $h,$h,$t1                       // h+=Sigma0(a)
-       add     @X[$j],@X[$j],$T2
-___
-       ($t2,$t3)=($t3,$t2);
-}
-
-$code.=<<___;
-#ifndef        __KERNEL__
-# include "arm_arch.h"
-#endif
-
-.text
-
-.extern        OPENSSL_armcap_P
-.globl $func
-.type  $func,%function
-.align 6
-$func:
-___
-$code.=<<___   if ($SZ==4);
-#ifndef        __KERNEL__
-# ifdef        __ILP32__
-       ldrsw   x16,.LOPENSSL_armcap_P
-# else
-       ldr     x16,.LOPENSSL_armcap_P
-# endif
-       adr     x17,.LOPENSSL_armcap_P
-       add     x16,x16,x17
-       ldr     w16,[x16]
-       tst     w16,#ARMV8_SHA256
-       b.ne    .Lv8_entry
-       tst     w16,#ARMV7_NEON
-       b.ne    .Lneon_entry
-#endif
-___
-$code.=<<___;
-       stp     x29,x30,[sp,#-128]!
-       add     x29,sp,#0
-
-       stp     x19,x20,[sp,#16]
-       stp     x21,x22,[sp,#32]
-       stp     x23,x24,[sp,#48]
-       stp     x25,x26,[sp,#64]
-       stp     x27,x28,[sp,#80]
-       sub     sp,sp,#4*$SZ
-
-       ldp     $A,$B,[$ctx]                            // load context
-       ldp     $C,$D,[$ctx,#2*$SZ]
-       ldp     $E,$F,[$ctx,#4*$SZ]
-       add     $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
-       ldp     $G,$H,[$ctx,#6*$SZ]
-       adr     $Ktbl,.LK$BITS
-       stp     $ctx,$num,[x29,#96]
-
-.Loop:
-       ldp     @X[0],@X[1],[$inp],#2*$SZ
-       ldr     $t2,[$Ktbl],#$SZ                        // *K++
-       eor     $t3,$B,$C                               // magic seed
-       str     $inp,[x29,#112]
-___
-for ($i=0;$i<16;$i++)  { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
-$code.=".Loop_16_xx:\n";
-for (;$i<32;$i++)      { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
-$code.=<<___;
-       cbnz    $t2,.Loop_16_xx
-
-       ldp     $ctx,$num,[x29,#96]
-       ldr     $inp,[x29,#112]
-       sub     $Ktbl,$Ktbl,#`$SZ*($rounds+1)`          // rewind
-
-       ldp     @X[0],@X[1],[$ctx]
-       ldp     @X[2],@X[3],[$ctx,#2*$SZ]
-       add     $inp,$inp,#14*$SZ                       // advance input pointer
-       ldp     @X[4],@X[5],[$ctx,#4*$SZ]
-       add     $A,$A,@X[0]
-       ldp     @X[6],@X[7],[$ctx,#6*$SZ]
-       add     $B,$B,@X[1]
-       add     $C,$C,@X[2]
-       add     $D,$D,@X[3]
-       stp     $A,$B,[$ctx]
-       add     $E,$E,@X[4]
-       add     $F,$F,@X[5]
-       stp     $C,$D,[$ctx,#2*$SZ]
-       add     $G,$G,@X[6]
-       add     $H,$H,@X[7]
-       cmp     $inp,$num
-       stp     $E,$F,[$ctx,#4*$SZ]
-       stp     $G,$H,[$ctx,#6*$SZ]
-       b.ne    .Loop
-
-       ldp     x19,x20,[x29,#16]
-       add     sp,sp,#4*$SZ
-       ldp     x21,x22,[x29,#32]
-       ldp     x23,x24,[x29,#48]
-       ldp     x25,x26,[x29,#64]
-       ldp     x27,x28,[x29,#80]
-       ldp     x29,x30,[sp],#128
-       ret
-.size  $func,.-$func
-
-.align 6
-.type  .LK$BITS,%object
-.LK$BITS:
-___
-$code.=<<___ if ($SZ==8);
-       .quad   0x428a2f98d728ae22,0x7137449123ef65cd
-       .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
-       .quad   0x3956c25bf348b538,0x59f111f1b605d019
-       .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
-       .quad   0xd807aa98a3030242,0x12835b0145706fbe
-       .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
-       .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
-       .quad   0x9bdc06a725c71235,0xc19bf174cf692694
-       .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
-       .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
-       .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
-       .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
-       .quad   0x983e5152ee66dfab,0xa831c66d2db43210
-       .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
-       .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
-       .quad   0x06ca6351e003826f,0x142929670a0e6e70
-       .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
-       .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
-       .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
-       .quad   0x81c2c92e47edaee6,0x92722c851482353b
-       .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
-       .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
-       .quad   0xd192e819d6ef5218,0xd69906245565a910
-       .quad   0xf40e35855771202a,0x106aa07032bbd1b8
-       .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
-       .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
-       .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
-       .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
-       .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
-       .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
-       .quad   0x90befffa23631e28,0xa4506cebde82bde9
-       .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
-       .quad   0xca273eceea26619c,0xd186b8c721c0c207
-       .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
-       .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
-       .quad   0x113f9804bef90dae,0x1b710b35131c471b
-       .quad   0x28db77f523047d84,0x32caab7b40c72493
-       .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
-       .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
-       .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
-       .quad   0       // terminator
-___
-$code.=<<___ if ($SZ==4);
-       .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-       .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-       .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-       .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-       .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-       .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-       .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-       .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-       .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-       .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-       .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-       .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-       .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-       .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-       .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-       .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-       .long   0       //terminator
-___
-$code.=<<___;
-.size  .LK$BITS,.-.LK$BITS
-#ifndef        __KERNEL__
-.align 3
-.LOPENSSL_armcap_P:
-# ifdef        __ILP32__
-       .long   OPENSSL_armcap_P-.
-# else
-       .quad   OPENSSL_armcap_P-.
-# endif
-#endif
-.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-___
-
-if ($SZ==4) {
-my $Ktbl="x3";
-
-my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
-my @MSG=map("v$_.16b",(4..7));
-my ($W0,$W1)=("v16.4s","v17.4s");
-my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
-
-$code.=<<___;
-#ifndef        __KERNEL__
-.type  sha256_block_armv8,%function
-.align 6
-sha256_block_armv8:
-.Lv8_entry:
-       stp             x29,x30,[sp,#-16]!
-       add             x29,sp,#0
-
-       ld1.32          {$ABCD,$EFGH},[$ctx]
-       adr             $Ktbl,.LK256
-
-.Loop_hw:
-       ld1             {@MSG[0]-@MSG[3]},[$inp],#64
-       sub             $num,$num,#1
-       ld1.32          {$W0},[$Ktbl],#16
-       rev32           @MSG[0],@MSG[0]
-       rev32           @MSG[1],@MSG[1]
-       rev32           @MSG[2],@MSG[2]
-       rev32           @MSG[3],@MSG[3]
-       orr             $ABCD_SAVE,$ABCD,$ABCD          // offload
-       orr             $EFGH_SAVE,$EFGH,$EFGH
-___
-for($i=0;$i<12;$i++) {
-$code.=<<___;
-       ld1.32          {$W1},[$Ktbl],#16
-       add.i32         $W0,$W0,@MSG[0]
-       sha256su0       @MSG[0],@MSG[1]
-       orr             $abcd,$ABCD,$ABCD
-       sha256h         $ABCD,$EFGH,$W0
-       sha256h2        $EFGH,$abcd,$W0
-       sha256su1       @MSG[0],@MSG[2],@MSG[3]
-___
-       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
-}
-$code.=<<___;
-       ld1.32          {$W1},[$Ktbl],#16
-       add.i32         $W0,$W0,@MSG[0]
-       orr             $abcd,$ABCD,$ABCD
-       sha256h         $ABCD,$EFGH,$W0
-       sha256h2        $EFGH,$abcd,$W0
-
-       ld1.32          {$W0},[$Ktbl],#16
-       add.i32         $W1,$W1,@MSG[1]
-       orr             $abcd,$ABCD,$ABCD
-       sha256h         $ABCD,$EFGH,$W1
-       sha256h2        $EFGH,$abcd,$W1
-
-       ld1.32          {$W1},[$Ktbl]
-       add.i32         $W0,$W0,@MSG[2]
-       sub             $Ktbl,$Ktbl,#$rounds*$SZ-16     // rewind
-       orr             $abcd,$ABCD,$ABCD
-       sha256h         $ABCD,$EFGH,$W0
-       sha256h2        $EFGH,$abcd,$W0
-
-       add.i32         $W1,$W1,@MSG[3]
-       orr             $abcd,$ABCD,$ABCD
-       sha256h         $ABCD,$EFGH,$W1
-       sha256h2        $EFGH,$abcd,$W1
-
-       add.i32         $ABCD,$ABCD,$ABCD_SAVE
-       add.i32         $EFGH,$EFGH,$EFGH_SAVE
-
-       cbnz            $num,.Loop_hw
-
-       st1.32          {$ABCD,$EFGH},[$ctx]
-
-       ldr             x29,[sp],#16
-       ret
-.size  sha256_block_armv8,.-sha256_block_armv8
-#endif
-___
-}
-
-if ($SZ==4) {  ######################################### NEON stuff #
-# You'll surely note a lot of similarities with sha256-armv4 module,
-# and of course it's not a coincidence. sha256-armv4 was used as
-# initial template, but was adapted for ARMv8 instruction set and
-# extensively re-tuned for all-round performance.
-
-my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
-my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
-my $Ktbl="x16";
-my $Xfer="x17";
-my @X = map("q$_",(0..3));
-my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
-my $j=0;
-
-sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
-  my $arg = pop;
-    $arg = "#$arg" if ($arg*1 eq $arg);
-    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
-}
-
-sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
-sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
-sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
-
-sub Xupdate()
-{ use integer;
-  my $body = shift;
-  my @insns = (&$body,&$body,&$body,&$body);
-  my ($a,$b,$c,$d,$e,$f,$g,$h);
-
-       &ext_8          ($T0,@X[0],@X[1],4);    # X[1..4]
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &ext_8          ($T3,@X[2],@X[3],4);    # X[9..12]
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &mov            (&Dscalar($T7),&Dhi(@X[3]));    # X[14..15]
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &ushr_32        ($T2,$T0,$sigma0[0]);
-        eval(shift(@insns));
-       &ushr_32        ($T1,$T0,$sigma0[2]);
-        eval(shift(@insns));
-       &add_32         (@X[0],@X[0],$T3);      # X[0..3] += X[9..12]
-        eval(shift(@insns));
-       &sli_32         ($T2,$T0,32-$sigma0[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &ushr_32        ($T3,$T0,$sigma0[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &eor_8          ($T1,$T1,$T2);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &sli_32         ($T3,$T0,32-$sigma0[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &ushr_32      ($T4,$T7,$sigma1[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &eor_8          ($T1,$T1,$T3);          # sigma0(X[1..4])
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &sli_32       ($T4,$T7,32-$sigma1[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &ushr_32      ($T5,$T7,$sigma1[2]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &ushr_32      ($T3,$T7,$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &add_32         (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &sli_u32      ($T3,$T7,32-$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &eor_8        ($T5,$T5,$T4);
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &eor_8        ($T5,$T5,$T3);          # sigma1(X[14..15])
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &add_32         (@X[0],@X[0],$T5);      # X[0..1] += sigma1(X[14..15])
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &ushr_32      ($T6,@X[0],$sigma1[0]);
-        eval(shift(@insns));
-         &ushr_32      ($T7,@X[0],$sigma1[2]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &sli_32       ($T6,@X[0],32-$sigma1[0]);
-        eval(shift(@insns));
-         &ushr_32      ($T5,@X[0],$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &eor_8        ($T7,$T7,$T6);
-        eval(shift(@insns));
-        eval(shift(@insns));
-         &sli_32       ($T5,@X[0],32-$sigma1[1]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &ld1_32         ("{$T0}","[$Ktbl], #16");
-        eval(shift(@insns));
-         &eor_8        ($T7,$T7,$T5);          # sigma1(X[16..17])
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &eor_8          ($T5,$T5,$T5);
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &mov            (&Dhi($T5), &Dlo($T7));
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &add_32         (@X[0],@X[0],$T5);      # X[2..3] += sigma1(X[16..17])
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &add_32         ($T0,$T0,@X[0]);
-        while($#insns>=1) { eval(shift(@insns)); }
-       &st1_32         ("{$T0}","[$Xfer], #16");
-        eval(shift(@insns));
-
-       push(@X,shift(@X));             # "rotate" X[]
-}
-
-sub Xpreload()
-{ use integer;
-  my $body = shift;
-  my @insns = (&$body,&$body,&$body,&$body);
-  my ($a,$b,$c,$d,$e,$f,$g,$h);
-
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &ld1_8          ("{@X[0]}","[$inp],#16");
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &ld1_32         ("{$T0}","[$Ktbl],#16");
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &rev32          (@X[0],@X[0]);
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-        eval(shift(@insns));
-       &add_32         ($T0,$T0,@X[0]);
-        foreach (@insns) { eval; }     # remaining instructions
-       &st1_32         ("{$T0}","[$Xfer], #16");
-
-       push(@X,shift(@X));             # "rotate" X[]
-}
-
-sub body_00_15 () {
-       (
-       '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
-       '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
-       '&add   ($a,$a,$t4);'.                  # h+=Sigma0(a) from the past
-       '&and   ($t1,$f,$e)',
-       '&bic   ($t4,$g,$e)',
-       '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
-       '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
-       '&orr   ($t1,$t1,$t4)',                 # Ch(e,f,g)
-       '&eor   ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
-       '&eor   ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
-       '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
-       '&ror   ($t0,$t0,"#$Sigma1[0]")',
-       '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
-       '&eor   ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
-       '&add   ($h,$h,$t0)',                   # h+=Sigma1(e)
-       '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
-       '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
-       '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
-       '&ror   ($t4,$t4,"#$Sigma0[0]")',
-       '&add   ($d,$d,$h)',                    # d+=h
-       '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
-       '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
-       )
-}
-
-$code.=<<___;
-#ifdef __KERNEL__
-.globl sha256_block_neon
-#endif
-.type  sha256_block_neon,%function
-.align 4
-sha256_block_neon:
-.Lneon_entry:
-       stp     x29, x30, [sp, #-16]!
-       mov     x29, sp
-       sub     sp,sp,#16*4
-
-       adr     $Ktbl,.LK256
-       add     $num,$inp,$num,lsl#6    // len to point at the end of inp
-
-       ld1.8   {@X[0]},[$inp], #16
-       ld1.8   {@X[1]},[$inp], #16
-       ld1.8   {@X[2]},[$inp], #16
-       ld1.8   {@X[3]},[$inp], #16
-       ld1.32  {$T0},[$Ktbl], #16
-       ld1.32  {$T1},[$Ktbl], #16
-       ld1.32  {$T2},[$Ktbl], #16
-       ld1.32  {$T3},[$Ktbl], #16
-       rev32   @X[0],@X[0]             // yes, even on
-       rev32   @X[1],@X[1]             // big-endian
-       rev32   @X[2],@X[2]
-       rev32   @X[3],@X[3]
-       mov     $Xfer,sp
-       add.32  $T0,$T0,@X[0]
-       add.32  $T1,$T1,@X[1]
-       add.32  $T2,$T2,@X[2]
-       st1.32  {$T0-$T1},[$Xfer], #32
-       add.32  $T3,$T3,@X[3]
-       st1.32  {$T2-$T3},[$Xfer]
-       sub     $Xfer,$Xfer,#32
-
-       ldp     $A,$B,[$ctx]
-       ldp     $C,$D,[$ctx,#8]
-       ldp     $E,$F,[$ctx,#16]
-       ldp     $G,$H,[$ctx,#24]
-       ldr     $t1,[sp,#0]
-       mov     $t2,wzr
-       eor     $t3,$B,$C
-       mov     $t4,wzr
-       b       .L_00_48
-
-.align 4
-.L_00_48:
-___
-       &Xupdate(\&body_00_15);
-       &Xupdate(\&body_00_15);
-       &Xupdate(\&body_00_15);
-       &Xupdate(\&body_00_15);
-$code.=<<___;
-       cmp     $t1,#0                          // check for K256 terminator
-       ldr     $t1,[sp,#0]
-       sub     $Xfer,$Xfer,#64
-       bne     .L_00_48
-
-       sub     $Ktbl,$Ktbl,#256                // rewind $Ktbl
-       cmp     $inp,$num
-       mov     $Xfer, #64
-       csel    $Xfer, $Xfer, xzr, eq
-       sub     $inp,$inp,$Xfer                 // avoid SEGV
-       mov     $Xfer,sp
-___
-       &Xpreload(\&body_00_15);
-       &Xpreload(\&body_00_15);
-       &Xpreload(\&body_00_15);
-       &Xpreload(\&body_00_15);
-$code.=<<___;
-       add     $A,$A,$t4                       // h+=Sigma0(a) from the past
-       ldp     $t0,$t1,[$ctx,#0]
-       add     $A,$A,$t2                       // h+=Maj(a,b,c) from the past
-       ldp     $t2,$t3,[$ctx,#8]
-       add     $A,$A,$t0                       // accumulate
-       add     $B,$B,$t1
-       ldp     $t0,$t1,[$ctx,#16]
-       add     $C,$C,$t2
-       add     $D,$D,$t3
-       ldp     $t2,$t3,[$ctx,#24]
-       add     $E,$E,$t0
-       add     $F,$F,$t1
-        ldr    $t1,[sp,#0]
-       stp     $A,$B,[$ctx,#0]
-       add     $G,$G,$t2
-        mov    $t2,wzr
-       stp     $C,$D,[$ctx,#8]
-       add     $H,$H,$t3
-       stp     $E,$F,[$ctx,#16]
-        eor    $t3,$B,$C
-       stp     $G,$H,[$ctx,#24]
-        mov    $t4,wzr
-        mov    $Xfer,sp
-       b.ne    .L_00_48
-
-       ldr     x29,[x29]
-       add     sp,sp,#16*4+16
-       ret
-.size  sha256_block_neon,.-sha256_block_neon
-___
-}
-
-$code.=<<___;
-#ifndef        __KERNEL__
-.comm  OPENSSL_armcap_P,4,4
-#endif
-___
-
-{   my  %opcode = (
-       "sha256h"       => 0x5e004000,  "sha256h2"      => 0x5e005000,
-       "sha256su0"     => 0x5e282800,  "sha256su1"     => 0x5e006000   );
-
-    sub unsha256 {
-       my ($mnemonic,$arg)=@_;
-
-       $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
-       &&
-       sprintf ".inst\t0x%08x\t//%s %s",
-                       $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
-                       $mnemonic,$arg;
-    }
-}
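
[Note: the unsha256() helper lets this file assemble even with toolchains that lack the SHA-2 crypto extensions by emitting raw .inst words. Worked example from the opcode table above: "sha256h q0,q1,v2.4s" encodes as 0x5e004000 | 0 | (1 << 5) | (2 << 16), i.e. ".inst 0x5e024020 //sha256h q0,q1,v2.4s".]
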
-
-open SELF,$0;
-while(<SELF>) {
-        next if (/^#!/);
-        last if (!s/^#/\/\// and !/^$/);
-        print;
-}
-close SELF;
-
-foreach(split("\n",$code)) {
-
-       s/\`([^\`]*)\`/eval($1)/ge;
-
-       s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
-
-       s/\bq([0-9]+)\b/v$1.16b/g;              # old->new registers
-
-       s/\.[ui]?8(\s)/$1/;
-       s/\.\w?32\b//           and s/\.16b/\.4s/g;
-       m/(ld|st)1[^\[]+\[0\]/  and s/\.4s/\.s/g;
-
-       print $_,"\n";
-}
-
-close STDOUT;
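
[Note on the rounds generated by BODY_00_xx above: they implement the standard FIPS 180-4 compression function, with the rotation counts taken from @Sigma0/@Sigma1/@sigma0/@sigma1 at the top of the script. The C sketch below is a readable reference model of one SHA-256 round and the message-schedule rule, not the kernel's own implementation.]

        #include <stdint.h>

        static inline uint32_t ror32(uint32_t x, int n)
        {
                return (x >> n) | (x << (32 - n));
        }

        /* One SHA-256 round; the assembly computes Maj() via the
         * "(b^c) &= (a^b)" trick visible in the round comments. */
        static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
        {
                uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
                uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
                uint32_t ch   = (e & f) | (~e & g);          /* orr(and, bic) */
                uint32_t maj  = ((a ^ b) & (b ^ c)) ^ b;
                uint32_t sig1 = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
                uint32_t sig0 = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
                uint32_t t1   = h + sig1 + ch + k + w;

                s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
                s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + sig0 + maj;
        }

        /* Message schedule for i >= 16, with sigma0 rotations (7,18,3)
         * and sigma1 rotations (17,19,10):
         *   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
         */
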
diff --git a/arch/arm64/lib/crypto/sha256-ce.S b/arch/arm64/lib/crypto/sha256-ce.S
deleted file mode 100644 (file)
index f3e21c6..0000000
+++ /dev/null
@@ -1,136 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
- *
- * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-       .text
-       .arch           armv8-a+crypto
-
-       dga             .req    q20
-       dgav            .req    v20
-       dgb             .req    q21
-       dgbv            .req    v21
-
-       t0              .req    v22
-       t1              .req    v23
-
-       dg0q            .req    q24
-       dg0v            .req    v24
-       dg1q            .req    q25
-       dg1v            .req    v25
-       dg2q            .req    q26
-       dg2v            .req    v26
-
-       .macro          add_only, ev, rc, s0
-       mov             dg2v.16b, dg0v.16b
-       .ifeq           \ev
-       add             t1.4s, v\s0\().4s, \rc\().4s
-       sha256h         dg0q, dg1q, t0.4s
-       sha256h2        dg1q, dg2q, t0.4s
-       .else
-       .ifnb           \s0
-       add             t0.4s, v\s0\().4s, \rc\().4s
-       .endif
-       sha256h         dg0q, dg1q, t1.4s
-       sha256h2        dg1q, dg2q, t1.4s
-       .endif
-       .endm
-
-       .macro          add_update, ev, rc, s0, s1, s2, s3
-       sha256su0       v\s0\().4s, v\s1\().4s
-       add_only        \ev, \rc, \s1
-       sha256su1       v\s0\().4s, v\s2\().4s, v\s3\().4s
-       .endm
-
-       /*
-        * The SHA-256 round constants
-        */
-       .section        ".rodata", "a"
-       .align          4
-.Lsha2_rcon:
-       .word           0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
-       .word           0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
-       .word           0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
-       .word           0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
-       .word           0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
-       .word           0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
-       .word           0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
-       .word           0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
-       .word           0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
-       .word           0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
-       .word           0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
-       .word           0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
-       .word           0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
-       .word           0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
-       .word           0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
-       .word           0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-
-       /*
-        * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
-        *                              const u8 *data, size_t nblocks);
-        */
-       .text
-SYM_FUNC_START(__sha256_ce_transform)
-       /* load round constants */
-       adr_l           x8, .Lsha2_rcon
-       ld1             { v0.4s- v3.4s}, [x8], #64
-       ld1             { v4.4s- v7.4s}, [x8], #64
-       ld1             { v8.4s-v11.4s}, [x8], #64
-       ld1             {v12.4s-v15.4s}, [x8]
-
-       /* load state */
-       ld1             {dgav.4s, dgbv.4s}, [x0]
-
-       /* load input */
-0:     ld1             {v16.4s-v19.4s}, [x1], #64
-       sub             x2, x2, #1
-
-CPU_LE(        rev32           v16.16b, v16.16b        )
-CPU_LE(        rev32           v17.16b, v17.16b        )
-CPU_LE(        rev32           v18.16b, v18.16b        )
-CPU_LE(        rev32           v19.16b, v19.16b        )
-
-       add             t0.4s, v16.4s, v0.4s
-       mov             dg0v.16b, dgav.16b
-       mov             dg1v.16b, dgbv.16b
-
-       add_update      0,  v1, 16, 17, 18, 19
-       add_update      1,  v2, 17, 18, 19, 16
-       add_update      0,  v3, 18, 19, 16, 17
-       add_update      1,  v4, 19, 16, 17, 18
-
-       add_update      0,  v5, 16, 17, 18, 19
-       add_update      1,  v6, 17, 18, 19, 16
-       add_update      0,  v7, 18, 19, 16, 17
-       add_update      1,  v8, 19, 16, 17, 18
-
-       add_update      0,  v9, 16, 17, 18, 19
-       add_update      1, v10, 17, 18, 19, 16
-       add_update      0, v11, 18, 19, 16, 17
-       add_update      1, v12, 19, 16, 17, 18
-
-       add_only        0, v13, 17
-       add_only        1, v14, 18
-       add_only        0, v15, 19
-       add_only        1
-
-       /* update state */
-       add             dgav.4s, dgav.4s, dg0v.4s
-       add             dgbv.4s, dgbv.4s, dg1v.4s
-
-       /* return early if voluntary preemption is needed */
-       cond_yield      1f, x5, x6
-
-       /* handled all input blocks? */
-       cbnz            x2, 0b
-
-       /* store new state */
-1:     st1             {dgav.4s, dgbv.4s}, [x0]
-       mov             x0, x2
-       ret
-SYM_FUNC_END(__sha256_ce_transform)
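
[Note: the add_update/add_only macros above map directly onto the ARMv8 Crypto Extension SHA-256 instructions. A hedged intrinsics-level sketch of one four-round step is shown below, using the ACLE vsha256* intrinsics from <arm_neon.h> and ignoring the t0/t1 double-buffering the assembly uses to hide latency; it needs a compiler targeting armv8-a+crypto and is illustrative, not the kernel's code.]

        #include <arm_neon.h>

        /* One "quad round": consume message vector m0 with round-constant
         * vector k, then extend the schedule for a later round. */
        static inline void sha256_quad_round(uint32x4_t *abcd, uint32x4_t *efgh,
                                             uint32x4_t k, uint32x4_t *m0,
                                             uint32x4_t m1, uint32x4_t m2,
                                             uint32x4_t m3)
        {
                uint32x4_t wk = vaddq_u32(*m0, k);            /* add t.4s     */
                uint32x4_t abcd_save = *abcd;                 /* offload dg0v */

                *abcd = vsha256hq_u32(*abcd, *efgh, wk);      /* sha256h      */
                *efgh = vsha256h2q_u32(*efgh, abcd_save, wk); /* sha256h2     */
                *m0 = vsha256su1q_u32(vsha256su0q_u32(*m0, m1), m2, m3);
                                                              /* su0 + su1    */
        }
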
diff --git a/arch/arm64/lib/crypto/sha256.c b/arch/arm64/lib/crypto/sha256.c
deleted file mode 100644 (file)
index bcf7a3a..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * SHA-256 optimized for ARM64
- *
- * Copyright 2025 Google LLC
- */
-#include <asm/neon.h>
-#include <crypto/internal/sha2.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
-                                  const u8 *data, size_t nblocks);
-EXPORT_SYMBOL_GPL(sha256_blocks_arch);
-asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS],
-                                 const u8 *data, size_t nblocks);
-asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
-                                       const u8 *data, size_t nblocks);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
-
-void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
-                       const u8 *data, size_t nblocks)
-{
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-           static_branch_likely(&have_neon)) {
-               if (static_branch_likely(&have_ce)) {
-                       do {
-                               size_t rem;
-
-                               kernel_neon_begin();
-                               rem = __sha256_ce_transform(state,
-                                                           data, nblocks);
-                               kernel_neon_end();
-                               data += (nblocks - rem) * SHA256_BLOCK_SIZE;
-                               nblocks = rem;
-                       } while (nblocks);
-               } else {
-                       kernel_neon_begin();
-                       sha256_block_neon(state, data, nblocks);
-                       kernel_neon_end();
-               }
-       } else {
-               sha256_blocks_arch(state, data, nblocks);
-       }
-}
-EXPORT_SYMBOL_GPL(sha256_blocks_simd);
-
-bool sha256_is_arch_optimized(void)
-{
-       /* We always can use at least the ARM64 scalar implementation. */
-       return true;
-}
-EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
-
-static int __init sha256_arm64_mod_init(void)
-{
-       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-           cpu_have_named_feature(ASIMD)) {
-               static_branch_enable(&have_neon);
-               if (cpu_have_named_feature(SHA2))
-                       static_branch_enable(&have_ce);
-       }
-       return 0;
-}
-subsys_initcall(sha256_arm64_mod_init);
-
-static void __exit sha256_arm64_mod_exit(void)
-{
-}
-module_exit(sha256_arm64_mod_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA-256 optimized for ARM64");
index e14bef8e87af249c58959035413ec301467c6750..fdeb91bf003285f56a8c16d6c33cde2dfdf71db0 100644 (file)
@@ -193,7 +193,7 @@ if ARM
 source "lib/crypto/arm/Kconfig"
 endif
 if ARM64
-source "arch/arm64/lib/crypto/Kconfig"
+source "lib/crypto/arm64/Kconfig"
 endif
 if MIPS
 source "arch/mips/lib/crypto/Kconfig"
index 5f2b81f82a85d62cef781a06da79155d278dbe6d..19e9880c5d5f042f871c97197609c28a22edc7fe 100644 (file)
@@ -87,7 +87,7 @@ endif
 
 ifeq ($(CONFIG_ARM64),y)
 libsha512-y += arm64/sha512-core.o
-$(obj)/arm64/sha512-core.S: $(src)/../../arch/arm64/lib/crypto/sha2-armv8.pl
+$(obj)/arm64/sha512-core.S: $(src)/arm64/sha2-armv8.pl
        $(call cmd,perlasm_with_args)
 clean-files += arm64/sha512-core.S
 libsha512-$(CONFIG_KERNEL_MODE_NEON) += arm64/sha512-ce-core.o
@@ -108,3 +108,4 @@ obj-$(CONFIG_CRYPTO_LIB_SM3)                        += libsm3.o
 libsm3-y                                       := sm3.o
 
 obj-$(CONFIG_ARM) += arm/
+obj-$(CONFIG_ARM64) += arm64/
index 670a4d97b5684051b679cac0b1b70cc2ffc2605c..f6c4e8ef80dae9943f0e935373396c0306ea1614 100644 (file)
@@ -1,2 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
+sha256-core.S
 sha512-core.S
diff --git a/lib/crypto/arm64/Kconfig b/lib/crypto/arm64/Kconfig
new file mode 100644 (file)
index 0000000..129a768
--- /dev/null
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_CHACHA20_NEON
+       tristate
+       depends on KERNEL_MODE_NEON
+       default CRYPTO_LIB_CHACHA
+       select CRYPTO_LIB_CHACHA_GENERIC
+       select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_NEON
+       tristate
+       depends on KERNEL_MODE_NEON
+       default CRYPTO_LIB_POLY1305
+       select CRYPTO_ARCH_HAVE_LIB_POLY1305
+
+config CRYPTO_SHA256_ARM64
+       tristate
+       default CRYPTO_LIB_SHA256
+       select CRYPTO_ARCH_HAVE_LIB_SHA256
+       select CRYPTO_ARCH_HAVE_LIB_SHA256_SIMD
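
[Note: these symbols have no prompt, so each simply tracks its "default" expression, subject to its "depends on" line: the arch helpers come out y, m or n in lockstep with the corresponding CRYPTO_LIB_* symbol, while the "select CRYPTO_ARCH_HAVE_LIB_*" lines tell the generic library code that an arch backend is available.]
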
diff --git a/lib/crypto/arm64/Makefile b/lib/crypto/arm64/Makefile
new file mode 100644 (file)
index 0000000..946c099
--- /dev/null
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
+poly1305-neon-y := poly1305-core.o poly1305-glue.o
+AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_block_init_arch
+AFLAGS_poly1305-core.o += -Dpoly1305_emit=poly1305_emit_arch
+
+obj-$(CONFIG_CRYPTO_SHA256_ARM64) += sha256-arm64.o
+sha256-arm64-y := sha256.o sha256-core.o
+sha256-arm64-$(CONFIG_KERNEL_MODE_NEON) += sha256-ce.o
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $(<) void $(@)
+
+$(obj)/%-core.S: $(src)/%-armv8.pl
+       $(call cmd,perlasm)
+
+$(obj)/sha256-core.S: $(src)/sha2-armv8.pl
+       $(call cmd,perlasm)
+
+clean-files += poly1305-core.S sha256-core.S
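
[Note on the generated files: each *-core.S is produced at build time from the matching perlasm script, with "void" passed as the flavour argument so the script writes plain output directly to the target instead of piping through arm-xlate.pl. For example, the poly1305-core.S rule expands to roughly "perl .../poly1305-armv8.pl void .../poly1305-core.S" (paths abbreviated). sha256-core.S needs its own rule only because its source is named sha2-armv8.pl rather than sha256-armv8.pl, so the pattern rule does not match.]
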
diff --git a/lib/crypto/arm64/chacha-neon-core.S b/lib/crypto/arm64/chacha-neon-core.S
new file mode 100644 (file)
index 0000000..8007958
--- /dev/null
@@ -0,0 +1,805 @@
+/*
+ * ChaCha/HChaCha NEON helper functions
+ *
+ * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Originally based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/cache.h>
+
+       .text
+       .align          6
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers v0-v3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in w3.
+ *
+ * Clobbers: w3, x10, v4, v12
+ */
+SYM_FUNC_START_LOCAL(chacha_permute)
+
+       adr_l           x10, ROT8
+       ld1             {v12.4s}, [x10]
+
+.Ldoubleround:
+       // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       add             v0.4s, v0.4s, v1.4s
+       eor             v3.16b, v3.16b, v0.16b
+       rev32           v3.8h, v3.8h
+
+       // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       add             v2.4s, v2.4s, v3.4s
+       eor             v4.16b, v1.16b, v2.16b
+       shl             v1.4s, v4.4s, #12
+       sri             v1.4s, v4.4s, #20
+
+       // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       add             v0.4s, v0.4s, v1.4s
+       eor             v3.16b, v3.16b, v0.16b
+       tbl             v3.16b, {v3.16b}, v12.16b
+
+       // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       add             v2.4s, v2.4s, v3.4s
+       eor             v4.16b, v1.16b, v2.16b
+       shl             v1.4s, v4.4s, #7
+       sri             v1.4s, v4.4s, #25
+
+       // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+       ext             v1.16b, v1.16b, v1.16b, #4
+       // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       ext             v2.16b, v2.16b, v2.16b, #8
+       // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+       ext             v3.16b, v3.16b, v3.16b, #12
+
+       // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+       add             v0.4s, v0.4s, v1.4s
+       eor             v3.16b, v3.16b, v0.16b
+       rev32           v3.8h, v3.8h
+
+       // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+       add             v2.4s, v2.4s, v3.4s
+       eor             v4.16b, v1.16b, v2.16b
+       shl             v1.4s, v4.4s, #12
+       sri             v1.4s, v4.4s, #20
+
+       // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+       add             v0.4s, v0.4s, v1.4s
+       eor             v3.16b, v3.16b, v0.16b
+       tbl             v3.16b, {v3.16b}, v12.16b
+
+       // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+       add             v2.4s, v2.4s, v3.4s
+       eor             v4.16b, v1.16b, v2.16b
+       shl             v1.4s, v4.4s, #7
+       sri             v1.4s, v4.4s, #25
+
+       // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+       ext             v1.16b, v1.16b, v1.16b, #12
+       // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+       ext             v2.16b, v2.16b, v2.16b, #8
+       // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+       ext             v3.16b, v3.16b, v3.16b, #4
+
+       subs            w3, w3, #2
+       b.ne            .Ldoubleround
+
+       ret
+SYM_FUNC_END(chacha_permute)
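
[Note: chacha_permute above vectorizes the standard ChaCha double round; each NEON instruction applies one quarter-round step to all four columns at once, and after the ext-based rotation of v1-v3 to all four diagonals. The scalar sketch below shows the quarter round that the rev32, shl+sri and tbl sequences implement (rotations 16, 12, 8, 7); it is a reference model, not the kernel's C code.]

        #include <stdint.h>

        static inline uint32_t rol32(uint32_t x, int n)
        {
                return (x << n) | (x >> (32 - n));
        }

        /* One ChaCha quarter round on four state words. */
        static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
        {
                *a += *b; *d = rol32(*d ^ *a, 16);  /* rev32 on 16-bit halves */
                *c += *d; *b = rol32(*b ^ *c, 12);  /* shl #12 + sri #20      */
                *a += *b; *d = rol32(*d ^ *a, 8);   /* tbl with the ROT8 table */
                *c += *d; *b = rol32(*b ^ *c, 7);   /* shl #7 + sri #25       */
        }
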
+
+SYM_FUNC_START(chacha_block_xor_neon)
+       // x0: Input state matrix, s
+       // x1: 1 data block output, o
+       // x2: 1 data block input, i
+       // w3: nrounds
+
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
+
+       // x0..3 = s0..3
+       ld1             {v0.4s-v3.4s}, [x0]
+       ld1             {v8.4s-v11.4s}, [x0]
+
+       bl              chacha_permute
+
+       ld1             {v4.16b-v7.16b}, [x2]
+
+       // o0 = i0 ^ (x0 + s0)
+       add             v0.4s, v0.4s, v8.4s
+       eor             v0.16b, v0.16b, v4.16b
+
+       // o1 = i1 ^ (x1 + s1)
+       add             v1.4s, v1.4s, v9.4s
+       eor             v1.16b, v1.16b, v5.16b
+
+       // o2 = i2 ^ (x2 + s2)
+       add             v2.4s, v2.4s, v10.4s
+       eor             v2.16b, v2.16b, v6.16b
+
+       // o3 = i3 ^ (x3 + s3)
+       add             v3.4s, v3.4s, v11.4s
+       eor             v3.16b, v3.16b, v7.16b
+
+       st1             {v0.16b-v3.16b}, [x1]
+
+       ldp             x29, x30, [sp], #16
+       ret
+SYM_FUNC_END(chacha_block_xor_neon)
+
+SYM_FUNC_START(hchacha_block_neon)
+       // x0: Input state matrix, s
+       // x1: output (8 32-bit words)
+       // w2: nrounds
+
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
+
+       ld1             {v0.4s-v3.4s}, [x0]
+
+       mov             w3, w2
+       bl              chacha_permute
+
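+       // The HChaCha output is rows 0 and 3 of the permuted state (words
+       // 0-3 and 12-15); unlike the full block function, there is no
+       // feed-forward addition of the original state.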
+       st1             {v0.4s}, [x1], #16
+       st1             {v3.4s}, [x1]
+
+       ldp             x29, x30, [sp], #16
+       ret
+SYM_FUNC_END(hchacha_block_neon)
+
+       a0              .req    w12
+       a1              .req    w13
+       a2              .req    w14
+       a3              .req    w15
+       a4              .req    w16
+       a5              .req    w17
+       a6              .req    w19
+       a7              .req    w20
+       a8              .req    w21
+       a9              .req    w22
+       a10             .req    w23
+       a11             .req    w24
+       a12             .req    w25
+       a13             .req    w26
+       a14             .req    w27
+       a15             .req    w28
+
+       .align          6
+SYM_FUNC_START(chacha_4block_xor_neon)
+       frame_push      10
+
+       // x0: Input state matrix, s
+       // x1: 4 data blocks output, o
+       // x2: 4 data blocks input, i
+       // w3: nrounds
+       // x4: byte count
+
+       adr_l           x10, .Lpermute
+       and             x5, x4, #63
+       add             x10, x10, x5
+
+       //
+       // This function encrypts four consecutive ChaCha blocks by loading
+       // the state matrix into NEON registers four times.  The algorithm
+       // performs each operation on the corresponding word of each state
+       // matrix in parallel, hence no word shuffling is required.  For the
+       // final XOR step we transpose the matrix by interleaving 32-bit and
+       // then 64-bit words, which allows us to do the XOR in NEON registers
+       // (see the sketch in the comment below).
+       //
+       // At the same time, a fifth block is encrypted in parallel using
+       // scalar registers.
+       //
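+       // A rough C model of that transpose (illustration only, not part of
+       // the build; zip32(), zip64() and transpose_4x4() are made-up names
+       // mirroring the zip1/zip2 sequence used below):
+       //
+       //      /* interleave a and b at 32-bit granularity (zip1/zip2 .4s) */
+       //      static void zip32(const u32 a[4], const u32 b[4],
+       //                        u32 lo[4], u32 hi[4])
+       //      {
+       //              lo[0] = a[0]; lo[1] = b[0]; lo[2] = a[1]; lo[3] = b[1];
+       //              hi[0] = a[2]; hi[1] = b[2]; hi[2] = a[3]; hi[3] = b[3];
+       //      }
+       //
+       //      /* interleave a and b at 64-bit granularity (zip1/zip2 .2d) */
+       //      static void zip64(const u32 a[4], const u32 b[4],
+       //                        u32 lo[4], u32 hi[4])
+       //      {
+       //              lo[0] = a[0]; lo[1] = a[1]; lo[2] = b[0]; lo[3] = b[1];
+       //              hi[0] = a[2]; hi[1] = a[3]; hi[2] = b[2]; hi[3] = b[3];
+       //      }
+       //
+       //      /* transpose a 4x4 matrix of words: rows in, columns out */
+       //      static void transpose_4x4(const u32 row[4][4], u32 col[4][4])
+       //      {
+       //              u32 t[4][4];
+       //
+       //              zip32(row[0], row[1], t[0], t[1]);
+       //              zip32(row[2], row[3], t[2], t[3]);
+       //              zip64(t[0], t[2], col[0], col[1]);
+       //              zip64(t[1], t[3], col[2], col[3]);
+       //      }
+       //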
+       adr_l           x9, CTRINC              // ... and ROT8
+       ld1             {v30.4s-v31.4s}, [x9]
+
+       // x0..15[0-3] = s0..3[0..3]
+       add             x8, x0, #16
+       ld4r            { v0.4s- v3.4s}, [x0]
+       ld4r            { v4.4s- v7.4s}, [x8], #16
+       ld4r            { v8.4s-v11.4s}, [x8], #16
+       ld4r            {v12.4s-v15.4s}, [x8]
+
+       mov             a0, v0.s[0]
+       mov             a1, v1.s[0]
+       mov             a2, v2.s[0]
+       mov             a3, v3.s[0]
+       mov             a4, v4.s[0]
+       mov             a5, v5.s[0]
+       mov             a6, v6.s[0]
+       mov             a7, v7.s[0]
+       mov             a8, v8.s[0]
+       mov             a9, v9.s[0]
+       mov             a10, v10.s[0]
+       mov             a11, v11.s[0]
+       mov             a12, v12.s[0]
+       mov             a13, v13.s[0]
+       mov             a14, v14.s[0]
+       mov             a15, v15.s[0]
+
+       // x12 += counter values 1-4
+       add             v12.4s, v12.4s, v30.4s
+
+.Ldoubleround4:
+       // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+       // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+       // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+       // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+       add             v0.4s, v0.4s, v4.4s
+         add           a0, a0, a4
+       add             v1.4s, v1.4s, v5.4s
+         add           a1, a1, a5
+       add             v2.4s, v2.4s, v6.4s
+         add           a2, a2, a6
+       add             v3.4s, v3.4s, v7.4s
+         add           a3, a3, a7
+
+       eor             v12.16b, v12.16b, v0.16b
+         eor           a12, a12, a0
+       eor             v13.16b, v13.16b, v1.16b
+         eor           a13, a13, a1
+       eor             v14.16b, v14.16b, v2.16b
+         eor           a14, a14, a2
+       eor             v15.16b, v15.16b, v3.16b
+         eor           a15, a15, a3
+
+       rev32           v12.8h, v12.8h
+         ror           a12, a12, #16
+       rev32           v13.8h, v13.8h
+         ror           a13, a13, #16
+       rev32           v14.8h, v14.8h
+         ror           a14, a14, #16
+       rev32           v15.8h, v15.8h
+         ror           a15, a15, #16
+
+       // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+       // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+       // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+       // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+       add             v8.4s, v8.4s, v12.4s
+         add           a8, a8, a12
+       add             v9.4s, v9.4s, v13.4s
+         add           a9, a9, a13
+       add             v10.4s, v10.4s, v14.4s
+         add           a10, a10, a14
+       add             v11.4s, v11.4s, v15.4s
+         add           a11, a11, a15
+
+       eor             v16.16b, v4.16b, v8.16b
+         eor           a4, a4, a8
+       eor             v17.16b, v5.16b, v9.16b
+         eor           a5, a5, a9
+       eor             v18.16b, v6.16b, v10.16b
+         eor           a6, a6, a10
+       eor             v19.16b, v7.16b, v11.16b
+         eor           a7, a7, a11
+
+       shl             v4.4s, v16.4s, #12
+       shl             v5.4s, v17.4s, #12
+       shl             v6.4s, v18.4s, #12
+       shl             v7.4s, v19.4s, #12
+
+       sri             v4.4s, v16.4s, #20
+         ror           a4, a4, #20
+       sri             v5.4s, v17.4s, #20
+         ror           a5, a5, #20
+       sri             v6.4s, v18.4s, #20
+         ror           a6, a6, #20
+       sri             v7.4s, v19.4s, #20
+         ror           a7, a7, #20
+
+       // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+       // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+       // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+       // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+       add             v0.4s, v0.4s, v4.4s
+         add           a0, a0, a4
+       add             v1.4s, v1.4s, v5.4s
+         add           a1, a1, a5
+       add             v2.4s, v2.4s, v6.4s
+         add           a2, a2, a6
+       add             v3.4s, v3.4s, v7.4s
+         add           a3, a3, a7
+
+       eor             v12.16b, v12.16b, v0.16b
+         eor           a12, a12, a0
+       eor             v13.16b, v13.16b, v1.16b
+         eor           a13, a13, a1
+       eor             v14.16b, v14.16b, v2.16b
+         eor           a14, a14, a2
+       eor             v15.16b, v15.16b, v3.16b
+         eor           a15, a15, a3
+
+       tbl             v12.16b, {v12.16b}, v31.16b
+         ror           a12, a12, #24
+       tbl             v13.16b, {v13.16b}, v31.16b
+         ror           a13, a13, #24
+       tbl             v14.16b, {v14.16b}, v31.16b
+         ror           a14, a14, #24
+       tbl             v15.16b, {v15.16b}, v31.16b
+         ror           a15, a15, #24
+
+       // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+       // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+       // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+       // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+       add             v8.4s, v8.4s, v12.4s
+         add           a8, a8, a12
+       add             v9.4s, v9.4s, v13.4s
+         add           a9, a9, a13
+       add             v10.4s, v10.4s, v14.4s
+         add           a10, a10, a14
+       add             v11.4s, v11.4s, v15.4s
+         add           a11, a11, a15
+
+       eor             v16.16b, v4.16b, v8.16b
+         eor           a4, a4, a8
+       eor             v17.16b, v5.16b, v9.16b
+         eor           a5, a5, a9
+       eor             v18.16b, v6.16b, v10.16b
+         eor           a6, a6, a10
+       eor             v19.16b, v7.16b, v11.16b
+         eor           a7, a7, a11
+
+       shl             v4.4s, v16.4s, #7
+       shl             v5.4s, v17.4s, #7
+       shl             v6.4s, v18.4s, #7
+       shl             v7.4s, v19.4s, #7
+
+       sri             v4.4s, v16.4s, #25
+         ror           a4, a4, #25
+       sri             v5.4s, v17.4s, #25
+         ror           a5, a5, #25
+       sri             v6.4s, v18.4s, #25
+         ror           a6, a6, #25
+       sri             v7.4s, v19.4s, #25
+         ror           a7, a7, #25
+
+       // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+       // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+       // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+       // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+       add             v0.4s, v0.4s, v5.4s
+         add           a0, a0, a5
+       add             v1.4s, v1.4s, v6.4s
+         add           a1, a1, a6
+       add             v2.4s, v2.4s, v7.4s
+         add           a2, a2, a7
+       add             v3.4s, v3.4s, v4.4s
+         add           a3, a3, a4
+
+       eor             v15.16b, v15.16b, v0.16b
+         eor           a15, a15, a0
+       eor             v12.16b, v12.16b, v1.16b
+         eor           a12, a12, a1
+       eor             v13.16b, v13.16b, v2.16b
+         eor           a13, a13, a2
+       eor             v14.16b, v14.16b, v3.16b
+         eor           a14, a14, a3
+
+       rev32           v15.8h, v15.8h
+         ror           a15, a15, #16
+       rev32           v12.8h, v12.8h
+         ror           a12, a12, #16
+       rev32           v13.8h, v13.8h
+         ror           a13, a13, #16
+       rev32           v14.8h, v14.8h
+         ror           a14, a14, #16
+
+       // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+       // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+       // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+       // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+       add             v10.4s, v10.4s, v15.4s
+         add           a10, a10, a15
+       add             v11.4s, v11.4s, v12.4s
+         add           a11, a11, a12
+       add             v8.4s, v8.4s, v13.4s
+         add           a8, a8, a13
+       add             v9.4s, v9.4s, v14.4s
+         add           a9, a9, a14
+
+       eor             v16.16b, v5.16b, v10.16b
+         eor           a5, a5, a10
+       eor             v17.16b, v6.16b, v11.16b
+         eor           a6, a6, a11
+       eor             v18.16b, v7.16b, v8.16b
+         eor           a7, a7, a8
+       eor             v19.16b, v4.16b, v9.16b
+         eor           a4, a4, a9
+
+       shl             v5.4s, v16.4s, #12
+       shl             v6.4s, v17.4s, #12
+       shl             v7.4s, v18.4s, #12
+       shl             v4.4s, v19.4s, #12
+
+       sri             v5.4s, v16.4s, #20
+         ror           a5, a5, #20
+       sri             v6.4s, v17.4s, #20
+         ror           a6, a6, #20
+       sri             v7.4s, v18.4s, #20
+         ror           a7, a7, #20
+       sri             v4.4s, v19.4s, #20
+         ror           a4, a4, #20
+
+       // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+       // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+       // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+       // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+       add             v0.4s, v0.4s, v5.4s
+         add           a0, a0, a5
+       add             v1.4s, v1.4s, v6.4s
+         add           a1, a1, a6
+       add             v2.4s, v2.4s, v7.4s
+         add           a2, a2, a7
+       add             v3.4s, v3.4s, v4.4s
+         add           a3, a3, a4
+
+       eor             v15.16b, v15.16b, v0.16b
+         eor           a15, a15, a0
+       eor             v12.16b, v12.16b, v1.16b
+         eor           a12, a12, a1
+       eor             v13.16b, v13.16b, v2.16b
+         eor           a13, a13, a2
+       eor             v14.16b, v14.16b, v3.16b
+         eor           a14, a14, a3
+
+       tbl             v15.16b, {v15.16b}, v31.16b
+         ror           a15, a15, #24
+       tbl             v12.16b, {v12.16b}, v31.16b
+         ror           a12, a12, #24
+       tbl             v13.16b, {v13.16b}, v31.16b
+         ror           a13, a13, #24
+       tbl             v14.16b, {v14.16b}, v31.16b
+         ror           a14, a14, #24
+
+       // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+       // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+       // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+       // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+       add             v10.4s, v10.4s, v15.4s
+         add           a10, a10, a15
+       add             v11.4s, v11.4s, v12.4s
+         add           a11, a11, a12
+       add             v8.4s, v8.4s, v13.4s
+         add           a8, a8, a13
+       add             v9.4s, v9.4s, v14.4s
+         add           a9, a9, a14
+
+       eor             v16.16b, v5.16b, v10.16b
+         eor           a5, a5, a10
+       eor             v17.16b, v6.16b, v11.16b
+         eor           a6, a6, a11
+       eor             v18.16b, v7.16b, v8.16b
+         eor           a7, a7, a8
+       eor             v19.16b, v4.16b, v9.16b
+         eor           a4, a4, a9
+
+       shl             v5.4s, v16.4s, #7
+       shl             v6.4s, v17.4s, #7
+       shl             v7.4s, v18.4s, #7
+       shl             v4.4s, v19.4s, #7
+
+       sri             v5.4s, v16.4s, #25
+         ror           a5, a5, #25
+       sri             v6.4s, v17.4s, #25
+         ror           a6, a6, #25
+       sri             v7.4s, v18.4s, #25
+         ror           a7, a7, #25
+       sri             v4.4s, v19.4s, #25
+         ror           a4, a4, #25
+
+       subs            w3, w3, #2
+       b.ne            .Ldoubleround4
+
+       ld4r            {v16.4s-v19.4s}, [x0], #16
+       ld4r            {v20.4s-v23.4s}, [x0], #16
+
+       // x12 += counter values 0-3
+       add             v12.4s, v12.4s, v30.4s
+
+       // x0[0-3] += s0[0]
+       // x1[0-3] += s0[1]
+       // x2[0-3] += s0[2]
+       // x3[0-3] += s0[3]
+       add             v0.4s, v0.4s, v16.4s
+         mov           w6, v16.s[0]
+         mov           w7, v17.s[0]
+       add             v1.4s, v1.4s, v17.4s
+         mov           w8, v18.s[0]
+         mov           w9, v19.s[0]
+       add             v2.4s, v2.4s, v18.4s
+         add           a0, a0, w6
+         add           a1, a1, w7
+       add             v3.4s, v3.4s, v19.4s
+         add           a2, a2, w8
+         add           a3, a3, w9
+CPU_BE(          rev           a0, a0          )
+CPU_BE(          rev           a1, a1          )
+CPU_BE(          rev           a2, a2          )
+CPU_BE(          rev           a3, a3          )
+
+       ld4r            {v24.4s-v27.4s}, [x0], #16
+       ld4r            {v28.4s-v31.4s}, [x0]
+
+       // x4[0-3] += s1[0]
+       // x5[0-3] += s1[1]
+       // x6[0-3] += s1[2]
+       // x7[0-3] += s1[3]
+       add             v4.4s, v4.4s, v20.4s
+         mov           w6, v20.s[0]
+         mov           w7, v21.s[0]
+       add             v5.4s, v5.4s, v21.4s
+         mov           w8, v22.s[0]
+         mov           w9, v23.s[0]
+       add             v6.4s, v6.4s, v22.4s
+         add           a4, a4, w6
+         add           a5, a5, w7
+       add             v7.4s, v7.4s, v23.4s
+         add           a6, a6, w8
+         add           a7, a7, w9
+CPU_BE(          rev           a4, a4          )
+CPU_BE(          rev           a5, a5          )
+CPU_BE(          rev           a6, a6          )
+CPU_BE(          rev           a7, a7          )
+
+       // x8[0-3] += s2[0]
+       // x9[0-3] += s2[1]
+       // x10[0-3] += s2[2]
+       // x11[0-3] += s2[3]
+       add             v8.4s, v8.4s, v24.4s
+         mov           w6, v24.s[0]
+         mov           w7, v25.s[0]
+       add             v9.4s, v9.4s, v25.4s
+         mov           w8, v26.s[0]
+         mov           w9, v27.s[0]
+       add             v10.4s, v10.4s, v26.4s
+         add           a8, a8, w6
+         add           a9, a9, w7
+       add             v11.4s, v11.4s, v27.4s
+         add           a10, a10, w8
+         add           a11, a11, w9
+CPU_BE(          rev           a8, a8          )
+CPU_BE(          rev           a9, a9          )
+CPU_BE(          rev           a10, a10        )
+CPU_BE(          rev           a11, a11        )
+
+       // x12[0-3] += s3[0]
+       // x13[0-3] += s3[1]
+       // x14[0-3] += s3[2]
+       // x15[0-3] += s3[3]
+       add             v12.4s, v12.4s, v28.4s
+         mov           w6, v28.s[0]
+         mov           w7, v29.s[0]
+       add             v13.4s, v13.4s, v29.4s
+         mov           w8, v30.s[0]
+         mov           w9, v31.s[0]
+       add             v14.4s, v14.4s, v30.4s
+         add           a12, a12, w6
+         add           a13, a13, w7
+       add             v15.4s, v15.4s, v31.4s
+         add           a14, a14, w8
+         add           a15, a15, w9
+CPU_BE(          rev           a12, a12        )
+CPU_BE(          rev           a13, a13        )
+CPU_BE(          rev           a14, a14        )
+CPU_BE(          rev           a15, a15        )
+
+       // interleave 32-bit words in state n, n+1
+         ldp           w6, w7, [x2], #64
+       zip1            v16.4s, v0.4s, v1.4s
+         ldp           w8, w9, [x2, #-56]
+         eor           a0, a0, w6
+       zip2            v17.4s, v0.4s, v1.4s
+         eor           a1, a1, w7
+       zip1            v18.4s, v2.4s, v3.4s
+         eor           a2, a2, w8
+       zip2            v19.4s, v2.4s, v3.4s
+         eor           a3, a3, w9
+         ldp           w6, w7, [x2, #-48]
+       zip1            v20.4s, v4.4s, v5.4s
+         ldp           w8, w9, [x2, #-40]
+         eor           a4, a4, w6
+       zip2            v21.4s, v4.4s, v5.4s
+         eor           a5, a5, w7
+       zip1            v22.4s, v6.4s, v7.4s
+         eor           a6, a6, w8
+       zip2            v23.4s, v6.4s, v7.4s
+         eor           a7, a7, w9
+         ldp           w6, w7, [x2, #-32]
+       zip1            v24.4s, v8.4s, v9.4s
+         ldp           w8, w9, [x2, #-24]
+         eor           a8, a8, w6
+       zip2            v25.4s, v8.4s, v9.4s
+         eor           a9, a9, w7
+       zip1            v26.4s, v10.4s, v11.4s
+         eor           a10, a10, w8
+       zip2            v27.4s, v10.4s, v11.4s
+         eor           a11, a11, w9
+         ldp           w6, w7, [x2, #-16]
+       zip1            v28.4s, v12.4s, v13.4s
+         ldp           w8, w9, [x2, #-8]
+         eor           a12, a12, w6
+       zip2            v29.4s, v12.4s, v13.4s
+         eor           a13, a13, w7
+       zip1            v30.4s, v14.4s, v15.4s
+         eor           a14, a14, w8
+       zip2            v31.4s, v14.4s, v15.4s
+         eor           a15, a15, w9
+
+       add             x3, x2, x4
+       sub             x3, x3, #128            // start of last block
+
+       subs            x5, x4, #128
+       csel            x2, x2, x3, ge
+
+       // interleave 64-bit words in state n, n+2
+       zip1            v0.2d, v16.2d, v18.2d
+       zip2            v4.2d, v16.2d, v18.2d
+         stp           a0, a1, [x1], #64
+       zip1            v8.2d, v17.2d, v19.2d
+       zip2            v12.2d, v17.2d, v19.2d
+         stp           a2, a3, [x1, #-56]
+
+       subs            x6, x4, #192
+       ld1             {v16.16b-v19.16b}, [x2], #64
+       csel            x2, x2, x3, ge
+
+       zip1            v1.2d, v20.2d, v22.2d
+       zip2            v5.2d, v20.2d, v22.2d
+         stp           a4, a5, [x1, #-48]
+       zip1            v9.2d, v21.2d, v23.2d
+       zip2            v13.2d, v21.2d, v23.2d
+         stp           a6, a7, [x1, #-40]
+
+       subs            x7, x4, #256
+       ld1             {v20.16b-v23.16b}, [x2], #64
+       csel            x2, x2, x3, ge
+
+       zip1            v2.2d, v24.2d, v26.2d
+       zip2            v6.2d, v24.2d, v26.2d
+         stp           a8, a9, [x1, #-32]
+       zip1            v10.2d, v25.2d, v27.2d
+       zip2            v14.2d, v25.2d, v27.2d
+         stp           a10, a11, [x1, #-24]
+
+       subs            x8, x4, #320
+       ld1             {v24.16b-v27.16b}, [x2], #64
+       csel            x2, x2, x3, ge
+
+       zip1            v3.2d, v28.2d, v30.2d
+       zip2            v7.2d, v28.2d, v30.2d
+         stp           a12, a13, [x1, #-16]
+       zip1            v11.2d, v29.2d, v31.2d
+       zip2            v15.2d, v29.2d, v31.2d
+         stp           a14, a15, [x1, #-8]
+
+       tbnz            x5, #63, .Lt128
+       ld1             {v28.16b-v31.16b}, [x2]
+
+       // xor with corresponding input, write to output
+       eor             v16.16b, v16.16b, v0.16b
+       eor             v17.16b, v17.16b, v1.16b
+       eor             v18.16b, v18.16b, v2.16b
+       eor             v19.16b, v19.16b, v3.16b
+
+       tbnz            x6, #63, .Lt192
+
+       eor             v20.16b, v20.16b, v4.16b
+       eor             v21.16b, v21.16b, v5.16b
+       eor             v22.16b, v22.16b, v6.16b
+       eor             v23.16b, v23.16b, v7.16b
+
+       st1             {v16.16b-v19.16b}, [x1], #64
+       tbnz            x7, #63, .Lt256
+
+       eor             v24.16b, v24.16b, v8.16b
+       eor             v25.16b, v25.16b, v9.16b
+       eor             v26.16b, v26.16b, v10.16b
+       eor             v27.16b, v27.16b, v11.16b
+
+       st1             {v20.16b-v23.16b}, [x1], #64
+       tbnz            x8, #63, .Lt320
+
+       eor             v28.16b, v28.16b, v12.16b
+       eor             v29.16b, v29.16b, v13.16b
+       eor             v30.16b, v30.16b, v14.16b
+       eor             v31.16b, v31.16b, v15.16b
+
+       st1             {v24.16b-v27.16b}, [x1], #64
+       st1             {v28.16b-v31.16b}, [x1]
+
+.Lout: frame_pop
+       ret
+
+       // fewer than 192 bytes of in/output
+.Lt192:        cbz             x5, 1f                          // exactly 128 bytes?
+       ld1             {v28.16b-v31.16b}, [x10]
+       add             x5, x5, x1
+       tbl             v28.16b, {v4.16b-v7.16b}, v28.16b
+       tbl             v29.16b, {v4.16b-v7.16b}, v29.16b
+       tbl             v30.16b, {v4.16b-v7.16b}, v30.16b
+       tbl             v31.16b, {v4.16b-v7.16b}, v31.16b
+
+0:     eor             v20.16b, v20.16b, v28.16b
+       eor             v21.16b, v21.16b, v29.16b
+       eor             v22.16b, v22.16b, v30.16b
+       eor             v23.16b, v23.16b, v31.16b
+       st1             {v20.16b-v23.16b}, [x5]         // overlapping stores
+1:     st1             {v16.16b-v19.16b}, [x1]
+       b               .Lout
+
+       // fewer than 128 bytes of in/output
+.Lt128:        ld1             {v28.16b-v31.16b}, [x10]
+       add             x5, x5, x1
+       sub             x1, x1, #64
+       tbl             v28.16b, {v0.16b-v3.16b}, v28.16b
+       tbl             v29.16b, {v0.16b-v3.16b}, v29.16b
+       tbl             v30.16b, {v0.16b-v3.16b}, v30.16b
+       tbl             v31.16b, {v0.16b-v3.16b}, v31.16b
+       ld1             {v16.16b-v19.16b}, [x1]         // reload first output block
+       b               0b
+
+       // fewer than 256 bytes of in/output
+.Lt256:        cbz             x6, 2f                          // exactly 192 bytes?
+       ld1             {v4.16b-v7.16b}, [x10]
+       add             x6, x6, x1
+       tbl             v0.16b, {v8.16b-v11.16b}, v4.16b
+       tbl             v1.16b, {v8.16b-v11.16b}, v5.16b
+       tbl             v2.16b, {v8.16b-v11.16b}, v6.16b
+       tbl             v3.16b, {v8.16b-v11.16b}, v7.16b
+
+       eor             v28.16b, v28.16b, v0.16b
+       eor             v29.16b, v29.16b, v1.16b
+       eor             v30.16b, v30.16b, v2.16b
+       eor             v31.16b, v31.16b, v3.16b
+       st1             {v28.16b-v31.16b}, [x6]         // overlapping stores
+2:     st1             {v20.16b-v23.16b}, [x1]
+       b               .Lout
+
+       // fewer than 320 bytes of in/output
+.Lt320:        cbz             x7, 3f                          // exactly 256 bytes?
+       ld1             {v4.16b-v7.16b}, [x10]
+       add             x7, x7, x1
+       tbl             v0.16b, {v12.16b-v15.16b}, v4.16b
+       tbl             v1.16b, {v12.16b-v15.16b}, v5.16b
+       tbl             v2.16b, {v12.16b-v15.16b}, v6.16b
+       tbl             v3.16b, {v12.16b-v15.16b}, v7.16b
+
+       eor             v28.16b, v28.16b, v0.16b
+       eor             v29.16b, v29.16b, v1.16b
+       eor             v30.16b, v30.16b, v2.16b
+       eor             v31.16b, v31.16b, v3.16b
+       st1             {v28.16b-v31.16b}, [x7]         // overlapping stores
+3:     st1             {v24.16b-v27.16b}, [x1]
+       b               .Lout
+SYM_FUNC_END(chacha_4block_xor_neon)
+
+       .section        ".rodata", "a", %progbits
+       .align          L1_CACHE_SHIFT
+.Lpermute:
+       .set            .Li, 0
+       .rept           128
+       .byte           (.Li - 64)
+       .set            .Li, .Li + 1
+       .endr
+
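+// CTRINC supplies the counter increments for the four NEON blocks; ROT8 is a
+// tbl permutation that rotates each 32-bit lane left by 8 bits.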
+CTRINC:        .word           1, 2, 3, 4
+ROT8:  .word           0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
diff --git a/lib/crypto/arm64/chacha-neon-glue.c b/lib/crypto/arm64/chacha-neon-glue.c
new file mode 100644 (file)
index 0000000..d0188f9
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ * ChaCha and HChaCha functions (ARM64 optimized)
+ *
+ * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
+                                     u8 *dst, const u8 *src, int nrounds);
+asmlinkage void chacha_4block_xor_neon(const struct chacha_state *state,
+                                      u8 *dst, const u8 *src,
+                                      int nrounds, int bytes);
+asmlinkage void hchacha_block_neon(const struct chacha_state *state,
+                                  u32 out[HCHACHA_OUT_WORDS], int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+static void chacha_doneon(struct chacha_state *state, u8 *dst, const u8 *src,
+                         int bytes, int nrounds)
+{
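+       /*
+        * Each pass hands the NEON code up to five blocks' worth of data:
+        * chacha_4block_xor_neon() processes four blocks in NEON registers
+        * plus a fifth in scalar registers, and handles a partial trailing
+        * block itself.  A remainder of at most one block is instead bounced
+        * through a stack buffer so that chacha_block_xor_neon() always sees
+        * a full-sized buffer.
+        */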
+       while (bytes > 0) {
+               int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
+
+               if (l <= CHACHA_BLOCK_SIZE) {
+                       u8 buf[CHACHA_BLOCK_SIZE];
+
+                       memcpy(buf, src, l);
+                       chacha_block_xor_neon(state, buf, buf, nrounds);
+                       memcpy(dst, buf, l);
+                       state->x[12] += 1;
+                       break;
+               }
+               chacha_4block_xor_neon(state, dst, src, nrounds, l);
+               bytes -= l;
+               src += l;
+               dst += l;
+               state->x[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+       }
+}
+
+void hchacha_block_arch(const struct chacha_state *state,
+                       u32 out[HCHACHA_OUT_WORDS], int nrounds)
+{
+       if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
+               hchacha_block_generic(state, out, nrounds);
+       } else {
+               kernel_neon_begin();
+               hchacha_block_neon(state, out, nrounds);
+               kernel_neon_end();
+       }
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(struct chacha_state *state, u8 *dst, const u8 *src,
+                      unsigned int bytes, int nrounds)
+{
+       if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
+           !crypto_simd_usable())
+               return chacha_crypt_generic(state, dst, src, bytes, nrounds);
+
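+       /*
+        * Process at most 4K per iteration so that the time spent between
+        * kernel_neon_begin() and kernel_neon_end() stays bounded.
+        */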
+       do {
+               unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+               kernel_neon_begin();
+               chacha_doneon(state, dst, src, todo, nrounds);
+               kernel_neon_end();
+
+               bytes -= todo;
+               src += todo;
+               dst += todo;
+       } while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+       return static_key_enabled(&have_neon);
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+static int __init chacha_simd_mod_init(void)
+{
+       if (cpu_have_named_feature(ASIMD))
+               static_branch_enable(&have_neon);
+       return 0;
+}
+subsys_initcall(chacha_simd_mod_init);
+
+static void __exit chacha_simd_mod_exit(void)
+{
+}
+module_exit(chacha_simd_mod_exit);
+
+MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM64 optimized)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/arm64/poly1305-armv8.pl b/lib/crypto/arm64/poly1305-armv8.pl
new file mode 100644 (file)
index 0000000..22c9069
--- /dev/null
@@ -0,0 +1,917 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+# This module implements Poly1305 hash for ARMv8.
+#
+# June 2015
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone.
+#
+#              IALU/gcc-4.9    NEON
+#
+# Apple A7     1.86/+5%        0.72
+# Cortex-A53   2.69/+58%       1.47
+# Cortex-A57   2.70/+7%        1.14
+# Denver       1.64/+50%       1.18(*)
+# X-Gene       2.13/+68%       2.27
+# Mongoose     1.77/+75%       1.12
+# Kryo         2.70/+55%       1.13
+# ThunderX2    1.17/+95%       1.36
+#
+# (*)  the estimate based on resource availability is less than 1.0,
+#      i.e. the measured result is worse than expected, presumably
+#      because the binary translator is not almighty;
+
+$flavour=shift;
+$output=shift;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
+my ($mac,$nonce)=($inp,$len);
+
+my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
+
+$code.=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+.extern        OPENSSL_armcap_P
+#endif
+
+.text
+
+// forward "declarations" are required for Apple
+.globl poly1305_blocks
+.globl poly1305_emit
+
+.globl poly1305_init
+.type  poly1305_init,%function
+.align 5
+poly1305_init:
+       cmp     $inp,xzr
+       stp     xzr,xzr,[$ctx]          // zero hash value
+       stp     xzr,xzr,[$ctx,#16]      // [along with is_base2_26]
+
+       csel    x0,xzr,x0,eq
+       b.eq    .Lno_key
+
+#ifndef        __KERNEL__
+       adrp    x17,OPENSSL_armcap_P
+       ldr     w17,[x17,#:lo12:OPENSSL_armcap_P]
+#endif
+
+       ldp     $r0,$r1,[$inp]          // load key
+       mov     $s1,#0xfffffffc0fffffff
+       movk    $s1,#0x0fff,lsl#48
+#ifdef __AARCH64EB__
+       rev     $r0,$r0                 // flip bytes
+       rev     $r1,$r1
+#endif
+       and     $r0,$r0,$s1             // &=0ffffffc0fffffff
+       and     $s1,$s1,#-4
+       and     $r1,$r1,$s1             // &=0ffffffc0ffffffc
+       mov     w#$s1,#-1
+       stp     $r0,$r1,[$ctx,#32]      // save key value
+       str     w#$s1,[$ctx,#48]        // impossible key power value
+
+#ifndef        __KERNEL__
+       tst     w17,#ARMV7_NEON
+
+       adr     $d0,.Lpoly1305_blocks
+       adr     $r0,.Lpoly1305_blocks_neon
+       adr     $d1,.Lpoly1305_emit
+
+       csel    $d0,$d0,$r0,eq
+
+# ifdef        __ILP32__
+       stp     w#$d0,w#$d1,[$len]
+# else
+       stp     $d0,$d1,[$len]
+# endif
+#endif
+       mov     x0,#1
+.Lno_key:
+       ret
+.size  poly1305_init,.-poly1305_init
+
+.type  poly1305_blocks,%function
+.align 5
+poly1305_blocks:
+.Lpoly1305_blocks:
+       ands    $len,$len,#-16
+       b.eq    .Lno_data
+
+       ldp     $h0,$h1,[$ctx]          // load hash value
+       ldp     $h2,x17,[$ctx,#16]      // [along with is_base2_26]
+       ldp     $r0,$r1,[$ctx,#32]      // load key value
+
+#ifdef __AARCH64EB__
+       lsr     $d0,$h0,#32
+       mov     w#$d1,w#$h0
+       lsr     $d2,$h1,#32
+       mov     w15,w#$h1
+       lsr     x16,$h2,#32
+#else
+       mov     w#$d0,w#$h0
+       lsr     $d1,$h0,#32
+       mov     w#$d2,w#$h1
+       lsr     x15,$h1,#32
+       mov     w16,w#$h2
+#endif
+
+       add     $d0,$d0,$d1,lsl#26      // base 2^26 -> base 2^64
+       lsr     $d1,$d2,#12
+       adds    $d0,$d0,$d2,lsl#52
+       add     $d1,$d1,x15,lsl#14
+       adc     $d1,$d1,xzr
+       lsr     $d2,x16,#24
+       adds    $d1,$d1,x16,lsl#40
+       adc     $d2,$d2,xzr
+
+       cmp     x17,#0                  // is_base2_26?
+       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
+       csel    $h0,$h0,$d0,eq          // choose between radixes
+       csel    $h1,$h1,$d1,eq
+       csel    $h2,$h2,$d2,eq
+
+.Loop:
+       ldp     $t0,$t1,[$inp],#16      // load input
+       sub     $len,$len,#16
+#ifdef __AARCH64EB__
+       rev     $t0,$t0
+       rev     $t1,$t1
+#endif
+       adds    $h0,$h0,$t0             // accumulate input
+       adcs    $h1,$h1,$t1
+
+       mul     $d0,$h0,$r0             // h0*r0
+       adc     $h2,$h2,$padbit
+       umulh   $d1,$h0,$r0
+
+       mul     $t0,$h1,$s1             // h1*5*r1
+       umulh   $t1,$h1,$s1
+
+       adds    $d0,$d0,$t0
+       mul     $t0,$h0,$r1             // h0*r1
+       adc     $d1,$d1,$t1
+       umulh   $d2,$h0,$r1
+
+       adds    $d1,$d1,$t0
+       mul     $t0,$h1,$r0             // h1*r0
+       adc     $d2,$d2,xzr
+       umulh   $t1,$h1,$r0
+
+       adds    $d1,$d1,$t0
+       mul     $t0,$h2,$s1             // h2*5*r1
+       adc     $d2,$d2,$t1
+       mul     $t1,$h2,$r0             // h2*r0
+
+       adds    $d1,$d1,$t0
+       adc     $d2,$d2,$t1
+
+       and     $t0,$d2,#-4             // final reduction
+       and     $h2,$d2,#3
+       add     $t0,$t0,$d2,lsr#2
+       adds    $h0,$d0,$t0
+       adcs    $h1,$d1,xzr
+       adc     $h2,$h2,xzr
+
+       cbnz    $len,.Loop
+
+       stp     $h0,$h1,[$ctx]          // store hash value
+       stp     $h2,xzr,[$ctx,#16]      // [and clear is_base2_26]
+
+.Lno_data:
+       ret
+.size  poly1305_blocks,.-poly1305_blocks
+
+.type  poly1305_emit,%function
+.align 5
+poly1305_emit:
+.Lpoly1305_emit:
+       ldp     $h0,$h1,[$ctx]          // load hash base 2^64
+       ldp     $h2,$r0,[$ctx,#16]      // [along with is_base2_26]
+       ldp     $t0,$t1,[$nonce]        // load nonce
+
+#ifdef __AARCH64EB__
+       lsr     $d0,$h0,#32
+       mov     w#$d1,w#$h0
+       lsr     $d2,$h1,#32
+       mov     w15,w#$h1
+       lsr     x16,$h2,#32
+#else
+       mov     w#$d0,w#$h0
+       lsr     $d1,$h0,#32
+       mov     w#$d2,w#$h1
+       lsr     x15,$h1,#32
+       mov     w16,w#$h2
+#endif
+
+       add     $d0,$d0,$d1,lsl#26      // base 2^26 -> base 2^64
+       lsr     $d1,$d2,#12
+       adds    $d0,$d0,$d2,lsl#52
+       add     $d1,$d1,x15,lsl#14
+       adc     $d1,$d1,xzr
+       lsr     $d2,x16,#24
+       adds    $d1,$d1,x16,lsl#40
+       adc     $d2,$d2,xzr
+
+       cmp     $r0,#0                  // is_base2_26?
+       csel    $h0,$h0,$d0,eq          // choose between radixes
+       csel    $h1,$h1,$d1,eq
+       csel    $h2,$h2,$d2,eq
+
+       adds    $d0,$h0,#5              // compare to modulus
+       adcs    $d1,$h1,xzr
+       adc     $d2,$h2,xzr
+
+       tst     $d2,#-4                 // see if it's carried/borrowed
+
+       csel    $h0,$h0,$d0,eq
+       csel    $h1,$h1,$d1,eq
+
+#ifdef __AARCH64EB__
+       ror     $t0,$t0,#32             // flip nonce words
+       ror     $t1,$t1,#32
+#endif
+       adds    $h0,$h0,$t0             // accumulate nonce
+       adc     $h1,$h1,$t1
+#ifdef __AARCH64EB__
+       rev     $h0,$h0                 // flip output bytes
+       rev     $h1,$h1
+#endif
+       stp     $h0,$h1,[$mac]          // write result
+
+       ret
+.size  poly1305_emit,.-poly1305_emit
+___
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
+my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
+my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
+my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
+my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
+my ($T0,$T1,$MASK) = map("v$_",(29..31));
+
+my ($in2,$zeros)=("x16","x17");
+my $is_base2_26 = $zeros;              # borrow
+
+$code.=<<___;
+.type  poly1305_mult,%function
+.align 5
+poly1305_mult:
+       mul     $d0,$h0,$r0             // h0*r0
+       umulh   $d1,$h0,$r0
+
+       mul     $t0,$h1,$s1             // h1*5*r1
+       umulh   $t1,$h1,$s1
+
+       adds    $d0,$d0,$t0
+       mul     $t0,$h0,$r1             // h0*r1
+       adc     $d1,$d1,$t1
+       umulh   $d2,$h0,$r1
+
+       adds    $d1,$d1,$t0
+       mul     $t0,$h1,$r0             // h1*r0
+       adc     $d2,$d2,xzr
+       umulh   $t1,$h1,$r0
+
+       adds    $d1,$d1,$t0
+       mul     $t0,$h2,$s1             // h2*5*r1
+       adc     $d2,$d2,$t1
+       mul     $t1,$h2,$r0             // h2*r0
+
+       adds    $d1,$d1,$t0
+       adc     $d2,$d2,$t1
+
+       and     $t0,$d2,#-4             // final reduction
+       and     $h2,$d2,#3
+       add     $t0,$t0,$d2,lsr#2
+       adds    $h0,$d0,$t0
+       adcs    $h1,$d1,xzr
+       adc     $h2,$h2,xzr
+
+       ret
+.size  poly1305_mult,.-poly1305_mult
+
+.type  poly1305_splat,%function
+.align 4
+poly1305_splat:
+       and     x12,$h0,#0x03ffffff     // base 2^64 -> base 2^26
+       ubfx    x13,$h0,#26,#26
+       extr    x14,$h1,$h0,#52
+       and     x14,x14,#0x03ffffff
+       ubfx    x15,$h1,#14,#26
+       extr    x16,$h2,$h1,#40
+
+       str     w12,[$ctx,#16*0]        // r0
+       add     w12,w13,w13,lsl#2       // r1*5
+       str     w13,[$ctx,#16*1]        // r1
+       add     w13,w14,w14,lsl#2       // r2*5
+       str     w12,[$ctx,#16*2]        // s1
+       str     w14,[$ctx,#16*3]        // r2
+       add     w14,w15,w15,lsl#2       // r3*5
+       str     w13,[$ctx,#16*4]        // s2
+       str     w15,[$ctx,#16*5]        // r3
+       add     w15,w16,w16,lsl#2       // r4*5
+       str     w14,[$ctx,#16*6]        // s3
+       str     w16,[$ctx,#16*7]        // r4
+       str     w15,[$ctx,#16*8]        // s4
+
+       ret
+.size  poly1305_splat,.-poly1305_splat
+
+#ifdef __KERNEL__
+.globl poly1305_blocks_neon
+#endif
+.type  poly1305_blocks_neon,%function
+.align 5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+       ldr     $is_base2_26,[$ctx,#24]
+       cmp     $len,#128
+       b.lo    .Lpoly1305_blocks
+
+       .inst   0xd503233f              // paciasp
+       stp     x29,x30,[sp,#-80]!
+       add     x29,sp,#0
+
+       stp     d8,d9,[sp,#16]          // meet ABI requirements
+       stp     d10,d11,[sp,#32]
+       stp     d12,d13,[sp,#48]
+       stp     d14,d15,[sp,#64]
+
+       cbz     $is_base2_26,.Lbase2_64_neon
+
+       ldp     w10,w11,[$ctx]          // load hash value base 2^26
+       ldp     w12,w13,[$ctx,#8]
+       ldr     w14,[$ctx,#16]
+
+       tst     $len,#31
+       b.eq    .Leven_neon
+
+       ldp     $r0,$r1,[$ctx,#32]      // load key value
+
+       add     $h0,x10,x11,lsl#26      // base 2^26 -> base 2^64
+       lsr     $h1,x12,#12
+       adds    $h0,$h0,x12,lsl#52
+       add     $h1,$h1,x13,lsl#14
+       adc     $h1,$h1,xzr
+       lsr     $h2,x14,#24
+       adds    $h1,$h1,x14,lsl#40
+       adc     $d2,$h2,xzr             // can be partially reduced...
+
+       ldp     $d0,$d1,[$inp],#16      // load input
+       sub     $len,$len,#16
+       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
+
+#ifdef __AARCH64EB__
+       rev     $d0,$d0
+       rev     $d1,$d1
+#endif
+       adds    $h0,$h0,$d0             // accumulate input
+       adcs    $h1,$h1,$d1
+       adc     $h2,$h2,$padbit
+
+       bl      poly1305_mult
+
+       and     x10,$h0,#0x03ffffff     // base 2^64 -> base 2^26
+       ubfx    x11,$h0,#26,#26
+       extr    x12,$h1,$h0,#52
+       and     x12,x12,#0x03ffffff
+       ubfx    x13,$h1,#14,#26
+       extr    x14,$h2,$h1,#40
+
+       b       .Leven_neon
+
+.align 4
+.Lbase2_64_neon:
+       ldp     $r0,$r1,[$ctx,#32]      // load key value
+
+       ldp     $h0,$h1,[$ctx]          // load hash value base 2^64
+       ldr     $h2,[$ctx,#16]
+
+       tst     $len,#31
+       b.eq    .Linit_neon
+
+       ldp     $d0,$d1,[$inp],#16      // load input
+       sub     $len,$len,#16
+       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
+#ifdef __AARCH64EB__
+       rev     $d0,$d0
+       rev     $d1,$d1
+#endif
+       adds    $h0,$h0,$d0             // accumulate input
+       adcs    $h1,$h1,$d1
+       adc     $h2,$h2,$padbit
+
+       bl      poly1305_mult
+
+.Linit_neon:
+       ldr     w17,[$ctx,#48]          // first table element
+       and     x10,$h0,#0x03ffffff     // base 2^64 -> base 2^26
+       ubfx    x11,$h0,#26,#26
+       extr    x12,$h1,$h0,#52
+       and     x12,x12,#0x03ffffff
+       ubfx    x13,$h1,#14,#26
+       extr    x14,$h2,$h1,#40
+
+       cmp     w17,#-1                 // is value impossible?
+       b.ne    .Leven_neon
+
+       fmov    ${H0},x10
+       fmov    ${H1},x11
+       fmov    ${H2},x12
+       fmov    ${H3},x13
+       fmov    ${H4},x14
+
+       ////////////////////////////////// initialize r^n table
+       mov     $h0,$r0                 // r^1
+       add     $s1,$r1,$r1,lsr#2       // s1 = r1 + (r1 >> 2)
+       mov     $h1,$r1
+       mov     $h2,xzr
+       add     $ctx,$ctx,#48+12
+       bl      poly1305_splat
+
+       bl      poly1305_mult           // r^2
+       sub     $ctx,$ctx,#4
+       bl      poly1305_splat
+
+       bl      poly1305_mult           // r^3
+       sub     $ctx,$ctx,#4
+       bl      poly1305_splat
+
+       bl      poly1305_mult           // r^4
+       sub     $ctx,$ctx,#4
+       bl      poly1305_splat
+       sub     $ctx,$ctx,#48           // restore original $ctx
+       b       .Ldo_neon
+
+.align 4
+.Leven_neon:
+       fmov    ${H0},x10
+       fmov    ${H1},x11
+       fmov    ${H2},x12
+       fmov    ${H3},x13
+       fmov    ${H4},x14
+
+.Ldo_neon:
+       ldp     x8,x12,[$inp,#32]       // inp[2:3]
+       subs    $len,$len,#64
+       ldp     x9,x13,[$inp,#48]
+       add     $in2,$inp,#96
+       adrp    $zeros,.Lzeros
+       add     $zeros,$zeros,#:lo12:.Lzeros
+
+       lsl     $padbit,$padbit,#24
+       add     x15,$ctx,#48
+
+#ifdef __AARCH64EB__
+       rev     x8,x8
+       rev     x12,x12
+       rev     x9,x9
+       rev     x13,x13
+#endif
+       and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
+       and     x5,x9,#0x03ffffff
+       ubfx    x6,x8,#26,#26
+       ubfx    x7,x9,#26,#26
+       add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
+       extr    x8,x12,x8,#52
+       extr    x9,x13,x9,#52
+       add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
+       fmov    $IN23_0,x4
+       and     x8,x8,#0x03ffffff
+       and     x9,x9,#0x03ffffff
+       ubfx    x10,x12,#14,#26
+       ubfx    x11,x13,#14,#26
+       add     x12,$padbit,x12,lsr#40
+       add     x13,$padbit,x13,lsr#40
+       add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
+       fmov    $IN23_1,x6
+       add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
+       add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
+       fmov    $IN23_2,x8
+       fmov    $IN23_3,x10
+       fmov    $IN23_4,x12
+
+       ldp     x8,x12,[$inp],#16       // inp[0:1]
+       ldp     x9,x13,[$inp],#48
+
+       ld1     {$R0,$R1,$S1,$R2},[x15],#64
+       ld1     {$S2,$R3,$S3,$R4},[x15],#64
+       ld1     {$S4},[x15]
+
+#ifdef __AARCH64EB__
+       rev     x8,x8
+       rev     x12,x12
+       rev     x9,x9
+       rev     x13,x13
+#endif
+       and     x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
+       and     x5,x9,#0x03ffffff
+       ubfx    x6,x8,#26,#26
+       ubfx    x7,x9,#26,#26
+       add     x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
+       extr    x8,x12,x8,#52
+       extr    x9,x13,x9,#52
+       add     x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
+       fmov    $IN01_0,x4
+       and     x8,x8,#0x03ffffff
+       and     x9,x9,#0x03ffffff
+       ubfx    x10,x12,#14,#26
+       ubfx    x11,x13,#14,#26
+       add     x12,$padbit,x12,lsr#40
+       add     x13,$padbit,x13,lsr#40
+       add     x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
+       fmov    $IN01_1,x6
+       add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
+       add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
+       movi    $MASK.2d,#-1
+       fmov    $IN01_2,x8
+       fmov    $IN01_3,x10
+       fmov    $IN01_4,x12
+       ushr    $MASK.2d,$MASK.2d,#38
+
+       b.ls    .Lskip_loop
+
+.align 4
+.Loop_neon:
+       ////////////////////////////////////////////////////////////////
+       // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+       // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+       //   \___________________/
+       // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+       // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+       //   \___________________/ \____________________/
+       //
+       // Note that we start with inp[2:3]*r^2. This is because it
+       // doesn't depend on the reduction in the previous iteration.
+       ////////////////////////////////////////////////////////////////
+       // d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
+       // d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
+       // d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
+       // d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
+       // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
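+       //
+       // Concretely, for one 4-block step starting from hash value h:
+       //   h' = (h+inp[0])*r^4 + inp[1]*r^3 + inp[2]*r^2 + inp[3]*r
+       //      = ((h+inp[0])*r^2 + inp[2])*r^2 + (inp[1]*r^2 + inp[3])*r
+       // so the even and odd blocks form two independent streams that each
+       // advance by r^2 and are merged by one final multiply by r^2 and r
+       // respectively.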
+
+       subs    $len,$len,#64
+       umull   $ACC4,$IN23_0,${R4}[2]
+       csel    $in2,$zeros,$in2,lo
+       umull   $ACC3,$IN23_0,${R3}[2]
+       umull   $ACC2,$IN23_0,${R2}[2]
+        ldp    x8,x12,[$in2],#16       // inp[2:3] (or zero)
+       umull   $ACC1,$IN23_0,${R1}[2]
+        ldp    x9,x13,[$in2],#48
+       umull   $ACC0,$IN23_0,${R0}[2]
+#ifdef __AARCH64EB__
+        rev    x8,x8
+        rev    x12,x12
+        rev    x9,x9
+        rev    x13,x13
+#endif
+
+       umlal   $ACC4,$IN23_1,${R3}[2]
+        and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
+       umlal   $ACC3,$IN23_1,${R2}[2]
+        and    x5,x9,#0x03ffffff
+       umlal   $ACC2,$IN23_1,${R1}[2]
+        ubfx   x6,x8,#26,#26
+       umlal   $ACC1,$IN23_1,${R0}[2]
+        ubfx   x7,x9,#26,#26
+       umlal   $ACC0,$IN23_1,${S4}[2]
+        add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
+
+       umlal   $ACC4,$IN23_2,${R2}[2]
+        extr   x8,x12,x8,#52
+       umlal   $ACC3,$IN23_2,${R1}[2]
+        extr   x9,x13,x9,#52
+       umlal   $ACC2,$IN23_2,${R0}[2]
+        add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
+       umlal   $ACC1,$IN23_2,${S4}[2]
+        fmov   $IN23_0,x4
+       umlal   $ACC0,$IN23_2,${S3}[2]
+        and    x8,x8,#0x03ffffff
+
+       umlal   $ACC4,$IN23_3,${R1}[2]
+        and    x9,x9,#0x03ffffff
+       umlal   $ACC3,$IN23_3,${R0}[2]
+        ubfx   x10,x12,#14,#26
+       umlal   $ACC2,$IN23_3,${S4}[2]
+        ubfx   x11,x13,#14,#26
+       umlal   $ACC1,$IN23_3,${S3}[2]
+        add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
+       umlal   $ACC0,$IN23_3,${S2}[2]
+        fmov   $IN23_1,x6
+
+       add     $IN01_2,$IN01_2,$H2
+        add    x12,$padbit,x12,lsr#40
+       umlal   $ACC4,$IN23_4,${R0}[2]
+        add    x13,$padbit,x13,lsr#40
+       umlal   $ACC3,$IN23_4,${S4}[2]
+        add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
+       umlal   $ACC2,$IN23_4,${S3}[2]
+        add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
+       umlal   $ACC1,$IN23_4,${S2}[2]
+        fmov   $IN23_2,x8
+       umlal   $ACC0,$IN23_4,${S1}[2]
+        fmov   $IN23_3,x10
+
+       ////////////////////////////////////////////////////////////////
+       // (hash+inp[0:1])*r^4 and accumulate
+
+       add     $IN01_0,$IN01_0,$H0
+        fmov   $IN23_4,x12
+       umlal   $ACC3,$IN01_2,${R1}[0]
+        ldp    x8,x12,[$inp],#16       // inp[0:1]
+       umlal   $ACC0,$IN01_2,${S3}[0]
+        ldp    x9,x13,[$inp],#48
+       umlal   $ACC4,$IN01_2,${R2}[0]
+       umlal   $ACC1,$IN01_2,${S4}[0]
+       umlal   $ACC2,$IN01_2,${R0}[0]
+#ifdef __AARCH64EB__
+        rev    x8,x8
+        rev    x12,x12
+        rev    x9,x9
+        rev    x13,x13
+#endif
+
+       add     $IN01_1,$IN01_1,$H1
+       umlal   $ACC3,$IN01_0,${R3}[0]
+       umlal   $ACC4,$IN01_0,${R4}[0]
+        and    x4,x8,#0x03ffffff       // base 2^64 -> base 2^26
+       umlal   $ACC2,$IN01_0,${R2}[0]
+        and    x5,x9,#0x03ffffff
+       umlal   $ACC0,$IN01_0,${R0}[0]
+        ubfx   x6,x8,#26,#26
+       umlal   $ACC1,$IN01_0,${R1}[0]
+        ubfx   x7,x9,#26,#26
+
+       add     $IN01_3,$IN01_3,$H3
+        add    x4,x4,x5,lsl#32         // bfi  x4,x5,#32,#32
+       umlal   $ACC3,$IN01_1,${R2}[0]
+        extr   x8,x12,x8,#52
+       umlal   $ACC4,$IN01_1,${R3}[0]
+        extr   x9,x13,x9,#52
+       umlal   $ACC0,$IN01_1,${S4}[0]
+        add    x6,x6,x7,lsl#32         // bfi  x6,x7,#32,#32
+       umlal   $ACC2,$IN01_1,${R1}[0]
+        fmov   $IN01_0,x4
+       umlal   $ACC1,$IN01_1,${R0}[0]
+        and    x8,x8,#0x03ffffff
+
+       add     $IN01_4,$IN01_4,$H4
+        and    x9,x9,#0x03ffffff
+       umlal   $ACC3,$IN01_3,${R0}[0]
+        ubfx   x10,x12,#14,#26
+       umlal   $ACC0,$IN01_3,${S2}[0]
+        ubfx   x11,x13,#14,#26
+       umlal   $ACC4,$IN01_3,${R1}[0]
+        add    x8,x8,x9,lsl#32         // bfi  x8,x9,#32,#32
+       umlal   $ACC1,$IN01_3,${S3}[0]
+        fmov   $IN01_1,x6
+       umlal   $ACC2,$IN01_3,${S4}[0]
+        add    x12,$padbit,x12,lsr#40
+
+       umlal   $ACC3,$IN01_4,${S4}[0]
+        add    x13,$padbit,x13,lsr#40
+       umlal   $ACC0,$IN01_4,${S1}[0]
+        add    x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
+       umlal   $ACC4,$IN01_4,${R0}[0]
+        add    x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
+       umlal   $ACC1,$IN01_4,${S2}[0]
+        fmov   $IN01_2,x8
+       umlal   $ACC2,$IN01_4,${S3}[0]
+        fmov   $IN01_3,x10
+        fmov   $IN01_4,x12
+
+       /////////////////////////////////////////////////////////////////
+       // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+       // and P. Schwabe
+       //
+       // [see discussion in poly1305-armv4 module]
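+       //
+       // Roughly: carries are propagated along a staggered chain rather than
+       // to a fully reduced form, so limbs may briefly exceed 26 bits; the
+       // 64-bit accumulator lanes leave enough headroom for this.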
+
+       ushr    $T0.2d,$ACC3,#26
+       xtn     $H3,$ACC3
+        ushr   $T1.2d,$ACC0,#26
+        and    $ACC0,$ACC0,$MASK.2d
+       add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
+       bic     $H3,#0xfc,lsl#24        // &=0x03ffffff
+        add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
+
+       ushr    $T0.2d,$ACC4,#26
+       xtn     $H4,$ACC4
+        ushr   $T1.2d,$ACC1,#26
+        xtn    $H1,$ACC1
+       bic     $H4,#0xfc,lsl#24
+        add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
+
+       add     $ACC0,$ACC0,$T0.2d
+       shl     $T0.2d,$T0.2d,#2
+        shrn   $T1.2s,$ACC2,#26
+        xtn    $H2,$ACC2
+       add     $ACC0,$ACC0,$T0.2d      // h4 -> h0
+        bic    $H1,#0xfc,lsl#24
+        add    $H3,$H3,$T1.2s          // h2 -> h3
+        bic    $H2,#0xfc,lsl#24
+
+       shrn    $T0.2s,$ACC0,#26
+       xtn     $H0,$ACC0
+        ushr   $T1.2s,$H3,#26
+        bic    $H3,#0xfc,lsl#24
+        bic    $H0,#0xfc,lsl#24
+       add     $H1,$H1,$T0.2s          // h0 -> h1
+        add    $H4,$H4,$T1.2s          // h3 -> h4
+
+       b.hi    .Loop_neon
+
+.Lskip_loop:
+       dup     $IN23_2,${IN23_2}[0]
+       add     $IN01_2,$IN01_2,$H2
+
+       ////////////////////////////////////////////////////////////////
+       // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+       adds    $len,$len,#32
+       b.ne    .Long_tail
+
+       dup     $IN23_2,${IN01_2}[0]
+       add     $IN23_0,$IN01_0,$H0
+       add     $IN23_3,$IN01_3,$H3
+       add     $IN23_1,$IN01_1,$H1
+       add     $IN23_4,$IN01_4,$H4
+
+.Long_tail:
+       dup     $IN23_0,${IN23_0}[0]
+       umull2  $ACC0,$IN23_2,${S3}
+       umull2  $ACC3,$IN23_2,${R1}
+       umull2  $ACC4,$IN23_2,${R2}
+       umull2  $ACC2,$IN23_2,${R0}
+       umull2  $ACC1,$IN23_2,${S4}
+
+       dup     $IN23_1,${IN23_1}[0]
+       umlal2  $ACC0,$IN23_0,${R0}
+       umlal2  $ACC2,$IN23_0,${R2}
+       umlal2  $ACC3,$IN23_0,${R3}
+       umlal2  $ACC4,$IN23_0,${R4}
+       umlal2  $ACC1,$IN23_0,${R1}
+
+       dup     $IN23_3,${IN23_3}[0]
+       umlal2  $ACC0,$IN23_1,${S4}
+       umlal2  $ACC3,$IN23_1,${R2}
+       umlal2  $ACC2,$IN23_1,${R1}
+       umlal2  $ACC4,$IN23_1,${R3}
+       umlal2  $ACC1,$IN23_1,${R0}
+
+       dup     $IN23_4,${IN23_4}[0]
+       umlal2  $ACC3,$IN23_3,${R0}
+       umlal2  $ACC4,$IN23_3,${R1}
+       umlal2  $ACC0,$IN23_3,${S2}
+       umlal2  $ACC1,$IN23_3,${S3}
+       umlal2  $ACC2,$IN23_3,${S4}
+
+       umlal2  $ACC3,$IN23_4,${S4}
+       umlal2  $ACC0,$IN23_4,${S1}
+       umlal2  $ACC4,$IN23_4,${R0}
+       umlal2  $ACC1,$IN23_4,${S2}
+       umlal2  $ACC2,$IN23_4,${S3}
+
+       b.eq    .Lshort_tail
+
+       ////////////////////////////////////////////////////////////////
+       // (hash+inp[0:1])*r^4:r^3 and accumulate
+
+       add     $IN01_0,$IN01_0,$H0
+       umlal   $ACC3,$IN01_2,${R1}
+       umlal   $ACC0,$IN01_2,${S3}
+       umlal   $ACC4,$IN01_2,${R2}
+       umlal   $ACC1,$IN01_2,${S4}
+       umlal   $ACC2,$IN01_2,${R0}
+
+       add     $IN01_1,$IN01_1,$H1
+       umlal   $ACC3,$IN01_0,${R3}
+       umlal   $ACC0,$IN01_0,${R0}
+       umlal   $ACC4,$IN01_0,${R4}
+       umlal   $ACC1,$IN01_0,${R1}
+       umlal   $ACC2,$IN01_0,${R2}
+
+       add     $IN01_3,$IN01_3,$H3
+       umlal   $ACC3,$IN01_1,${R2}
+       umlal   $ACC0,$IN01_1,${S4}
+       umlal   $ACC4,$IN01_1,${R3}
+       umlal   $ACC1,$IN01_1,${R0}
+       umlal   $ACC2,$IN01_1,${R1}
+
+       add     $IN01_4,$IN01_4,$H4
+       umlal   $ACC3,$IN01_3,${R0}
+       umlal   $ACC0,$IN01_3,${S2}
+       umlal   $ACC4,$IN01_3,${R1}
+       umlal   $ACC1,$IN01_3,${S3}
+       umlal   $ACC2,$IN01_3,${S4}
+
+       umlal   $ACC3,$IN01_4,${S4}
+       umlal   $ACC0,$IN01_4,${S1}
+       umlal   $ACC4,$IN01_4,${R0}
+       umlal   $ACC1,$IN01_4,${S2}
+       umlal   $ACC2,$IN01_4,${S3}
+
+.Lshort_tail:
+       ////////////////////////////////////////////////////////////////
+       // horizontal add
+
+       addp    $ACC3,$ACC3,$ACC3
+        ldp    d8,d9,[sp,#16]          // meet ABI requirements
+       addp    $ACC0,$ACC0,$ACC0
+        ldp    d10,d11,[sp,#32]
+       addp    $ACC4,$ACC4,$ACC4
+        ldp    d12,d13,[sp,#48]
+       addp    $ACC1,$ACC1,$ACC1
+        ldp    d14,d15,[sp,#64]
+       addp    $ACC2,$ACC2,$ACC2
+        ldr    x30,[sp,#8]
+
+       ////////////////////////////////////////////////////////////////
+       // lazy reduction, but without narrowing
+
+       ushr    $T0.2d,$ACC3,#26
+       and     $ACC3,$ACC3,$MASK.2d
+        ushr   $T1.2d,$ACC0,#26
+        and    $ACC0,$ACC0,$MASK.2d
+
+       add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
+        add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
+
+       ushr    $T0.2d,$ACC4,#26
+       and     $ACC4,$ACC4,$MASK.2d
+        ushr   $T1.2d,$ACC1,#26
+        and    $ACC1,$ACC1,$MASK.2d
+        add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
+
+       add     $ACC0,$ACC0,$T0.2d
+       shl     $T0.2d,$T0.2d,#2
+        ushr   $T1.2d,$ACC2,#26
+        and    $ACC2,$ACC2,$MASK.2d
+       add     $ACC0,$ACC0,$T0.2d      // h4 -> h0
+        add    $ACC3,$ACC3,$T1.2d      // h2 -> h3
+
+       ushr    $T0.2d,$ACC0,#26
+       and     $ACC0,$ACC0,$MASK.2d
+        ushr   $T1.2d,$ACC3,#26
+        and    $ACC3,$ACC3,$MASK.2d
+       add     $ACC1,$ACC1,$T0.2d      // h0 -> h1
+        add    $ACC4,$ACC4,$T1.2d      // h3 -> h4
+
+       ////////////////////////////////////////////////////////////////
+       // write the result, can be partially reduced
+
+       st4     {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
+       mov     x4,#1
+       st1     {$ACC4}[0],[$ctx]
+       str     x4,[$ctx,#8]            // set is_base2_26
+
+       ldr     x29,[sp],#80
+        .inst  0xd50323bf              // autiasp
+       ret
+.size  poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.pushsection .rodata
+.align 5
+.Lzeros:
+.long  0,0,0,0,0,0,0,0
+.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
+.popsection
+
+.align 2
+#if !defined(__KERNEL__) && !defined(_WIN64)
+.comm  OPENSSL_armcap_P,4,4
+.hidden        OPENSSL_armcap_P
+#endif
+___
+
+foreach (split("\n",$code)) {
+       s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/                      or
+       s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/     or
+       (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))                 or
+       (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))       or
+       (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))             or
+       (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))            or
+       (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
+
+       s/\.[124]([sd])\[/.$1\[/;
+       s/w#x([0-9]+)/w$1/g;
+
+       print $_,"\n";
+}
+close STDOUT;
diff --git a/lib/crypto/arm64/poly1305-glue.c b/lib/crypto/arm64/poly1305-glue.c
new file mode 100644 (file)
index 0000000..c9a7476
--- /dev/null
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <crypto/internal/poly1305.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+
+asmlinkage void poly1305_block_init_arch(
+       struct poly1305_block_state *state,
+       const u8 raw_key[POLY1305_BLOCK_SIZE]);
+EXPORT_SYMBOL_GPL(poly1305_block_init_arch);
+asmlinkage void poly1305_blocks(struct poly1305_block_state *state,
+                               const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_blocks_neon(struct poly1305_block_state *state,
+                                    const u8 *src, u32 len, u32 hibit);
+asmlinkage void poly1305_emit_arch(const struct poly1305_state *state,
+                                  u8 digest[POLY1305_DIGEST_SIZE],
+                                  const u32 nonce[4]);
+EXPORT_SYMBOL_GPL(poly1305_emit_arch);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
+                         unsigned int len, u32 padbit)
+{
+       len = round_down(len, POLY1305_BLOCK_SIZE);
+       if (static_branch_likely(&have_neon)) {
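+               /*
+                * Process the input in chunks of at most SZ_4K bytes per
+                * kernel_neon_begin()/kernel_neon_end() section, so that
+                * the NEON unit is not held for too long at a time.
+                */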
+               do {
+                       unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+                       kernel_neon_begin();
+                       poly1305_blocks_neon(state, src, todo, padbit);
+                       kernel_neon_end();
+
+                       len -= todo;
+                       src += todo;
+               } while (len);
+       } else
+               poly1305_blocks(state, src, len, padbit);
+}
+EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
+
+bool poly1305_is_arch_optimized(void)
+{
+       /* We can always use at least the ARM64 scalar implementation. */
+       return true;
+}
+EXPORT_SYMBOL(poly1305_is_arch_optimized);
+
+static int __init neon_poly1305_mod_init(void)
+{
+       if (cpu_have_named_feature(ASIMD))
+               static_branch_enable(&have_neon);
+       return 0;
+}
+subsys_initcall(neon_poly1305_mod_init);
+
+static void __exit neon_poly1305_mod_exit(void)
+{
+}
+module_exit(neon_poly1305_mod_exit);
+
+MODULE_DESCRIPTION("Poly1305 authenticator (ARM64 optimized)");
+MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/arm64/sha2-armv8.pl b/lib/crypto/arm64/sha2-armv8.pl
new file mode 100644 (file)
index 0000000..4aebd20
--- /dev/null
@@ -0,0 +1,786 @@
+#! /usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+
+# This code is taken from the OpenSSL project but the author (Andy Polyakov)
+# has relicensed it under the GPLv2. Therefore this program is free software;
+# you can redistribute it and/or modify it under the terms of the GNU General
+# Public License version 2 as published by the Free Software Foundation.
+#
+# The original headers, including the original license headers, are
+# included below for completeness.
+
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256/512 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+#              SHA256-hw       SHA256(*)       SHA512
+# Apple A7     1.97            10.5 (+33%)     6.73 (-1%(**))
+# Cortex-A53   2.38            15.5 (+115%)    10.0 (+150%(***))
+# Cortex-A57   2.31            11.6 (+86%)     7.51 (+260%(***))
+# Denver       2.01            10.5 (+26%)     6.70 (+8%)
+# X-Gene                       20.0 (+100%)    12.8 (+300%(***))
+# Mongoose     2.36            13.0 (+50%)     8.36 (+33%)
+#
+# (*)  Software SHA256 results are of lesser relevance, presented
+#      mostly for informational purposes.
+# (**) The result is a trade-off: it's possible to improve it by
+#      10% (or by 1 cycle per round), but at the cost of 20% loss
+#      on Cortex-A53 (or by 4 cycles per round).
+# (***)        Super-impressive coefficients over gcc-generated code are an
+#      indication of some compiler "pathology"; most notably, code
+#      generated with -mgeneral-regs-only is significantly faster,
+#      in which case the gap is only 40-90%.
+#
+# October 2016.
+#
+# Originally it was reckoned that it made no sense to implement a NEON
+# version of SHA256 for 64-bit processors, because the performance
+# improvement on the most widespread Cortex-A5x processors was observed
+# to be marginal: about the same on Cortex-A53 and ~10% on A57. But it
+# was then observed that 32-bit NEON SHA256 performs significantly
+# better than the 64-bit scalar version on *some* of the more recent
+# processors. As a result, a 64-bit NEON version of SHA256 was added to
+# provide the best all-round performance. For example, it executes ~30%
+# faster on X-Gene and Mongoose. [For reference, a NEON version of
+# SHA512 is bound to deliver much less improvement, likely *negative* on
+# Cortex-A5x, which is why NEON support is limited to SHA256.]
+
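+# The output file name and flavour are taken from the command line; when a
+# real flavour is given, the output is piped through the arm-xlate.pl
+# helper, otherwise (e.g. the "void" flavour) it is written directly to
+# the output file.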
+$output=pop;
+$flavour=pop;
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open OUT,"| \"$^X\" $xlate $flavour $output";
+    *STDOUT=*OUT;
+} else {
+    open STDOUT,">$output";
+}
+
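+# The same source generates either SHA-256 or SHA-512, selected by whether
+# the requested output file name contains "512".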
+if ($output =~ /512/) {
+       $BITS=512;
+       $SZ=8;
+       @Sigma0=(28,34,39);
+       @Sigma1=(14,18,41);
+       @sigma0=(1,  8, 7);
+       @sigma1=(19,61, 6);
+       $rounds=80;
+       $reg_t="x";
+} else {
+       $BITS=256;
+       $SZ=4;
+       @Sigma0=( 2,13,22);
+       @Sigma1=( 6,11,25);
+       @sigma0=( 7,18, 3);
+       @sigma1=(17,19,10);
+       $rounds=64;
+       $reg_t="w";
+}
+
+$func="sha${BITS}_blocks_arch";
+
+($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));
+
+@X=map("$reg_t$_",(3..15,0..2));
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
+($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
+
+sub BODY_00_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)&15;
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
+   $T0=@X[$i+3] if ($i<11);
+
+$code.=<<___   if ($i<16);
+#ifndef        __AARCH64EB__
+       rev     @X[$i],@X[$i]                   // $i
+#endif
+___
+$code.=<<___   if ($i<13 && ($i&1));
+       ldp     @X[$i+1],@X[$i+2],[$inp],#2*$SZ
+___
+$code.=<<___   if ($i==13);
+       ldp     @X[14],@X[15],[$inp]
+___
+$code.=<<___   if ($i>=14);
+       ldr     @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
+___
+$code.=<<___   if ($i>0 && $i<16);
+       add     $a,$a,$t1                       // h+=Sigma0(a)
+___
+$code.=<<___   if ($i>=11);
+       str     @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
+___
+# While ARMv8 specifies merged rotate-and-logical operations such as
+# 'eor x,y,z,ror#n', they were found to negatively affect performance
+# on Apple A7. The reason seems to be that the merged form requires
+# even 'y' to be available earlier, so it is not necessarily the best
+# choice on the critical path... On the other hand, Cortex-A5x handles
+# merged instructions much better than disjoint rotate and logical
+# ones... See the (**) footnote above.
+$code.=<<___   if ($i<15);
+       ror     $t0,$e,#$Sigma1[0]
+       add     $h,$h,$t2                       // h+=K[i]
+       eor     $T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
+       and     $t1,$f,$e
+       bic     $t2,$g,$e
+       add     $h,$h,@X[$i&15]                 // h+=X[i]
+       orr     $t1,$t1,$t2                     // Ch(e,f,g)
+       eor     $t2,$a,$b                       // a^b, b^c in next round
+       eor     $t0,$t0,$T0,ror#$Sigma1[1]      // Sigma1(e)
+       ror     $T0,$a,#$Sigma0[0]
+       add     $h,$h,$t1                       // h+=Ch(e,f,g)
+       eor     $t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
+       add     $h,$h,$t0                       // h+=Sigma1(e)
+       and     $t3,$t3,$t2                     // (b^c)&=(a^b)
+       add     $d,$d,$h                        // d+=h
+       eor     $t3,$t3,$b                      // Maj(a,b,c)
+       eor     $t1,$T0,$t1,ror#$Sigma0[1]      // Sigma0(a)
+       add     $h,$h,$t3                       // h+=Maj(a,b,c)
+       ldr     $t3,[$Ktbl],#$SZ                // *K++, $t2 in next round
+       //add   $h,$h,$t1                       // h+=Sigma0(a)
+___
+$code.=<<___   if ($i>=15);
+       ror     $t0,$e,#$Sigma1[0]
+       add     $h,$h,$t2                       // h+=K[i]
+       ror     $T1,@X[($j+1)&15],#$sigma0[0]
+       and     $t1,$f,$e
+       ror     $T2,@X[($j+14)&15],#$sigma1[0]
+       bic     $t2,$g,$e
+       ror     $T0,$a,#$Sigma0[0]
+       add     $h,$h,@X[$i&15]                 // h+=X[i]
+       eor     $t0,$t0,$e,ror#$Sigma1[1]
+       eor     $T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
+       orr     $t1,$t1,$t2                     // Ch(e,f,g)
+       eor     $t2,$a,$b                       // a^b, b^c in next round
+       eor     $t0,$t0,$e,ror#$Sigma1[2]       // Sigma1(e)
+       eor     $T0,$T0,$a,ror#$Sigma0[1]
+       add     $h,$h,$t1                       // h+=Ch(e,f,g)
+       and     $t3,$t3,$t2                     // (b^c)&=(a^b)
+       eor     $T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
+       eor     $T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]    // sigma0(X[i+1])
+       add     $h,$h,$t0                       // h+=Sigma1(e)
+       eor     $t3,$t3,$b                      // Maj(a,b,c)
+       eor     $t1,$T0,$a,ror#$Sigma0[2]       // Sigma0(a)
+       eor     $T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]   // sigma1(X[i+14])
+       add     @X[$j],@X[$j],@X[($j+9)&15]
+       add     $d,$d,$h                        // d+=h
+       add     $h,$h,$t3                       // h+=Maj(a,b,c)
+       ldr     $t3,[$Ktbl],#$SZ                // *K++, $t2 in next round
+       add     @X[$j],@X[$j],$T1
+       add     $h,$h,$t1                       // h+=Sigma0(a)
+       add     @X[$j],@X[$j],$T2
+___
+       ($t2,$t3)=($t3,$t2);
+}
+
+$code.=<<___;
+#ifndef        __KERNEL__
+# include "arm_arch.h"
+#endif
+
+.text
+
+.extern        OPENSSL_armcap_P
+.globl $func
+.type  $func,%function
+.align 6
+$func:
+___
+$code.=<<___   if ($SZ==4);
+#ifndef        __KERNEL__
+# ifdef        __ILP32__
+       ldrsw   x16,.LOPENSSL_armcap_P
+# else
+       ldr     x16,.LOPENSSL_armcap_P
+# endif
+       adr     x17,.LOPENSSL_armcap_P
+       add     x16,x16,x17
+       ldr     w16,[x16]
+       tst     w16,#ARMV8_SHA256
+       b.ne    .Lv8_entry
+       tst     w16,#ARMV7_NEON
+       b.ne    .Lneon_entry
+#endif
+___
+$code.=<<___;
+       stp     x29,x30,[sp,#-128]!
+       add     x29,sp,#0
+
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+       sub     sp,sp,#4*$SZ
+
+       ldp     $A,$B,[$ctx]                            // load context
+       ldp     $C,$D,[$ctx,#2*$SZ]
+       ldp     $E,$F,[$ctx,#4*$SZ]
+       add     $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
+       ldp     $G,$H,[$ctx,#6*$SZ]
+       adr     $Ktbl,.LK$BITS
+       stp     $ctx,$num,[x29,#96]
+
+.Loop:
+       ldp     @X[0],@X[1],[$inp],#2*$SZ
+       ldr     $t2,[$Ktbl],#$SZ                        // *K++
+       eor     $t3,$B,$C                               // magic seed
+       str     $inp,[x29,#112]
+___
+for ($i=0;$i<16;$i++)  { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=".Loop_16_xx:\n";
+for (;$i<32;$i++)      { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       cbnz    $t2,.Loop_16_xx
+
+       ldp     $ctx,$num,[x29,#96]
+       ldr     $inp,[x29,#112]
+       sub     $Ktbl,$Ktbl,#`$SZ*($rounds+1)`          // rewind
+
+       ldp     @X[0],@X[1],[$ctx]
+       ldp     @X[2],@X[3],[$ctx,#2*$SZ]
+       add     $inp,$inp,#14*$SZ                       // advance input pointer
+       ldp     @X[4],@X[5],[$ctx,#4*$SZ]
+       add     $A,$A,@X[0]
+       ldp     @X[6],@X[7],[$ctx,#6*$SZ]
+       add     $B,$B,@X[1]
+       add     $C,$C,@X[2]
+       add     $D,$D,@X[3]
+       stp     $A,$B,[$ctx]
+       add     $E,$E,@X[4]
+       add     $F,$F,@X[5]
+       stp     $C,$D,[$ctx,#2*$SZ]
+       add     $G,$G,@X[6]
+       add     $H,$H,@X[7]
+       cmp     $inp,$num
+       stp     $E,$F,[$ctx,#4*$SZ]
+       stp     $G,$H,[$ctx,#6*$SZ]
+       b.ne    .Loop
+
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#4*$SZ
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#128
+       ret
+.size  $func,.-$func
+
+.align 6
+.type  .LK$BITS,%object
+.LK$BITS:
+___
+$code.=<<___ if ($SZ==8);
+       .quad   0x428a2f98d728ae22,0x7137449123ef65cd
+       .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+       .quad   0x3956c25bf348b538,0x59f111f1b605d019
+       .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
+       .quad   0xd807aa98a3030242,0x12835b0145706fbe
+       .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+       .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
+       .quad   0x9bdc06a725c71235,0xc19bf174cf692694
+       .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
+       .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+       .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
+       .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+       .quad   0x983e5152ee66dfab,0xa831c66d2db43210
+       .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
+       .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
+       .quad   0x06ca6351e003826f,0x142929670a0e6e70
+       .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
+       .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+       .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
+       .quad   0x81c2c92e47edaee6,0x92722c851482353b
+       .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
+       .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
+       .quad   0xd192e819d6ef5218,0xd69906245565a910
+       .quad   0xf40e35855771202a,0x106aa07032bbd1b8
+       .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
+       .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+       .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+       .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+       .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
+       .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
+       .quad   0x90befffa23631e28,0xa4506cebde82bde9
+       .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
+       .quad   0xca273eceea26619c,0xd186b8c721c0c207
+       .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+       .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
+       .quad   0x113f9804bef90dae,0x1b710b35131c471b
+       .quad   0x28db77f523047d84,0x32caab7b40c72493
+       .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+       .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+       .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
+       .quad   0       // terminator
+___
+$code.=<<___ if ($SZ==4);
+       .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+       .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+       .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+       .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+       .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+       .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+       .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+       .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+       .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+       .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+       .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+       .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+       .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+       .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+       .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+       .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+       .long   0       // terminator
+___
+$code.=<<___;
+.size  .LK$BITS,.-.LK$BITS
+#ifndef        __KERNEL__
+.align 3
+.LOPENSSL_armcap_P:
+# ifdef        __ILP32__
+       .long   OPENSSL_armcap_P-.
+# else
+       .quad   OPENSSL_armcap_P-.
+# endif
+#endif
+.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+#ifndef        __KERNEL__
+.type  sha256_block_armv8,%function
+.align 6
+sha256_block_armv8:
+.Lv8_entry:
+       stp             x29,x30,[sp,#-16]!
+       add             x29,sp,#0
+
+       ld1.32          {$ABCD,$EFGH},[$ctx]
+       adr             $Ktbl,.LK256
+
+.Loop_hw:
+       ld1             {@MSG[0]-@MSG[3]},[$inp],#64
+       sub             $num,$num,#1
+       ld1.32          {$W0},[$Ktbl],#16
+       rev32           @MSG[0],@MSG[0]
+       rev32           @MSG[1],@MSG[1]
+       rev32           @MSG[2],@MSG[2]
+       rev32           @MSG[3],@MSG[3]
+       orr             $ABCD_SAVE,$ABCD,$ABCD          // offload
+       orr             $EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+       ld1.32          {$W1},[$Ktbl],#16
+       add.i32         $W0,$W0,@MSG[0]
+       sha256su0       @MSG[0],@MSG[1]
+       orr             $abcd,$ABCD,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+       sha256su1       @MSG[0],@MSG[2],@MSG[3]
+___
+       ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+       ld1.32          {$W1},[$Ktbl],#16
+       add.i32         $W0,$W0,@MSG[0]
+       orr             $abcd,$ABCD,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+
+       ld1.32          {$W0},[$Ktbl],#16
+       add.i32         $W1,$W1,@MSG[1]
+       orr             $abcd,$ABCD,$ABCD
+       sha256h         $ABCD,$EFGH,$W1
+       sha256h2        $EFGH,$abcd,$W1
+
+       ld1.32          {$W1},[$Ktbl]
+       add.i32         $W0,$W0,@MSG[2]
+       sub             $Ktbl,$Ktbl,#$rounds*$SZ-16     // rewind
+       orr             $abcd,$ABCD,$ABCD
+       sha256h         $ABCD,$EFGH,$W0
+       sha256h2        $EFGH,$abcd,$W0
+
+       add.i32         $W1,$W1,@MSG[3]
+       orr             $abcd,$ABCD,$ABCD
+       sha256h         $ABCD,$EFGH,$W1
+       sha256h2        $EFGH,$abcd,$W1
+
+       add.i32         $ABCD,$ABCD,$ABCD_SAVE
+       add.i32         $EFGH,$EFGH,$EFGH_SAVE
+
+       cbnz            $num,.Loop_hw
+
+       st1.32          {$ABCD,$EFGH},[$ctx]
+
+       ldr             x29,[sp],#16
+       ret
+.size  sha256_block_armv8,.-sha256_block_armv8
+#endif
+___
+}
+
+if ($SZ==4) {  ######################################### NEON stuff #
+# You'll surely note a lot of similarities with the sha256-armv4 module,
+# and of course that's no coincidence: sha256-armv4 was used as the
+# initial template, but it was adapted for the ARMv8 instruction set and
+# extensively re-tuned for all-round performance.
+
+my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10));
+my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15));
+my $Ktbl="x16";
+my $Xfer="x17";
+my @X = map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19));
+my $j=0;
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; }
+sub Dlo     { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; }
+sub Dhi     { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; }
+
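+# Xupdate() computes four message-schedule words with NEON, stores the
+# corresponding X[i]+K[i] values to the stack via $Xfer, and interleaves
+# this with four scalar rounds produced by body_00_15() below.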
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+       &ext_8          ($T0,@X[0],@X[1],4);    # X[1..4]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ext_8          ($T3,@X[2],@X[3],4);    # X[9..12]
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &mov            (&Dscalar($T7),&Dhi(@X[3]));    # X[14..15]
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ushr_32        ($T2,$T0,$sigma0[0]);
+        eval(shift(@insns));
+       &ushr_32        ($T1,$T0,$sigma0[2]);
+        eval(shift(@insns));
+       &add_32         (@X[0],@X[0],$T3);      # X[0..3] += X[9..12]
+        eval(shift(@insns));
+       &sli_32         ($T2,$T0,32-$sigma0[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ushr_32        ($T3,$T0,$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &eor_8          ($T1,$T1,$T2);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &sli_32         ($T3,$T0,32-$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &ushr_32      ($T4,$T7,$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &eor_8          ($T1,$T1,$T3);          # sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &sli_32       ($T4,$T7,32-$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &ushr_32      ($T5,$T7,$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &ushr_32      ($T3,$T7,$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &add_32         (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &sli_u32      ($T3,$T7,32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &eor_8        ($T5,$T5,$T4);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &eor_8        ($T5,$T5,$T3);          # sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &add_32         (@X[0],@X[0],$T5);      # X[0..1] += sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &ushr_32      ($T6,@X[0],$sigma1[0]);
+        eval(shift(@insns));
+         &ushr_32      ($T7,@X[0],$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &sli_32       ($T6,@X[0],32-$sigma1[0]);
+        eval(shift(@insns));
+         &ushr_32      ($T5,@X[0],$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &eor_8        ($T7,$T7,$T6);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &sli_32       ($T5,@X[0],32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ld1_32         ("{$T0}","[$Ktbl], #16");
+        eval(shift(@insns));
+         &eor_8        ($T7,$T7,$T5);          # sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &eor_8          ($T5,$T5,$T5);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &mov            (&Dhi($T5), &Dlo($T7));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &add_32         (@X[0],@X[0],$T5);      # X[2..3] += sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &add_32         ($T0,$T0,@X[0]);
+        while($#insns>=1) { eval(shift(@insns)); }
+       &st1_32         ("{$T0}","[$Xfer], #16");
+        eval(shift(@insns));
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ld1_8          ("{@X[0]}","[$inp],#16");
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &ld1_32         ("{$T0}","[$Ktbl],#16");
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &rev32          (@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &add_32         ($T0,$T0,@X[0]);
+        foreach (@insns) { eval; }     # remaining instructions
+       &st1_32         ("{$T0}","[$Xfer], #16");
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub body_00_15 () {
+       (
+       '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+       '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
+       '&add   ($a,$a,$t4);'.                  # h+=Sigma0(a) from the past
+       '&and   ($t1,$f,$e)',
+       '&bic   ($t4,$g,$e)',
+       '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+       '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
+       '&orr   ($t1,$t1,$t4)',                 # Ch(e,f,g)
+       '&eor   ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
+       '&eor   ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+       '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
+       '&ror   ($t0,$t0,"#$Sigma1[0]")',
+       '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
+       '&eor   ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
+       '&add   ($h,$h,$t0)',                   # h+=Sigma1(e)
+       '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
+       '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
+       '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
+       '&ror   ($t4,$t4,"#$Sigma0[0]")',
+       '&add   ($d,$d,$h)',                    # d+=h
+       '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
+       '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+       )
+}
+
+$code.=<<___;
+#ifdef __KERNEL__
+.globl sha256_block_neon
+#endif
+.type  sha256_block_neon,%function
+.align 4
+sha256_block_neon:
+.Lneon_entry:
+       stp     x29, x30, [sp, #-16]!
+       mov     x29, sp
+       sub     sp,sp,#16*4
+
+       adr     $Ktbl,.LK256
+       add     $num,$inp,$num,lsl#6    // len to point at the end of inp
+
+       ld1.8   {@X[0]},[$inp], #16
+       ld1.8   {@X[1]},[$inp], #16
+       ld1.8   {@X[2]},[$inp], #16
+       ld1.8   {@X[3]},[$inp], #16
+       ld1.32  {$T0},[$Ktbl], #16
+       ld1.32  {$T1},[$Ktbl], #16
+       ld1.32  {$T2},[$Ktbl], #16
+       ld1.32  {$T3},[$Ktbl], #16
+       rev32   @X[0],@X[0]             // yes, even on
+       rev32   @X[1],@X[1]             // big-endian
+       rev32   @X[2],@X[2]
+       rev32   @X[3],@X[3]
+       mov     $Xfer,sp
+       add.32  $T0,$T0,@X[0]
+       add.32  $T1,$T1,@X[1]
+       add.32  $T2,$T2,@X[2]
+       st1.32  {$T0-$T1},[$Xfer], #32
+       add.32  $T3,$T3,@X[3]
+       st1.32  {$T2-$T3},[$Xfer]
+       sub     $Xfer,$Xfer,#32
+
+       ldp     $A,$B,[$ctx]
+       ldp     $C,$D,[$ctx,#8]
+       ldp     $E,$F,[$ctx,#16]
+       ldp     $G,$H,[$ctx,#24]
+       ldr     $t1,[sp,#0]
+       mov     $t2,wzr
+       eor     $t3,$B,$C
+       mov     $t4,wzr
+       b       .L_00_48
+
+.align 4
+.L_00_48:
+___
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+$code.=<<___;
+       cmp     $t1,#0                          // check for K256 terminator
+       ldr     $t1,[sp,#0]
+       sub     $Xfer,$Xfer,#64
+       bne     .L_00_48
+
+       sub     $Ktbl,$Ktbl,#256                // rewind $Ktbl
+       cmp     $inp,$num
+       mov     $Xfer, #64
+       csel    $Xfer, $Xfer, xzr, eq
+       sub     $inp,$inp,$Xfer                 // avoid SEGV
+       mov     $Xfer,sp
+___
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+$code.=<<___;
+       add     $A,$A,$t4                       // h+=Sigma0(a) from the past
+       ldp     $t0,$t1,[$ctx,#0]
+       add     $A,$A,$t2                       // h+=Maj(a,b,c) from the past
+       ldp     $t2,$t3,[$ctx,#8]
+       add     $A,$A,$t0                       // accumulate
+       add     $B,$B,$t1
+       ldp     $t0,$t1,[$ctx,#16]
+       add     $C,$C,$t2
+       add     $D,$D,$t3
+       ldp     $t2,$t3,[$ctx,#24]
+       add     $E,$E,$t0
+       add     $F,$F,$t1
+        ldr    $t1,[sp,#0]
+       stp     $A,$B,[$ctx,#0]
+       add     $G,$G,$t2
+        mov    $t2,wzr
+       stp     $C,$D,[$ctx,#8]
+       add     $H,$H,$t3
+       stp     $E,$F,[$ctx,#16]
+        eor    $t3,$B,$C
+       stp     $G,$H,[$ctx,#24]
+        mov    $t4,wzr
+        mov    $Xfer,sp
+       b.ne    .L_00_48
+
+       ldr     x29,[x29]
+       add     sp,sp,#16*4+16
+       ret
+.size  sha256_block_neon,.-sha256_block_neon
+___
+}
+
+$code.=<<___;
+#ifndef        __KERNEL__
+.comm  OPENSSL_armcap_P,4,4
+#endif
+___
+
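+# Encode the SHA-256 Crypto Extensions instructions as raw .inst words so
+# that the code assembles even with toolchains that do not know these
+# mnemonics.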
+{   my  %opcode = (
+       "sha256h"       => 0x5e004000,  "sha256h2"      => 0x5e005000,
+       "sha256su0"     => 0x5e282800,  "sha256su1"     => 0x5e006000   );
+
+    sub unsha256 {
+       my ($mnemonic,$arg)=@_;
+
+       $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+       &&
+       sprintf ".inst\t0x%08x\t//%s %s",
+                       $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+                       $mnemonic,$arg;
+    }
+}
+
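+# Reproduce this script's leading comment block (license and attribution)
+# at the top of the generated file, converted to '//' comments.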
+open SELF,$0;
+while(<SELF>) {
+        next if (/^#!/);
+        last if (!s/^#/\/\// and !/^$/);
+        print;
+}
+close SELF;
+
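+# Final pass over the generated code: expand `...` expressions, encode the
+# sha256* instructions via unsha256(), and translate the q-register names
+# and the ad-hoc .8/.32 type suffixes used above into standard AArch64
+# syntax.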
+foreach(split("\n",$code)) {
+
+       s/\`([^\`]*)\`/eval($1)/ge;
+
+       s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;
+
+       s/\bq([0-9]+)\b/v$1.16b/g;              # old->new registers
+
+       s/\.[ui]?8(\s)/$1/;
+       s/\.\w?32\b//           and s/\.16b/\.4s/g;
+       m/(ld|st)1[^\[]+\[0\]/  and s/\.4s/\.s/g;
+
+       print $_,"\n";
+}
+
+close STDOUT;
diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S
new file mode 100644 (file)
index 0000000..f3e21c6
--- /dev/null
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * sha256-ce.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+       .text
+       .arch           armv8-a+crypto
+
+       dga             .req    q20
+       dgav            .req    v20
+       dgb             .req    q21
+       dgbv            .req    v21
+
+       t0              .req    v22
+       t1              .req    v23
+
+       dg0q            .req    q24
+       dg0v            .req    v24
+       dg1q            .req    q25
+       dg1v            .req    v25
+       dg2q            .req    q26
+       dg2v            .req    v26
+
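+       /*
+        * The round constants are added into t0 and t1 alternately (selected
+        * by \ev), so each addition is issued one round ahead of the
+        * sha256h/sha256h2 instructions that consume its result.
+        */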
+       .macro          add_only, ev, rc, s0
+       mov             dg2v.16b, dg0v.16b
+       .ifeq           \ev
+       add             t1.4s, v\s0\().4s, \rc\().4s
+       sha256h         dg0q, dg1q, t0.4s
+       sha256h2        dg1q, dg2q, t0.4s
+       .else
+       .ifnb           \s0
+       add             t0.4s, v\s0\().4s, \rc\().4s
+       .endif
+       sha256h         dg0q, dg1q, t1.4s
+       sha256h2        dg1q, dg2q, t1.4s
+       .endif
+       .endm
+
+       .macro          add_update, ev, rc, s0, s1, s2, s3
+       sha256su0       v\s0\().4s, v\s1\().4s
+       add_only        \ev, \rc, \s1
+       sha256su1       v\s0\().4s, v\s2\().4s, v\s3\().4s
+       .endm
+
+       /*
+        * The SHA-256 round constants
+        */
+       .section        ".rodata", "a"
+       .align          4
+.Lsha2_rcon:
+       .word           0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+       .word           0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+       .word           0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+       .word           0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+       .word           0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+       .word           0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+       .word           0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+       .word           0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+       .word           0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+       .word           0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+       .word           0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+       .word           0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+       .word           0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+       .word           0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+       .word           0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+       .word           0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+
+       /*
+        * size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+        *                              const u8 *data, size_t nblocks);
+        */
+       .text
+SYM_FUNC_START(__sha256_ce_transform)
+       /* load round constants */
+       adr_l           x8, .Lsha2_rcon
+       ld1             { v0.4s- v3.4s}, [x8], #64
+       ld1             { v4.4s- v7.4s}, [x8], #64
+       ld1             { v8.4s-v11.4s}, [x8], #64
+       ld1             {v12.4s-v15.4s}, [x8]
+
+       /* load state */
+       ld1             {dgav.4s, dgbv.4s}, [x0]
+
+       /* load input */
+0:     ld1             {v16.4s-v19.4s}, [x1], #64
+       sub             x2, x2, #1
+
+CPU_LE(        rev32           v16.16b, v16.16b        )
+CPU_LE(        rev32           v17.16b, v17.16b        )
+CPU_LE(        rev32           v18.16b, v18.16b        )
+CPU_LE(        rev32           v19.16b, v19.16b        )
+
+       add             t0.4s, v16.4s, v0.4s
+       mov             dg0v.16b, dgav.16b
+       mov             dg1v.16b, dgbv.16b
+
+       add_update      0,  v1, 16, 17, 18, 19
+       add_update      1,  v2, 17, 18, 19, 16
+       add_update      0,  v3, 18, 19, 16, 17
+       add_update      1,  v4, 19, 16, 17, 18
+
+       add_update      0,  v5, 16, 17, 18, 19
+       add_update      1,  v6, 17, 18, 19, 16
+       add_update      0,  v7, 18, 19, 16, 17
+       add_update      1,  v8, 19, 16, 17, 18
+
+       add_update      0,  v9, 16, 17, 18, 19
+       add_update      1, v10, 17, 18, 19, 16
+       add_update      0, v11, 18, 19, 16, 17
+       add_update      1, v12, 19, 16, 17, 18
+
+       add_only        0, v13, 17
+       add_only        1, v14, 18
+       add_only        0, v15, 19
+       add_only        1
+
+       /* update state */
+       add             dgav.4s, dgav.4s, dg0v.4s
+       add             dgbv.4s, dgbv.4s, dg1v.4s
+
+       /* return early if voluntary preemption is needed */
+       cond_yield      1f, x5, x6
+
+       /* handled all input blocks? */
+       cbnz            x2, 0b
+
+       /* store new state */
+1:     st1             {dgav.4s, dgbv.4s}, [x0]
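+       /* return the number of blocks that were not processed */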
+       mov             x0, x2
+       ret
+SYM_FUNC_END(__sha256_ce_transform)
diff --git a/lib/crypto/arm64/sha256.c b/lib/crypto/arm64/sha256.c
new file mode 100644 (file)
index 0000000..bcf7a3a
--- /dev/null
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 optimized for ARM64
+ *
+ * Copyright 2025 Google LLC
+ */
+#include <asm/neon.h>
+#include <crypto/internal/sha2.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+asmlinkage void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS],
+                                  const u8 *data, size_t nblocks);
+EXPORT_SYMBOL_GPL(sha256_blocks_arch);
+asmlinkage void sha256_block_neon(u32 state[SHA256_STATE_WORDS],
+                                 const u8 *data, size_t nblocks);
+asmlinkage size_t __sha256_ce_transform(u32 state[SHA256_STATE_WORDS],
+                                       const u8 *data, size_t nblocks);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_ce);
+
+void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS],
+                       const u8 *data, size_t nblocks)
+{
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+           static_branch_likely(&have_neon)) {
+               if (static_branch_likely(&have_ce)) {
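+                       /*
+                        * __sha256_ce_transform() may return early (via
+                        * cond_yield) with the number of blocks it has not
+                        * yet processed, so keep calling it until no blocks
+                        * remain.
+                        */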
+                       do {
+                               size_t rem;
+
+                               kernel_neon_begin();
+                               rem = __sha256_ce_transform(state,
+                                                           data, nblocks);
+                               kernel_neon_end();
+                               data += (nblocks - rem) * SHA256_BLOCK_SIZE;
+                               nblocks = rem;
+                       } while (nblocks);
+               } else {
+                       kernel_neon_begin();
+                       sha256_block_neon(state, data, nblocks);
+                       kernel_neon_end();
+               }
+       } else {
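+               /* Fall back to the scalar assembly implementation. */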
+               sha256_blocks_arch(state, data, nblocks);
+       }
+}
+EXPORT_SYMBOL_GPL(sha256_blocks_simd);
+
+bool sha256_is_arch_optimized(void)
+{
+       /* We can always use at least the ARM64 scalar implementation. */
+       return true;
+}
+EXPORT_SYMBOL_GPL(sha256_is_arch_optimized);
+
+static int __init sha256_arm64_mod_init(void)
+{
+       if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+           cpu_have_named_feature(ASIMD)) {
+               static_branch_enable(&have_neon);
+               if (cpu_have_named_feature(SHA2))
+                       static_branch_enable(&have_ce);
+       }
+       return 0;
+}
+subsys_initcall(sha256_arm64_mod_init);
+
+static void __exit sha256_arm64_mod_exit(void)
+{
+}
+module_exit(sha256_arm64_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-256 optimized for ARM64");