From: Eric Biggers <ebiggers@google.com>
Date: Tue, 22 Apr 2025 15:27:08 +0000 (-0700)
Subject: crypto: arm - move library functions to arch/arm/lib/crypto/
X-Git-Tag: block-6.16-20250606~34^2~159
X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=714656a84697f9615b9488b490c99edb3ecfcd3d;p=linux-block.git

crypto: arm - move library functions to arch/arm/lib/crypto/

Continue disentangling the crypto library functions from the generic
crypto infrastructure by moving the arm BLAKE2s, ChaCha, and Poly1305
library functions into a new directory arch/arm/lib/crypto/ that does
not depend on CRYPTO.  This mirrors the distinction between crypto/ and
lib/crypto/.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

diff --git a/MAINTAINERS b/MAINTAINERS
index a2604c13f11b..d0d1968e323a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6288,6 +6288,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6.git
 F:	Documentation/crypto/
 F:	Documentation/devicetree/bindings/crypto/
 F:	arch/*/crypto/
+F:	arch/*/lib/crypto/
 F:	crypto/
 F:	drivers/crypto/
 F:	include/crypto/
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 3530e7c80793..1f889d6bab77 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -46,24 +46,6 @@ config CRYPTO_NHPOLY1305_NEON
 	  Architecture: arm using:
 	  - NEON (Advanced SIMD) extensions
 
-config CRYPTO_POLY1305_ARM
-	tristate
-	select CRYPTO_ARCH_HAVE_LIB_POLY1305
-	default CRYPTO_LIB_POLY1305_INTERNAL
-
-config CRYPTO_BLAKE2S_ARM
-	bool "Hash functions: BLAKE2s"
-	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
-	help
-	  BLAKE2s cryptographic hash function (RFC 7693)
-
-	  Architecture: arm
-
-	  This is faster than the generic implementations of BLAKE2s and
-	  BLAKE2b, but slower than the NEON implementation of BLAKE2b.
-	  There is no NEON implementation of BLAKE2s, since NEON doesn't
-	  really help with it.
-
 config CRYPTO_BLAKE2B_NEON
 	tristate "Hash functions: BLAKE2b (NEON)"
 	depends on KERNEL_MODE_NEON
@@ -206,10 +188,5 @@ config CRYPTO_AES_ARM_CE
 	  Architecture: arm using:
 	  - ARMv8 Crypto Extensions
 
-config CRYPTO_CHACHA20_NEON
-	tristate
-	select CRYPTO_ARCH_HAVE_LIB_CHACHA
-	default CRYPTO_LIB_CHACHA_INTERNAL
-
 endmenu
 
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 3d0e23ff9e74..ecabe6603e08 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -9,10 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
-obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
 obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
-obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
-obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
 obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
 
@@ -29,15 +26,11 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
 sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
 sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
 sha512-arm-y	:= sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
-libblake2s-arm-y:= blake2s-core.o blake2s-glue.o
 blake2b-neon-y  := blake2b-neon-core.o blake2b-neon-glue.o
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y	:= ghash-ce-core.o ghash-ce-glue.o
-chacha-neon-y := chacha-scalar-core.o chacha-glue.o
-chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
-poly1305-arm-y := poly1305-core.o poly1305-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
 curve25519-neon-y := curve25519-core.o curve25519-glue.o
 
@@ -47,14 +40,9 @@ quiet_cmd_perl = PERL    $@
 $(obj)/%-core.S: $(src)/%-armv4.pl
 	$(call cmd,perl)
 
-clean-files += poly1305-core.S sha256-core.S sha512-core.S
+clean-files += sha256-core.S sha512-core.S
 
 aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
 
 AFLAGS_sha256-core.o += $(aflags-thumb2-y)
 AFLAGS_sha512-core.o += $(aflags-thumb2-y)
-
-# massage the perlasm code a bit so we only get the NEON routine if we need it
-poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
-poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
-AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
deleted file mode 100644
index df40e46601f1..000000000000
--- a/arch/arm/crypto/blake2s-core.S
+++ /dev/null
@@ -1,306 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * BLAKE2s digest algorithm, ARM scalar implementation
- *
- * Copyright 2020 Google LLC
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-	// Registers used to hold message words temporarily.  There aren't
-	// enough ARM registers to hold the whole message block, so we have to
-	// load the words on-demand.
-	M_0		.req	r12
-	M_1		.req	r14
-
-// The BLAKE2s initialization vector
-.Lblake2s_IV:
-	.word	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
-	.word	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
-
-.macro __ldrd		a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
-	ldrd		\a, \b, [\src, #\offset]
-#else
-	ldr		\a, [\src, #\offset]
-	ldr		\b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd		a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
-	strd		\a, \b, [\dst, #\offset]
-#else
-	str		\a, [\dst, #\offset]
-	str		\b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _le32_bswap	a, tmp
-#ifdef __ARMEB__
-	rev_l		\a, \tmp
-#endif
-.endm
-
-.macro _le32_bswap_8x	a, b, c, d, e, f, g, h,  tmp
-	_le32_bswap	\a, \tmp
-	_le32_bswap	\b, \tmp
-	_le32_bswap	\c, \tmp
-	_le32_bswap	\d, \tmp
-	_le32_bswap	\e, \tmp
-	_le32_bswap	\f, \tmp
-	_le32_bswap	\g, \tmp
-	_le32_bswap	\h, \tmp
-.endm
-
-// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
-// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
-// columns/diagonals.  s0-s1 are the word offsets to the message words the first
-// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
-// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
-//
-// Note that to save instructions, the rotations don't happen when the
-// pseudocode says they should, but rather they are delayed until the values are
-// used.  See the comment above _blake2s_round().
-.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3
-
-	ldr		M_0, [sp, #32 + 4 * \s0]
-	ldr		M_1, [sp, #32 + 4 * \s2]
-
-	// a += b + m[blake2s_sigma[r][2*i + 0]];
-	add		\a0, \a0, \b0, ror #brot
-	add		\a1, \a1, \b1, ror #brot
-	add		\a0, \a0, M_0
-	add		\a1, \a1, M_1
-
-	// d = ror32(d ^ a, 16);
-	eor		\d0, \a0, \d0, ror #drot
-	eor		\d1, \a1, \d1, ror #drot
-
-	// c += d;
-	add		\c0, \c0, \d0, ror #16
-	add		\c1, \c1, \d1, ror #16
-
-	// b = ror32(b ^ c, 12);
-	eor		\b0, \c0, \b0, ror #brot
-	eor		\b1, \c1, \b1, ror #brot
-
-	ldr		M_0, [sp, #32 + 4 * \s1]
-	ldr		M_1, [sp, #32 + 4 * \s3]
-
-	// a += b + m[blake2s_sigma[r][2*i + 1]];
-	add		\a0, \a0, \b0, ror #12
-	add		\a1, \a1, \b1, ror #12
-	add		\a0, \a0, M_0
-	add		\a1, \a1, M_1
-
-	// d = ror32(d ^ a, 8);
-	eor		\d0, \a0, \d0, ror#16
-	eor		\d1, \a1, \d1, ror#16
-
-	// c += d;
-	add		\c0, \c0, \d0, ror#8
-	add		\c1, \c1, \d1, ror#8
-
-	// b = ror32(b ^ c, 7);
-	eor		\b0, \c0, \b0, ror#12
-	eor		\b1, \c1, \b1, ror#12
-.endm
-
-// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
-// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
-// spilling v[8..9], then to v[9..15], then to the message block.  r10-r12 and
-// r14 are free to use.  The macro arguments s0-s15 give the order in which the
-// message words are used in this round.
-//
-// All rotates are performed using the implicit rotate operand accepted by the
-// 'add' and 'eor' instructions.  This is faster than using explicit rotate
-// instructions.  To make this work, we allow the values in the second and last
-// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
-// wrong rotation amount.  The rotation amount is then fixed up just in time
-// when the values are used.  'brot' is the number of bits the values in row 'b'
-// need to be rotated right to arrive at the correct values, and 'drot'
-// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
-// that they end up as (7, 8) after every round.
-.macro	_blake2s_round	s0, s1, s2, s3, s4, s5, s6, s7, \
-			s8, s9, s10, s11, s12, s13, s14, s15
-
-	// Mix first two columns:
-	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
-	__ldrd		r10, r11, sp, 16	// load v[12] and v[13]
-	_blake2s_quarterround	r0, r4, r8, r10,  r1, r5, r9, r11, \
-				\s0, \s1, \s2, \s3
-	__strd		r8, r9, sp, 0
-	__strd		r10, r11, sp, 16
-
-	// Mix second two columns:
-	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
-	__ldrd		r8, r9, sp, 8		// load v[10] and v[11]
-	__ldrd		r10, r11, sp, 24	// load v[14] and v[15]
-	_blake2s_quarterround	r2, r6, r8, r10,  r3, r7, r9, r11, \
-				\s4, \s5, \s6, \s7
-	str		r10, [sp, #24]		// store v[14]
-	// v[10], v[11], and v[15] are used below, so no need to store them yet.
-
-	.set brot, 7
-	.set drot, 8
-
-	// Mix first two diagonals:
-	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
-	ldr		r10, [sp, #16]		// load v[12]
-	_blake2s_quarterround	r0, r5, r8, r11,  r1, r6, r9, r10, \
-				\s8, \s9, \s10, \s11
-	__strd		r8, r9, sp, 8
-	str		r11, [sp, #28]
-	str		r10, [sp, #16]
-
-	// Mix second two diagonals:
-	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
-	__ldrd		r8, r9, sp, 0		// load v[8] and v[9]
-	__ldrd		r10, r11, sp, 20	// load v[13] and v[14]
-	_blake2s_quarterround	r2, r7, r8, r10,  r3, r4, r9, r11, \
-				\s12, \s13, \s14, \s15
-	__strd		r10, r11, sp, 20
-.endm
-
-//
-// void blake2s_compress(struct blake2s_state *state,
-//			 const u8 *block, size_t nblocks, u32 inc);
-//
-// Only the first three fields of struct blake2s_state are used:
-//	u32 h[8];	(inout)
-//	u32 t[2];	(inout)
-//	u32 f[2];	(in)
-//
-	.align		5
-ENTRY(blake2s_compress)
-	push		{r0-r2,r4-r11,lr}	// keep this an even number
-
-.Lnext_block:
-	// r0 is 'state'
-	// r1 is 'block'
-	// r3 is 'inc'
-
-	// Load and increment the counter t[0..1].
-	__ldrd		r10, r11, r0, 32
-	adds		r10, r10, r3
-	adc		r11, r11, #0
-	__strd		r10, r11, r0, 32
-
-	// _blake2s_round is very short on registers, so copy the message block
-	// to the stack to save a register during the rounds.  This also has the
-	// advantage that misalignment only needs to be dealt with in one place.
-	sub		sp, sp, #64
-	mov		r12, sp
-	tst		r1, #3
-	bne		.Lcopy_block_misaligned
-	ldmia		r1!, {r2-r9}
-	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
-	stmia		r12!, {r2-r9}
-	ldmia		r1!, {r2-r9}
-	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
-	stmia		r12, {r2-r9}
-.Lcopy_block_done:
-	str		r1, [sp, #68]		// Update message pointer
-
-	// Calculate v[8..15].  Push v[9..15] onto the stack, and leave space
-	// for spilling v[8..9].  Leave v[8..9] in r8-r9.
-	mov		r14, r0			// r14 = state
-	adr		r12, .Lblake2s_IV
-	ldmia		r12!, {r8-r9}		// load IV[0..1]
-	__ldrd		r0, r1, r14, 40		// load f[0..1]
-	ldm		r12, {r2-r7}		// load IV[3..7]
-	eor		r4, r4, r10		// v[12] = IV[4] ^ t[0]
-	eor		r5, r5, r11		// v[13] = IV[5] ^ t[1]
-	eor		r6, r6, r0		// v[14] = IV[6] ^ f[0]
-	eor		r7, r7, r1		// v[15] = IV[7] ^ f[1]
-	push		{r2-r7}			// push v[9..15]
-	sub		sp, sp, #8		// leave space for v[8..9]
-
-	// Load h[0..7] == v[0..7].
-	ldm		r14, {r0-r7}
-
-	// Execute the rounds.  Each round is provided the order in which it
-	// needs to use the message words.
-	.set brot, 0
-	.set drot, 0
-	_blake2s_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-	_blake2s_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
-	_blake2s_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
-	_blake2s_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
-	_blake2s_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
-	_blake2s_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
-	_blake2s_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
-	_blake2s_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
-	_blake2s_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
-	_blake2s_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
-
-	// Fold the final state matrix into the hash chaining value:
-	//
-	//	for (i = 0; i < 8; i++)
-	//		h[i] ^= v[i] ^ v[i + 8];
-	//
-	ldr		r14, [sp, #96]		// r14 = &h[0]
-	add		sp, sp, #8		// v[8..9] are already loaded.
-	pop		{r10-r11}		// load v[10..11]
-	eor		r0, r0, r8
-	eor		r1, r1, r9
-	eor		r2, r2, r10
-	eor		r3, r3, r11
-	ldm		r14, {r8-r11}		// load h[0..3]
-	eor		r0, r0, r8
-	eor		r1, r1, r9
-	eor		r2, r2, r10
-	eor		r3, r3, r11
-	stmia		r14!, {r0-r3}		// store new h[0..3]
-	ldm		r14, {r0-r3}		// load old h[4..7]
-	pop		{r8-r11}		// load v[12..15]
-	eor		r0, r0, r4, ror #brot
-	eor		r1, r1, r5, ror #brot
-	eor		r2, r2, r6, ror #brot
-	eor		r3, r3, r7, ror #brot
-	eor		r0, r0, r8, ror #drot
-	eor		r1, r1, r9, ror #drot
-	eor		r2, r2, r10, ror #drot
-	eor		r3, r3, r11, ror #drot
-	  add		sp, sp, #64		// skip copy of message block
-	stm		r14, {r0-r3}		// store new h[4..7]
-
-	// Advance to the next block, if there is one.  Note that if there are
-	// multiple blocks, then 'inc' (the counter increment amount) must be
-	// 64.  So we can simply set it to 64 without re-loading it.
-	ldm		sp, {r0, r1, r2}	// load (state, block, nblocks)
-	mov		r3, #64			// set 'inc'
-	subs		r2, r2, #1		// nblocks--
-	str		r2, [sp, #8]
-	bne		.Lnext_block		// nblocks != 0?
-
-	pop		{r0-r2,r4-r11,pc}
-
-	// The next message block (pointed to by r1) isn't 4-byte aligned, so it
-	// can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
-	// by r12) using an alternative method.  r2-r9 are free to use.
-.Lcopy_block_misaligned:
-	mov		r2, #64
-1:
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-	ldr		r3, [r1], #4
-	_le32_bswap	r3, r4
-#else
-	ldrb		r3, [r1, #0]
-	ldrb		r4, [r1, #1]
-	ldrb		r5, [r1, #2]
-	ldrb		r6, [r1, #3]
-	add		r1, r1, #4
-	orr		r3, r3, r4, lsl #8
-	orr		r3, r3, r5, lsl #16
-	orr		r3, r3, r6, lsl #24
-#endif
-	subs		r2, r2, #4
-	str		r3, [r12], #4
-	bne		1b
-	b		.Lcopy_block_done
-ENDPROC(blake2s_compress)
diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
deleted file mode 100644
index 0238a70d9581..000000000000
--- a/arch/arm/crypto/blake2s-glue.c
+++ /dev/null
@@ -1,7 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-
-#include <crypto/internal/blake2s.h>
-#include <linux/module.h>
-
-/* defined in blake2s-core.S */
-EXPORT_SYMBOL(blake2s_compress);
diff --git a/arch/arm/crypto/chacha-glue.c b/arch/arm/crypto/chacha-glue.c
deleted file mode 100644
index 12afb40cf1ff..000000000000
--- a/arch/arm/crypto/chacha-glue.c
+++ /dev/null
@@ -1,134 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * ChaCha and HChaCha functions (ARM optimized)
- *
- * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <crypto/chacha.h>
-#include <crypto/internal/simd.h>
-#include <linux/jump_label.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-#include <asm/cputype.h>
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-
-asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
-				      int nrounds);
-asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
-				       int nrounds, unsigned int nbytes);
-asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
-asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
-
-asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
-			     const u32 *state, int nrounds);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
-
-static inline bool neon_usable(void)
-{
-	return static_branch_likely(&use_neon) && crypto_simd_usable();
-}
-
-static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
-			  unsigned int bytes, int nrounds)
-{
-	u8 buf[CHACHA_BLOCK_SIZE];
-
-	while (bytes > CHACHA_BLOCK_SIZE) {
-		unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
-
-		chacha_4block_xor_neon(state, dst, src, nrounds, l);
-		bytes -= l;
-		src += l;
-		dst += l;
-		state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
-	}
-	if (bytes) {
-		const u8 *s = src;
-		u8 *d = dst;
-
-		if (bytes != CHACHA_BLOCK_SIZE)
-			s = d = memcpy(buf, src, bytes);
-		chacha_block_xor_neon(state, d, s, nrounds);
-		if (d != dst)
-			memcpy(dst, buf, bytes);
-		state[12]++;
-	}
-}
-
-void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
-{
-	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
-		hchacha_block_arm(state, stream, nrounds);
-	} else {
-		kernel_neon_begin();
-		hchacha_block_neon(state, stream, nrounds);
-		kernel_neon_end();
-	}
-}
-EXPORT_SYMBOL(hchacha_block_arch);
-
-void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
-		       int nrounds)
-{
-	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
-	    bytes <= CHACHA_BLOCK_SIZE) {
-		chacha_doarm(dst, src, bytes, state, nrounds);
-		state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
-		return;
-	}
-
-	do {
-		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-
-		kernel_neon_begin();
-		chacha_doneon(state, dst, src, todo, nrounds);
-		kernel_neon_end();
-
-		bytes -= todo;
-		src += todo;
-		dst += todo;
-	} while (bytes);
-}
-EXPORT_SYMBOL(chacha_crypt_arch);
-
-bool chacha_is_arch_optimized(void)
-{
-	/* We always can use at least the ARM scalar implementation. */
-	return true;
-}
-EXPORT_SYMBOL(chacha_is_arch_optimized);
-
-static int __init chacha_arm_mod_init(void)
-{
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
-		switch (read_cpuid_part()) {
-		case ARM_CPU_PART_CORTEX_A7:
-		case ARM_CPU_PART_CORTEX_A5:
-			/*
-			 * The Cortex-A7 and Cortex-A5 do not perform well with
-			 * the NEON implementation but do incredibly with the
-			 * scalar one and use less power.
-			 */
-			break;
-		default:
-			static_branch_enable(&use_neon);
-		}
-	}
-	return 0;
-}
-arch_initcall(chacha_arm_mod_init);
-
-static void __exit chacha_arm_mod_exit(void)
-{
-}
-module_exit(chacha_arm_mod_exit);
-
-MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
-MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/crypto/chacha-neon-core.S b/arch/arm/crypto/chacha-neon-core.S
deleted file mode 100644
index ddd62b6294a5..000000000000
--- a/arch/arm/crypto/chacha-neon-core.S
+++ /dev/null
@@ -1,643 +0,0 @@
-/*
- * ChaCha/HChaCha NEON helper functions
- *
- * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on:
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
- /*
-  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
-  *
-  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
-  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
-  * (c)  vrev32.16			(16-bit rotations only)
-  * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
-  *					 needs index vector)
-  *
-  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
-  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
-  * cycles of (b) on both Cortex-A7 and Cortex-A53.
-  *
-  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
-  * and doesn't need a temporary register.
-  *
-  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
-  * is twice as fast as (a), even when doing (a) on multiple registers
-  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
-  * parallelizes better when temporary registers are scarce.
-  *
-  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
-  * (a), so the need to load the rotation table actually makes the vtbl method
-  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
-  * seems to be a good compromise to get a more significant speed boost on some
-  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
-  */
-
-#include <linux/linkage.h>
-#include <asm/cache.h>
-
-	.text
-	.fpu		neon
-	.align		5
-
-/*
- * chacha_permute - permute one block
- *
- * Permute one 64-byte block where the state matrix is stored in the four NEON
- * registers q0-q3.  It performs matrix operations on four words in parallel,
- * but requires shuffling to rearrange the words after each round.
- *
- * The round count is given in r3.
- *
- * Clobbers: r3, ip, q4-q5
- */
-chacha_permute:
-
-	adr		ip, .Lrol8_table
-	vld1.8		{d10}, [ip, :64]
-
-.Ldoubleround:
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vadd.i32	q0, q0, q1
-	veor		q3, q3, q0
-	vrev32.16	q3, q3
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vadd.i32	q2, q2, q3
-	veor		q4, q1, q2
-	vshl.u32	q1, q4, #12
-	vsri.u32	q1, q4, #20
-
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vadd.i32	q0, q0, q1
-	veor		q3, q3, q0
-	vtbl.8		d6, {d6}, d10
-	vtbl.8		d7, {d7}, d10
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vadd.i32	q2, q2, q3
-	veor		q4, q1, q2
-	vshl.u32	q1, q4, #7
-	vsri.u32	q1, q4, #25
-
-	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
-	vext.8		q1, q1, q1, #4
-	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vext.8		q2, q2, q2, #8
-	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
-	vext.8		q3, q3, q3, #12
-
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
-	vadd.i32	q0, q0, q1
-	veor		q3, q3, q0
-	vrev32.16	q3, q3
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
-	vadd.i32	q2, q2, q3
-	veor		q4, q1, q2
-	vshl.u32	q1, q4, #12
-	vsri.u32	q1, q4, #20
-
-	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
-	vadd.i32	q0, q0, q1
-	veor		q3, q3, q0
-	vtbl.8		d6, {d6}, d10
-	vtbl.8		d7, {d7}, d10
-
-	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
-	vadd.i32	q2, q2, q3
-	veor		q4, q1, q2
-	vshl.u32	q1, q4, #7
-	vsri.u32	q1, q4, #25
-
-	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
-	vext.8		q1, q1, q1, #12
-	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
-	vext.8		q2, q2, q2, #8
-	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
-	vext.8		q3, q3, q3, #4
-
-	subs		r3, r3, #2
-	bne		.Ldoubleround
-
-	bx		lr
-ENDPROC(chacha_permute)
-
-ENTRY(chacha_block_xor_neon)
-	// r0: Input state matrix, s
-	// r1: 1 data block output, o
-	// r2: 1 data block input, i
-	// r3: nrounds
-	push		{lr}
-
-	// x0..3 = s0..3
-	add		ip, r0, #0x20
-	vld1.32		{q0-q1}, [r0]
-	vld1.32		{q2-q3}, [ip]
-
-	vmov		q8, q0
-	vmov		q9, q1
-	vmov		q10, q2
-	vmov		q11, q3
-
-	bl		chacha_permute
-
-	add		ip, r2, #0x20
-	vld1.8		{q4-q5}, [r2]
-	vld1.8		{q6-q7}, [ip]
-
-	// o0 = i0 ^ (x0 + s0)
-	vadd.i32	q0, q0, q8
-	veor		q0, q0, q4
-
-	// o1 = i1 ^ (x1 + s1)
-	vadd.i32	q1, q1, q9
-	veor		q1, q1, q5
-
-	// o2 = i2 ^ (x2 + s2)
-	vadd.i32	q2, q2, q10
-	veor		q2, q2, q6
-
-	// o3 = i3 ^ (x3 + s3)
-	vadd.i32	q3, q3, q11
-	veor		q3, q3, q7
-
-	add		ip, r1, #0x20
-	vst1.8		{q0-q1}, [r1]
-	vst1.8		{q2-q3}, [ip]
-
-	pop		{pc}
-ENDPROC(chacha_block_xor_neon)
-
-ENTRY(hchacha_block_neon)
-	// r0: Input state matrix, s
-	// r1: output (8 32-bit words)
-	// r2: nrounds
-	push		{lr}
-
-	vld1.32		{q0-q1}, [r0]!
-	vld1.32		{q2-q3}, [r0]
-
-	mov		r3, r2
-	bl		chacha_permute
-
-	vst1.32		{q0}, [r1]!
-	vst1.32		{q3}, [r1]
-
-	pop		{pc}
-ENDPROC(hchacha_block_neon)
-
-	.align		4
-.Lctrinc:	.word	0, 1, 2, 3
-.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
-
-	.align		5
-ENTRY(chacha_4block_xor_neon)
-	push		{r4, lr}
-	mov		r4, sp			// preserve the stack pointer
-	sub		ip, sp, #0x20		// allocate a 32 byte buffer
-	bic		ip, ip, #0x1f		// aligned to 32 bytes
-	mov		sp, ip
-
-	// r0: Input state matrix, s
-	// r1: 4 data blocks output, o
-	// r2: 4 data blocks input, i
-	// r3: nrounds
-
-	//
-	// This function encrypts four consecutive ChaCha blocks by loading
-	// the state matrix in NEON registers four times. The algorithm performs
-	// each operation on the corresponding word of each state matrix, hence
-	// requires no word shuffling. The words are re-interleaved before the
-	// final addition of the original state and the XORing step.
-	//
-
-	// x0..15[0-3] = s0..15[0-3]
-	add		ip, r0, #0x20
-	vld1.32		{q0-q1}, [r0]
-	vld1.32		{q2-q3}, [ip]
-
-	adr		lr, .Lctrinc
-	vdup.32		q15, d7[1]
-	vdup.32		q14, d7[0]
-	vld1.32		{q4}, [lr, :128]
-	vdup.32		q13, d6[1]
-	vdup.32		q12, d6[0]
-	vdup.32		q11, d5[1]
-	vdup.32		q10, d5[0]
-	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
-	vdup.32		q9, d4[1]
-	vdup.32		q8, d4[0]
-	vdup.32		q7, d3[1]
-	vdup.32		q6, d3[0]
-	vdup.32		q5, d2[1]
-	vdup.32		q4, d2[0]
-	vdup.32		q3, d1[1]
-	vdup.32		q2, d1[0]
-	vdup.32		q1, d0[1]
-	vdup.32		q0, d0[0]
-
-	adr		ip, .Lrol8_table
-	b		1f
-
-.Ldoubleround4:
-	vld1.32		{q8-q9}, [sp, :256]
-1:
-	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
-	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
-	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
-	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
-	vadd.i32	q0, q0, q4
-	vadd.i32	q1, q1, q5
-	vadd.i32	q2, q2, q6
-	vadd.i32	q3, q3, q7
-
-	veor		q12, q12, q0
-	veor		q13, q13, q1
-	veor		q14, q14, q2
-	veor		q15, q15, q3
-
-	vrev32.16	q12, q12
-	vrev32.16	q13, q13
-	vrev32.16	q14, q14
-	vrev32.16	q15, q15
-
-	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
-	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
-	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
-	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
-	vadd.i32	q8, q8, q12
-	vadd.i32	q9, q9, q13
-	vadd.i32	q10, q10, q14
-	vadd.i32	q11, q11, q15
-
-	vst1.32		{q8-q9}, [sp, :256]
-
-	veor		q8, q4, q8
-	veor		q9, q5, q9
-	vshl.u32	q4, q8, #12
-	vshl.u32	q5, q9, #12
-	vsri.u32	q4, q8, #20
-	vsri.u32	q5, q9, #20
-
-	veor		q8, q6, q10
-	veor		q9, q7, q11
-	vshl.u32	q6, q8, #12
-	vshl.u32	q7, q9, #12
-	vsri.u32	q6, q8, #20
-	vsri.u32	q7, q9, #20
-
-	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
-	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
-	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
-	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
-	vld1.8		{d16}, [ip, :64]
-	vadd.i32	q0, q0, q4
-	vadd.i32	q1, q1, q5
-	vadd.i32	q2, q2, q6
-	vadd.i32	q3, q3, q7
-
-	veor		q12, q12, q0
-	veor		q13, q13, q1
-	veor		q14, q14, q2
-	veor		q15, q15, q3
-
-	vtbl.8		d24, {d24}, d16
-	vtbl.8		d25, {d25}, d16
-	vtbl.8		d26, {d26}, d16
-	vtbl.8		d27, {d27}, d16
-	vtbl.8		d28, {d28}, d16
-	vtbl.8		d29, {d29}, d16
-	vtbl.8		d30, {d30}, d16
-	vtbl.8		d31, {d31}, d16
-
-	vld1.32		{q8-q9}, [sp, :256]
-
-	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
-	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
-	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
-	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
-	vadd.i32	q8, q8, q12
-	vadd.i32	q9, q9, q13
-	vadd.i32	q10, q10, q14
-	vadd.i32	q11, q11, q15
-
-	vst1.32		{q8-q9}, [sp, :256]
-
-	veor		q8, q4, q8
-	veor		q9, q5, q9
-	vshl.u32	q4, q8, #7
-	vshl.u32	q5, q9, #7
-	vsri.u32	q4, q8, #25
-	vsri.u32	q5, q9, #25
-
-	veor		q8, q6, q10
-	veor		q9, q7, q11
-	vshl.u32	q6, q8, #7
-	vshl.u32	q7, q9, #7
-	vsri.u32	q6, q8, #25
-	vsri.u32	q7, q9, #25
-
-	vld1.32		{q8-q9}, [sp, :256]
-
-	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
-	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
-	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
-	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
-	vadd.i32	q0, q0, q5
-	vadd.i32	q1, q1, q6
-	vadd.i32	q2, q2, q7
-	vadd.i32	q3, q3, q4
-
-	veor		q15, q15, q0
-	veor		q12, q12, q1
-	veor		q13, q13, q2
-	veor		q14, q14, q3
-
-	vrev32.16	q15, q15
-	vrev32.16	q12, q12
-	vrev32.16	q13, q13
-	vrev32.16	q14, q14
-
-	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
-	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
-	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
-	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
-	vadd.i32	q10, q10, q15
-	vadd.i32	q11, q11, q12
-	vadd.i32	q8, q8, q13
-	vadd.i32	q9, q9, q14
-
-	vst1.32		{q8-q9}, [sp, :256]
-
-	veor		q8, q7, q8
-	veor		q9, q4, q9
-	vshl.u32	q7, q8, #12
-	vshl.u32	q4, q9, #12
-	vsri.u32	q7, q8, #20
-	vsri.u32	q4, q9, #20
-
-	veor		q8, q5, q10
-	veor		q9, q6, q11
-	vshl.u32	q5, q8, #12
-	vshl.u32	q6, q9, #12
-	vsri.u32	q5, q8, #20
-	vsri.u32	q6, q9, #20
-
-	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
-	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
-	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
-	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
-	vld1.8		{d16}, [ip, :64]
-	vadd.i32	q0, q0, q5
-	vadd.i32	q1, q1, q6
-	vadd.i32	q2, q2, q7
-	vadd.i32	q3, q3, q4
-
-	veor		q15, q15, q0
-	veor		q12, q12, q1
-	veor		q13, q13, q2
-	veor		q14, q14, q3
-
-	vtbl.8		d30, {d30}, d16
-	vtbl.8		d31, {d31}, d16
-	vtbl.8		d24, {d24}, d16
-	vtbl.8		d25, {d25}, d16
-	vtbl.8		d26, {d26}, d16
-	vtbl.8		d27, {d27}, d16
-	vtbl.8		d28, {d28}, d16
-	vtbl.8		d29, {d29}, d16
-
-	vld1.32		{q8-q9}, [sp, :256]
-
-	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
-	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
-	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
-	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
-	vadd.i32	q10, q10, q15
-	vadd.i32	q11, q11, q12
-	vadd.i32	q8, q8, q13
-	vadd.i32	q9, q9, q14
-
-	vst1.32		{q8-q9}, [sp, :256]
-
-	veor		q8, q7, q8
-	veor		q9, q4, q9
-	vshl.u32	q7, q8, #7
-	vshl.u32	q4, q9, #7
-	vsri.u32	q7, q8, #25
-	vsri.u32	q4, q9, #25
-
-	veor		q8, q5, q10
-	veor		q9, q6, q11
-	vshl.u32	q5, q8, #7
-	vshl.u32	q6, q9, #7
-	vsri.u32	q5, q8, #25
-	vsri.u32	q6, q9, #25
-
-	subs		r3, r3, #2
-	bne		.Ldoubleround4
-
-	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
-	// x8..9[0-3] are on the stack.
-
-	// Re-interleave the words in the first two rows of each block (x0..7).
-	// Also add the counter values 0-3 to x12[0-3].
-	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
-	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
-	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
-	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
-	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
-	  vadd.u32	q12, q8			// x12 += counter values 0-3
-	vswp		d1, d4
-	vswp		d3, d6
-	  vld1.32	{q8-q9}, [r0]!		// load s0..7
-	vswp		d9, d12
-	vswp		d11, d14
-
-	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
-	// after XORing the first 32 bytes.
-	vswp		q1, q4
-
-	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
-
-	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
-	vadd.u32	q0, q0, q8
-	vadd.u32	q2, q2, q8
-	vadd.u32	q4, q4, q8
-	vadd.u32	q3, q3, q8
-
-	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
-	vadd.u32	q1, q1, q9
-	vadd.u32	q6, q6, q9
-	vadd.u32	q5, q5, q9
-	vadd.u32	q7, q7, q9
-
-	// XOR first 32 bytes using keystream from first two rows of first block
-	vld1.8		{q8-q9}, [r2]!
-	veor		q8, q8, q0
-	veor		q9, q9, q1
-	vst1.8		{q8-q9}, [r1]!
-
-	// Re-interleave the words in the last two rows of each block (x8..15).
-	vld1.32		{q8-q9}, [sp, :256]
-	  mov		sp, r4		// restore original stack pointer
-	  ldr		r4, [r4, #8]	// load number of bytes
-	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
-	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
-	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
-	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
-	  vld1.32	{q0-q1}, [r0]	// load s8..15
-	vswp		d25, d28
-	vswp		d27, d30
-	vswp		d17, d20
-	vswp		d19, d22
-
-	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
-
-	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
-	vadd.u32	q8,  q8,  q0
-	vadd.u32	q10, q10, q0
-	vadd.u32	q9,  q9,  q0
-	vadd.u32	q11, q11, q0
-
-	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
-	vadd.u32	q12, q12, q1
-	vadd.u32	q14, q14, q1
-	vadd.u32	q13, q13, q1
-	vadd.u32	q15, q15, q1
-
-	// XOR the rest of the data with the keystream
-
-	vld1.8		{q0-q1}, [r2]!
-	subs		r4, r4, #96
-	veor		q0, q0, q8
-	veor		q1, q1, q12
-	ble		.Lle96
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]!
-	subs		r4, r4, #32
-	veor		q0, q0, q2
-	veor		q1, q1, q6
-	ble		.Lle128
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]!
-	subs		r4, r4, #32
-	veor		q0, q0, q10
-	veor		q1, q1, q14
-	ble		.Lle160
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]!
-	subs		r4, r4, #32
-	veor		q0, q0, q4
-	veor		q1, q1, q5
-	ble		.Lle192
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]!
-	subs		r4, r4, #32
-	veor		q0, q0, q9
-	veor		q1, q1, q13
-	ble		.Lle224
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]!
-	subs		r4, r4, #32
-	veor		q0, q0, q3
-	veor		q1, q1, q7
-	blt		.Llt256
-.Lout:
-	vst1.8		{q0-q1}, [r1]!
-
-	vld1.8		{q0-q1}, [r2]
-	veor		q0, q0, q11
-	veor		q1, q1, q15
-	vst1.8		{q0-q1}, [r1]
-
-	pop		{r4, pc}
-
-.Lle192:
-	vmov		q4, q9
-	vmov		q5, q13
-
-.Lle160:
-	// nothing to do
-
-.Lfinalblock:
-	// Process the final block if processing less than 4 full blocks.
-	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
-	// previous 32 byte output block that still needs to be written at
-	// [r1] in q0-q1.
-	beq		.Lfullblock
-
-.Lpartialblock:
-	adr		lr, .Lpermute + 32
-	add		r2, r2, r4
-	add		lr, lr, r4
-	add		r4, r4, r1
-
-	vld1.8		{q2-q3}, [lr]
-	vld1.8		{q6-q7}, [r2]
-
-	add		r4, r4, #32
-
-	vtbl.8		d4, {q4-q5}, d4
-	vtbl.8		d5, {q4-q5}, d5
-	vtbl.8		d6, {q4-q5}, d6
-	vtbl.8		d7, {q4-q5}, d7
-
-	veor		q6, q6, q2
-	veor		q7, q7, q3
-
-	vst1.8		{q6-q7}, [r4]	// overlapping stores
-	vst1.8		{q0-q1}, [r1]
-	pop		{r4, pc}
-
-.Lfullblock:
-	vmov		q11, q4
-	vmov		q15, q5
-	b		.Lout
-.Lle96:
-	vmov		q4, q2
-	vmov		q5, q6
-	b		.Lfinalblock
-.Lle128:
-	vmov		q4, q10
-	vmov		q5, q14
-	b		.Lfinalblock
-.Lle224:
-	vmov		q4, q3
-	vmov		q5, q7
-	b		.Lfinalblock
-.Llt256:
-	vmov		q4, q11
-	vmov		q5, q15
-	b		.Lpartialblock
-ENDPROC(chacha_4block_xor_neon)
-
-	.align		L1_CACHE_SHIFT
-.Lpermute:
-	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
-	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
-	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
-	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
-	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
-	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
-	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
diff --git a/arch/arm/crypto/chacha-scalar-core.S b/arch/arm/crypto/chacha-scalar-core.S
deleted file mode 100644
index 083fe1ab96d0..000000000000
--- a/arch/arm/crypto/chacha-scalar-core.S
+++ /dev/null
@@ -1,443 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2018 Google, Inc.
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-/*
- * Design notes:
- *
- * 16 registers would be needed to hold the state matrix, but only 14 are
- * available because 'sp' and 'pc' cannot be used.  So we spill the elements
- * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
- * 'ldrd' and one 'strd' instruction per round.
- *
- * All rotates are performed using the implicit rotate operand accepted by the
- * 'add' and 'eor' instructions.  This is faster than using explicit rotate
- * instructions.  To make this work, we allow the values in the second and last
- * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
- * wrong rotation amount.  The rotation amount is then fixed up just in time
- * when the values are used.  'brot' is the number of bits the values in row 'b'
- * need to be rotated right to arrive at the correct values, and 'drot'
- * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
- * that they end up as (25, 24) after every round.
- */
-
-	// ChaCha state registers
-	X0	.req	r0
-	X1	.req	r1
-	X2	.req	r2
-	X3	.req	r3
-	X4	.req	r4
-	X5	.req	r5
-	X6	.req	r6
-	X7	.req	r7
-	X8_X10	.req	r8	// shared by x8 and x10
-	X9_X11	.req	r9	// shared by x9 and x11
-	X12	.req	r10
-	X13	.req	r11
-	X14	.req	r12
-	X15	.req	r14
-
-.macro _le32_bswap_4x	a, b, c, d,  tmp
-#ifdef __ARMEB__
-	rev_l		\a,  \tmp
-	rev_l		\b,  \tmp
-	rev_l		\c,  \tmp
-	rev_l		\d,  \tmp
-#endif
-.endm
-
-.macro __ldrd		a, b, src, offset
-#if __LINUX_ARM_ARCH__ >= 6
-	ldrd		\a, \b, [\src, #\offset]
-#else
-	ldr		\a, [\src, #\offset]
-	ldr		\b, [\src, #\offset + 4]
-#endif
-.endm
-
-.macro __strd		a, b, dst, offset
-#if __LINUX_ARM_ARCH__ >= 6
-	strd		\a, \b, [\dst, #\offset]
-#else
-	str		\a, [\dst, #\offset]
-	str		\b, [\dst, #\offset + 4]
-#endif
-.endm
-
-.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2
-
-	// a += b; d ^= a; d = rol(d, 16);
-	add		\a1, \a1, \b1, ror #brot
-	add		\a2, \a2, \b2, ror #brot
-	eor		\d1, \a1, \d1, ror #drot
-	eor		\d2, \a2, \d2, ror #drot
-	// drot == 32 - 16 == 16
-
-	// c += d; b ^= c; b = rol(b, 12);
-	add		\c1, \c1, \d1, ror #16
-	add		\c2, \c2, \d2, ror #16
-	eor		\b1, \c1, \b1, ror #brot
-	eor		\b2, \c2, \b2, ror #brot
-	// brot == 32 - 12 == 20
-
-	// a += b; d ^= a; d = rol(d, 8);
-	add		\a1, \a1, \b1, ror #20
-	add		\a2, \a2, \b2, ror #20
-	eor		\d1, \a1, \d1, ror #16
-	eor		\d2, \a2, \d2, ror #16
-	// drot == 32 - 8 == 24
-
-	// c += d; b ^= c; b = rol(b, 7);
-	add		\c1, \c1, \d1, ror #24
-	add		\c2, \c2, \d2, ror #24
-	eor		\b1, \c1, \b1, ror #20
-	eor		\b2, \c2, \b2, ror #20
-	// brot == 32 - 7 == 25
-.endm
-
-.macro _doubleround
-
-	// column round
-
-	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
-	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
-
-	// save (x8, x9); restore (x10, x11)
-	__strd		X8_X10, X9_X11, sp, 0
-	__ldrd		X8_X10, X9_X11, sp, 8
-
-	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
-	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
-
-	.set brot, 25
-	.set drot, 24
-
-	// diagonal round
-
-	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
-	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
-
-	// save (x10, x11); restore (x8, x9)
-	__strd		X8_X10, X9_X11, sp, 8
-	__ldrd		X8_X10, X9_X11, sp, 0
-
-	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
-	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
-.endm
-
-.macro _chacha_permute	nrounds
-	.set brot, 0
-	.set drot, 0
-	.rept \nrounds / 2
-	 _doubleround
-	.endr
-.endm
-
-.macro _chacha		nrounds
-
-.Lnext_block\@:
-	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
-	// Registers contain x0-x9,x12-x15.
-
-	// Do the core ChaCha permutation to update x0-x15.
-	_chacha_permute	\nrounds
-
-	add		sp, #8
-	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
-	// Registers contain x0-x9,x12-x15.
-	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
-	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
-	push		{X8_X10, X9_X11, X12, X13, X14, X15}
-
-	// Load (OUT, IN, LEN).
-	ldr		r14, [sp, #96]
-	ldr		r12, [sp, #100]
-	ldr		r11, [sp, #104]
-
-	orr		r10, r14, r12
-
-	// Use slow path if fewer than 64 bytes remain.
-	cmp		r11, #64
-	blt		.Lxor_slowpath\@
-
-	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
-	// ARMv6+, since ldmia and stmia (used below) still require alignment.
-	tst		r10, #3
-	bne		.Lxor_slowpath\@
-
-	// Fast path: XOR 64 bytes of aligned data.
-
-	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
-	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
-	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
-	// x0-x3
-	__ldrd		r8, r9, sp, 32
-	__ldrd		r10, r11, sp, 40
-	add		X0, X0, r8
-	add		X1, X1, r9
-	add		X2, X2, r10
-	add		X3, X3, r11
-	_le32_bswap_4x	X0, X1, X2, X3,  r8
-	ldmia		r12!, {r8-r11}
-	eor		X0, X0, r8
-	eor		X1, X1, r9
-	eor		X2, X2, r10
-	eor		X3, X3, r11
-	stmia		r14!, {X0-X3}
-
-	// x4-x7
-	__ldrd		r8, r9, sp, 48
-	__ldrd		r10, r11, sp, 56
-	add		X4, r8, X4, ror #brot
-	add		X5, r9, X5, ror #brot
-	ldmia		r12!, {X0-X3}
-	add		X6, r10, X6, ror #brot
-	add		X7, r11, X7, ror #brot
-	_le32_bswap_4x	X4, X5, X6, X7,  r8
-	eor		X4, X4, X0
-	eor		X5, X5, X1
-	eor		X6, X6, X2
-	eor		X7, X7, X3
-	stmia		r14!, {X4-X7}
-
-	// x8-x15
-	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
-	__ldrd		r8, r9, sp, 32
-	__ldrd		r10, r11, sp, 40
-	add		r0, r0, r8		// x8
-	add		r1, r1, r9		// x9
-	add		r6, r6, r10		// x10
-	add		r7, r7, r11		// x11
-	_le32_bswap_4x	r0, r1, r6, r7,  r8
-	ldmia		r12!, {r8-r11}
-	eor		r0, r0, r8		// x8
-	eor		r1, r1, r9		// x9
-	eor		r6, r6, r10		// x10
-	eor		r7, r7, r11		// x11
-	stmia		r14!, {r0,r1,r6,r7}
-	ldmia		r12!, {r0,r1,r6,r7}
-	__ldrd		r8, r9, sp, 48
-	__ldrd		r10, r11, sp, 56
-	add		r2, r8, r2, ror #drot	// x12
-	add		r3, r9, r3, ror #drot	// x13
-	add		r4, r10, r4, ror #drot	// x14
-	add		r5, r11, r5, ror #drot	// x15
-	_le32_bswap_4x	r2, r3, r4, r5,  r9
-	  ldr		r9, [sp, #72]		// load LEN
-	eor		r2, r2, r0		// x12
-	eor		r3, r3, r1		// x13
-	eor		r4, r4, r6		// x14
-	eor		r5, r5, r7		// x15
-	  subs		r9, #64			// decrement and check LEN
-	stmia		r14!, {r2-r5}
-
-	beq		.Ldone\@
-
-.Lprepare_for_next_block\@:
-
-	// Stack: x0-x15 OUT IN LEN
-
-	// Increment block counter (x12)
-	add		r8, #1
-
-	// Store updated (OUT, IN, LEN)
-	str		r14, [sp, #64]
-	str		r12, [sp, #68]
-	str		r9, [sp, #72]
-
-	  mov		r14, sp
-
-	// Store updated block counter (x12)
-	str		r8, [sp, #48]
-
-	  sub		sp, #16
-
-	// Reload state and do next block
-	ldmia		r14!, {r0-r11}		// load x0-x11
-	__strd		r10, r11, sp, 8		// store x10-x11 before state
-	ldmia		r14, {r10-r12,r14}	// load x12-x15
-	b		.Lnext_block\@
-
-.Lxor_slowpath\@:
-	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
-	// We handle it by storing the 64 bytes of keystream to the stack, then
-	// XOR-ing the needed portion with the data.
-
-	// Allocate keystream buffer
-	sub		sp, #64
-	mov		r14, sp
-
-	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
-	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
-	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
-
-	// Save keystream for x0-x3
-	__ldrd		r8, r9, sp, 96
-	__ldrd		r10, r11, sp, 104
-	add		X0, X0, r8
-	add		X1, X1, r9
-	add		X2, X2, r10
-	add		X3, X3, r11
-	_le32_bswap_4x	X0, X1, X2, X3,  r8
-	stmia		r14!, {X0-X3}
-
-	// Save keystream for x4-x7
-	__ldrd		r8, r9, sp, 112
-	__ldrd		r10, r11, sp, 120
-	add		X4, r8, X4, ror #brot
-	add		X5, r9, X5, ror #brot
-	add		X6, r10, X6, ror #brot
-	add		X7, r11, X7, ror #brot
-	_le32_bswap_4x	X4, X5, X6, X7,  r8
-	  add		r8, sp, #64
-	stmia		r14!, {X4-X7}
-
-	// Save keystream for x8-x15
-	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
-	__ldrd		r8, r9, sp, 128
-	__ldrd		r10, r11, sp, 136
-	add		r0, r0, r8		// x8
-	add		r1, r1, r9		// x9
-	add		r6, r6, r10		// x10
-	add		r7, r7, r11		// x11
-	_le32_bswap_4x	r0, r1, r6, r7,  r8
-	stmia		r14!, {r0,r1,r6,r7}
-	__ldrd		r8, r9, sp, 144
-	__ldrd		r10, r11, sp, 152
-	add		r2, r8, r2, ror #drot	// x12
-	add		r3, r9, r3, ror #drot	// x13
-	add		r4, r10, r4, ror #drot	// x14
-	add		r5, r11, r5, ror #drot	// x15
-	_le32_bswap_4x	r2, r3, r4, r5,  r9
-	stmia		r14, {r2-r5}
-
-	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
-	// Registers: r8 is block counter, r12 is IN.
-
-	ldr		r9, [sp, #168]		// LEN
-	ldr		r14, [sp, #160]		// OUT
-	cmp		r9, #64
-	  mov		r0, sp
-	movle		r1, r9
-	movgt		r1, #64
-	// r1 is number of bytes to XOR, in range [1, 64]
-
-.if __LINUX_ARM_ARCH__ < 6
-	orr		r2, r12, r14
-	tst		r2, #3			// IN or OUT misaligned?
-	bne		.Lxor_next_byte\@
-.endif
-
-	// XOR a word at a time
-.rept 16
-	subs		r1, #4
-	blt		.Lxor_words_done\@
-	ldr		r2, [r12], #4
-	ldr		r3, [r0], #4
-	eor		r2, r2, r3
-	str		r2, [r14], #4
-.endr
-	b		.Lxor_slowpath_done\@
-.Lxor_words_done\@:
-	ands		r1, r1, #3
-	beq		.Lxor_slowpath_done\@
-
-	// XOR a byte at a time
-.Lxor_next_byte\@:
-	ldrb		r2, [r12], #1
-	ldrb		r3, [r0], #1
-	eor		r2, r2, r3
-	strb		r2, [r14], #1
-	subs		r1, #1
-	bne		.Lxor_next_byte\@
-
-.Lxor_slowpath_done\@:
-	subs		r9, #64
-	add		sp, #96
-	bgt		.Lprepare_for_next_block\@
-
-.Ldone\@:
-.endm	// _chacha
-
-/*
- * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
- *		     const u32 *state, int nrounds);
- */
-ENTRY(chacha_doarm)
-	cmp		r2, #0			// len == 0?
-	reteq		lr
-
-	ldr		ip, [sp]
-	cmp		ip, #12
-
-	push		{r0-r2,r4-r11,lr}
-
-	// Push state x0-x15 onto stack.
-	// Also store an extra copy of x10-x11 just before the state.
-
-	add		X12, r3, #48
-	ldm		X12, {X12,X13,X14,X15}
-	push		{X12,X13,X14,X15}
-	sub		sp, sp, #64
-
-	__ldrd		X8_X10, X9_X11, r3, 40
-	__strd		X8_X10, X9_X11, sp, 8
-	__strd		X8_X10, X9_X11, sp, 56
-	ldm		r3, {X0-X9_X11}
-	__strd		X0, X1, sp, 16
-	__strd		X2, X3, sp, 24
-	__strd		X4, X5, sp, 32
-	__strd		X6, X7, sp, 40
-	__strd		X8_X10, X9_X11, sp, 48
-
-	beq		1f
-	_chacha		20
-
-0:	add		sp, #76
-	pop		{r4-r11, pc}
-
-1:	_chacha		12
-	b		0b
-ENDPROC(chacha_doarm)
-
-/*
- * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
- */
-ENTRY(hchacha_block_arm)
-	push		{r1,r4-r11,lr}
-
-	cmp		r2, #12			// ChaCha12 ?
-
-	mov		r14, r0
-	ldmia		r14!, {r0-r11}		// load x0-x11
-	push		{r10-r11}		// store x10-x11 to stack
-	ldm		r14, {r10-r12,r14}	// load x12-x15
-	sub		sp, #8
-
-	beq		1f
-	_chacha_permute	20
-
-	// Skip over (unused0-unused1, x10-x11)
-0:	add		sp, #16
-
-	// Fix up rotations of x12-x15
-	ror		X12, X12, #drot
-	ror		X13, X13, #drot
-	  pop		{r4}			// load 'out'
-	ror		X14, X14, #drot
-	ror		X15, X15, #drot
-
-	// Store (x0-x3,x12-x15) to 'out'
-	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}
-
-	pop		{r4-r11,pc}
-
-1:	_chacha_permute	12
-	b		0b
-ENDPROC(hchacha_block_arm)
diff --git a/arch/arm/crypto/poly1305-armv4.pl b/arch/arm/crypto/poly1305-armv4.pl
deleted file mode 100644
index 6d79498d3115..000000000000
--- a/arch/arm/crypto/poly1305-armv4.pl
+++ /dev/null
@@ -1,1236 +0,0 @@
-#!/usr/bin/env perl
-# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
-#
-# ====================================================================
-# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-# project.
-# ====================================================================
-#
-#			IALU(*)/gcc-4.4		NEON
-#
-# ARM11xx(ARMv6)	7.78/+100%		-
-# Cortex-A5		6.35/+130%		3.00
-# Cortex-A8		6.25/+115%		2.36
-# Cortex-A9		5.10/+95%		2.55
-# Cortex-A15		3.85/+85%		1.25(**)
-# Snapdragon S4		5.70/+100%		1.48(**)
-#
-# (*)	this is for -march=armv6, i.e. with bunch of ldrb loading data;
-# (**)	these are trade-off results, they can be improved by ~8% but at
-#	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
-#	to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
-    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
-    die "can't locate arm-xlate.pl";
-
-    open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
-    open STDOUT,">$output";
-}
-
-($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
-
-$code.=<<___;
-#ifndef	__KERNEL__
-# include "arm_arch.h"
-#else
-# define __ARM_ARCH__ __LINUX_ARM_ARCH__
-# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
-# define poly1305_init   poly1305_init_arm
-# define poly1305_blocks poly1305_blocks_arm
-# define poly1305_emit   poly1305_emit_arm
-.globl	poly1305_blocks_neon
-#endif
-
-#if defined(__thumb2__)
-.syntax	unified
-.thumb
-#else
-.code	32
-#endif
-
-.text
-
-.globl	poly1305_emit
-.globl	poly1305_blocks
-.globl	poly1305_init
-.type	poly1305_init,%function
-.align	5
-poly1305_init:
-.Lpoly1305_init:
-	stmdb	sp!,{r4-r11}
-
-	eor	r3,r3,r3
-	cmp	$inp,#0
-	str	r3,[$ctx,#0]		@ zero hash value
-	str	r3,[$ctx,#4]
-	str	r3,[$ctx,#8]
-	str	r3,[$ctx,#12]
-	str	r3,[$ctx,#16]
-	str	r3,[$ctx,#36]		@ clear is_base2_26
-	add	$ctx,$ctx,#20
-
-#ifdef	__thumb2__
-	it	eq
-#endif
-	moveq	r0,#0
-	beq	.Lno_key
-
-#if	__ARM_MAX_ARCH__>=7
-	mov	r3,#-1
-	str	r3,[$ctx,#28]		@ impossible key power value
-# ifndef __KERNEL__
-	adr	r11,.Lpoly1305_init
-	ldr	r12,.LOPENSSL_armcap
-# endif
-#endif
-	ldrb	r4,[$inp,#0]
-	mov	r10,#0x0fffffff
-	ldrb	r5,[$inp,#1]
-	and	r3,r10,#-4		@ 0x0ffffffc
-	ldrb	r6,[$inp,#2]
-	ldrb	r7,[$inp,#3]
-	orr	r4,r4,r5,lsl#8
-	ldrb	r5,[$inp,#4]
-	orr	r4,r4,r6,lsl#16
-	ldrb	r6,[$inp,#5]
-	orr	r4,r4,r7,lsl#24
-	ldrb	r7,[$inp,#6]
-	and	r4,r4,r10
-
-#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-# if !defined(_WIN32)
-	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
-# endif
-# if defined(__APPLE__) || defined(_WIN32)
-	ldr	r12,[r12]
-# endif
-#endif
-	ldrb	r8,[$inp,#7]
-	orr	r5,r5,r6,lsl#8
-	ldrb	r6,[$inp,#8]
-	orr	r5,r5,r7,lsl#16
-	ldrb	r7,[$inp,#9]
-	orr	r5,r5,r8,lsl#24
-	ldrb	r8,[$inp,#10]
-	and	r5,r5,r3
-
-#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-	tst	r12,#ARMV7_NEON		@ check for NEON
-# ifdef	__thumb2__
-	adr	r9,.Lpoly1305_blocks_neon
-	adr	r11,.Lpoly1305_blocks
-	it	ne
-	movne	r11,r9
-	adr	r12,.Lpoly1305_emit
-	orr	r11,r11,#1		@ thumb-ify addresses
-	orr	r12,r12,#1
-# else
-	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
-	ite	eq
-	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
-	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
-# endif
-#endif
-	ldrb	r9,[$inp,#11]
-	orr	r6,r6,r7,lsl#8
-	ldrb	r7,[$inp,#12]
-	orr	r6,r6,r8,lsl#16
-	ldrb	r8,[$inp,#13]
-	orr	r6,r6,r9,lsl#24
-	ldrb	r9,[$inp,#14]
-	and	r6,r6,r3
-
-	ldrb	r10,[$inp,#15]
-	orr	r7,r7,r8,lsl#8
-	str	r4,[$ctx,#0]
-	orr	r7,r7,r9,lsl#16
-	str	r5,[$ctx,#4]
-	orr	r7,r7,r10,lsl#24
-	str	r6,[$ctx,#8]
-	and	r7,r7,r3
-	str	r7,[$ctx,#12]
-#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
-	stmia	r2,{r11,r12}		@ fill functions table
-	mov	r0,#1
-#else
-	mov	r0,#0
-#endif
-.Lno_key:
-	ldmia	sp!,{r4-r11}
-#if	__ARM_ARCH__>=5
-	ret				@ bx	lr
-#else
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	poly1305_init,.-poly1305_init
-___
-{
-my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
-my ($s1,$s2,$s3)=($r1,$r2,$r3);
-
-$code.=<<___;
-.type	poly1305_blocks,%function
-.align	5
-poly1305_blocks:
-.Lpoly1305_blocks:
-	stmdb	sp!,{r3-r11,lr}
-
-	ands	$len,$len,#-16
-	beq	.Lno_data
-
-	add	$len,$len,$inp		@ end pointer
-	sub	sp,sp,#32
-
-#if __ARM_ARCH__<7
-	ldmia	$ctx,{$h0-$r3}		@ load context
-	add	$ctx,$ctx,#20
-	str	$len,[sp,#16]		@ offload stuff
-	str	$ctx,[sp,#12]
-#else
-	ldr	lr,[$ctx,#36]		@ is_base2_26
-	ldmia	$ctx!,{$h0-$h4}		@ load hash value
-	str	$len,[sp,#16]		@ offload stuff
-	str	$ctx,[sp,#12]
-
-	adds	$r0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
-	mov	$r1,$h1,lsr#6
-	adcs	$r1,$r1,$h2,lsl#20
-	mov	$r2,$h2,lsr#12
-	adcs	$r2,$r2,$h3,lsl#14
-	mov	$r3,$h3,lsr#18
-	adcs	$r3,$r3,$h4,lsl#8
-	mov	$len,#0
-	teq	lr,#0
-	str	$len,[$ctx,#16]		@ clear is_base2_26
-	adc	$len,$len,$h4,lsr#24
-
-	itttt	ne
-	movne	$h0,$r0			@ choose between radixes
-	movne	$h1,$r1
-	movne	$h2,$r2
-	movne	$h3,$r3
-	ldmia	$ctx,{$r0-$r3}		@ load key
-	it	ne
-	movne	$h4,$len
-#endif
-
-	mov	lr,$inp
-	cmp	$padbit,#0
-	str	$r1,[sp,#20]
-	str	$r2,[sp,#24]
-	str	$r3,[sp,#28]
-	b	.Loop
-
-.align	4
-.Loop:
-#if __ARM_ARCH__<7
-	ldrb	r0,[lr],#16		@ load input
-# ifdef	__thumb2__
-	it	hi
-# endif
-	addhi	$h4,$h4,#1		@ 1<<128
-	ldrb	r1,[lr,#-15]
-	ldrb	r2,[lr,#-14]
-	ldrb	r3,[lr,#-13]
-	orr	r1,r0,r1,lsl#8
-	ldrb	r0,[lr,#-12]
-	orr	r2,r1,r2,lsl#16
-	ldrb	r1,[lr,#-11]
-	orr	r3,r2,r3,lsl#24
-	ldrb	r2,[lr,#-10]
-	adds	$h0,$h0,r3		@ accumulate input
-
-	ldrb	r3,[lr,#-9]
-	orr	r1,r0,r1,lsl#8
-	ldrb	r0,[lr,#-8]
-	orr	r2,r1,r2,lsl#16
-	ldrb	r1,[lr,#-7]
-	orr	r3,r2,r3,lsl#24
-	ldrb	r2,[lr,#-6]
-	adcs	$h1,$h1,r3
-
-	ldrb	r3,[lr,#-5]
-	orr	r1,r0,r1,lsl#8
-	ldrb	r0,[lr,#-4]
-	orr	r2,r1,r2,lsl#16
-	ldrb	r1,[lr,#-3]
-	orr	r3,r2,r3,lsl#24
-	ldrb	r2,[lr,#-2]
-	adcs	$h2,$h2,r3
-
-	ldrb	r3,[lr,#-1]
-	orr	r1,r0,r1,lsl#8
-	str	lr,[sp,#8]		@ offload input pointer
-	orr	r2,r1,r2,lsl#16
-	add	$s1,$r1,$r1,lsr#2
-	orr	r3,r2,r3,lsl#24
-#else
-	ldr	r0,[lr],#16		@ load input
-	it	hi
-	addhi	$h4,$h4,#1		@ padbit
-	ldr	r1,[lr,#-12]
-	ldr	r2,[lr,#-8]
-	ldr	r3,[lr,#-4]
-# ifdef	__ARMEB__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-# endif
-	adds	$h0,$h0,r0		@ accumulate input
-	str	lr,[sp,#8]		@ offload input pointer
-	adcs	$h1,$h1,r1
-	add	$s1,$r1,$r1,lsr#2
-	adcs	$h2,$h2,r2
-#endif
-	add	$s2,$r2,$r2,lsr#2
-	adcs	$h3,$h3,r3
-	add	$s3,$r3,$r3,lsr#2
-
-	umull	r2,r3,$h1,$r0
-	 adc	$h4,$h4,#0
-	umull	r0,r1,$h0,$r0
-	umlal	r2,r3,$h4,$s1
-	umlal	r0,r1,$h3,$s1
-	ldr	$r1,[sp,#20]		@ reload $r1
-	umlal	r2,r3,$h2,$s3
-	umlal	r0,r1,$h1,$s3
-	umlal	r2,r3,$h3,$s2
-	umlal	r0,r1,$h2,$s2
-	umlal	r2,r3,$h0,$r1
-	str	r0,[sp,#0]		@ future $h0
-	 mul	r0,$s2,$h4
-	ldr	$r2,[sp,#24]		@ reload $r2
-	adds	r2,r2,r1		@ d1+=d0>>32
-	 eor	r1,r1,r1
-	adc	lr,r3,#0		@ future $h2
-	str	r2,[sp,#4]		@ future $h1
-
-	mul	r2,$s3,$h4
-	eor	r3,r3,r3
-	umlal	r0,r1,$h3,$s3
-	ldr	$r3,[sp,#28]		@ reload $r3
-	umlal	r2,r3,$h3,$r0
-	umlal	r0,r1,$h2,$r0
-	umlal	r2,r3,$h2,$r1
-	umlal	r0,r1,$h1,$r1
-	umlal	r2,r3,$h1,$r2
-	umlal	r0,r1,$h0,$r2
-	umlal	r2,r3,$h0,$r3
-	ldr	$h0,[sp,#0]
-	mul	$h4,$r0,$h4
-	ldr	$h1,[sp,#4]
-
-	adds	$h2,lr,r0		@ d2+=d1>>32
-	ldr	lr,[sp,#8]		@ reload input pointer
-	adc	r1,r1,#0
-	adds	$h3,r2,r1		@ d3+=d2>>32
-	ldr	r0,[sp,#16]		@ reload end pointer
-	adc	r3,r3,#0
-	add	$h4,$h4,r3		@ h4+=d3>>32
-
-	and	r1,$h4,#-4
-	and	$h4,$h4,#3
-	add	r1,r1,r1,lsr#2		@ *=5
-	adds	$h0,$h0,r1
-	adcs	$h1,$h1,#0
-	adcs	$h2,$h2,#0
-	adcs	$h3,$h3,#0
-	adc	$h4,$h4,#0
-
-	cmp	r0,lr			@ done yet?
-	bhi	.Loop
-
-	ldr	$ctx,[sp,#12]
-	add	sp,sp,#32
-	stmdb	$ctx,{$h0-$h4}		@ store the result
-
-.Lno_data:
-#if	__ARM_ARCH__>=5
-	ldmia	sp!,{r3-r11,pc}
-#else
-	ldmia	sp!,{r3-r11,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	poly1305_blocks,.-poly1305_blocks
-___
-}
-{
-my ($ctx,$mac,$nonce)=map("r$_",(0..2));
-my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
-my $g4=$ctx;
-
-$code.=<<___;
-.type	poly1305_emit,%function
-.align	5
-poly1305_emit:
-.Lpoly1305_emit:
-	stmdb	sp!,{r4-r11}
-
-	ldmia	$ctx,{$h0-$h4}
-
-#if __ARM_ARCH__>=7
-	ldr	ip,[$ctx,#36]		@ is_base2_26
-
-	adds	$g0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
-	mov	$g1,$h1,lsr#6
-	adcs	$g1,$g1,$h2,lsl#20
-	mov	$g2,$h2,lsr#12
-	adcs	$g2,$g2,$h3,lsl#14
-	mov	$g3,$h3,lsr#18
-	adcs	$g3,$g3,$h4,lsl#8
-	mov	$g4,#0
-	adc	$g4,$g4,$h4,lsr#24
-
-	tst	ip,ip
-	itttt	ne
-	movne	$h0,$g0
-	movne	$h1,$g1
-	movne	$h2,$g2
-	movne	$h3,$g3
-	it	ne
-	movne	$h4,$g4
-#endif
-
-	adds	$g0,$h0,#5		@ compare to modulus
-	adcs	$g1,$h1,#0
-	adcs	$g2,$h2,#0
-	adcs	$g3,$h3,#0
-	adc	$g4,$h4,#0
-	tst	$g4,#4			@ did it carry/borrow?
-
-#ifdef	__thumb2__
-	it	ne
-#endif
-	movne	$h0,$g0
-	ldr	$g0,[$nonce,#0]
-#ifdef	__thumb2__
-	it	ne
-#endif
-	movne	$h1,$g1
-	ldr	$g1,[$nonce,#4]
-#ifdef	__thumb2__
-	it	ne
-#endif
-	movne	$h2,$g2
-	ldr	$g2,[$nonce,#8]
-#ifdef	__thumb2__
-	it	ne
-#endif
-	movne	$h3,$g3
-	ldr	$g3,[$nonce,#12]
-
-	adds	$h0,$h0,$g0
-	adcs	$h1,$h1,$g1
-	adcs	$h2,$h2,$g2
-	adc	$h3,$h3,$g3
-
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
-	rev	$h0,$h0
-	rev	$h1,$h1
-	rev	$h2,$h2
-	rev	$h3,$h3
-# endif
-	str	$h0,[$mac,#0]
-	str	$h1,[$mac,#4]
-	str	$h2,[$mac,#8]
-	str	$h3,[$mac,#12]
-#else
-	strb	$h0,[$mac,#0]
-	mov	$h0,$h0,lsr#8
-	strb	$h1,[$mac,#4]
-	mov	$h1,$h1,lsr#8
-	strb	$h2,[$mac,#8]
-	mov	$h2,$h2,lsr#8
-	strb	$h3,[$mac,#12]
-	mov	$h3,$h3,lsr#8
-
-	strb	$h0,[$mac,#1]
-	mov	$h0,$h0,lsr#8
-	strb	$h1,[$mac,#5]
-	mov	$h1,$h1,lsr#8
-	strb	$h2,[$mac,#9]
-	mov	$h2,$h2,lsr#8
-	strb	$h3,[$mac,#13]
-	mov	$h3,$h3,lsr#8
-
-	strb	$h0,[$mac,#2]
-	mov	$h0,$h0,lsr#8
-	strb	$h1,[$mac,#6]
-	mov	$h1,$h1,lsr#8
-	strb	$h2,[$mac,#10]
-	mov	$h2,$h2,lsr#8
-	strb	$h3,[$mac,#14]
-	mov	$h3,$h3,lsr#8
-
-	strb	$h0,[$mac,#3]
-	strb	$h1,[$mac,#7]
-	strb	$h2,[$mac,#11]
-	strb	$h3,[$mac,#15]
-#endif
-	ldmia	sp!,{r4-r11}
-#if	__ARM_ARCH__>=5
-	ret				@ bx	lr
-#else
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	poly1305_emit,.-poly1305_emit
-___
-{
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
-my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
-my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
-
-my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
-
-$code.=<<___;
-#if	__ARM_MAX_ARCH__>=7
-.fpu	neon
-
-.type	poly1305_init_neon,%function
-.align	5
-poly1305_init_neon:
-.Lpoly1305_init_neon:
-	ldr	r3,[$ctx,#48]		@ first table element
-	cmp	r3,#-1			@ is value impossible?
-	bne	.Lno_init_neon
-
-	ldr	r4,[$ctx,#20]		@ load key base 2^32
-	ldr	r5,[$ctx,#24]
-	ldr	r6,[$ctx,#28]
-	ldr	r7,[$ctx,#32]
-
-	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
-	mov	r3,r4,lsr#26
-	mov	r4,r5,lsr#20
-	orr	r3,r3,r5,lsl#6
-	mov	r5,r6,lsr#14
-	orr	r4,r4,r6,lsl#12
-	mov	r6,r7,lsr#8
-	orr	r5,r5,r7,lsl#18
-	and	r3,r3,#0x03ffffff
-	and	r4,r4,#0x03ffffff
-	and	r5,r5,#0x03ffffff
-
-	vdup.32	$R0,r2			@ r^1 in both lanes
-	add	r2,r3,r3,lsl#2		@ *5
-	vdup.32	$R1,r3
-	add	r3,r4,r4,lsl#2
-	vdup.32	$S1,r2
-	vdup.32	$R2,r4
-	add	r4,r5,r5,lsl#2
-	vdup.32	$S2,r3
-	vdup.32	$R3,r5
-	add	r5,r6,r6,lsl#2
-	vdup.32	$S3,r4
-	vdup.32	$R4,r6
-	vdup.32	$S4,r5
-
-	mov	$zeros,#2		@ counter
-
-.Lsquare_neon:
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-
-	vmull.u32	$D0,$R0,${R0}[1]
-	vmull.u32	$D1,$R1,${R0}[1]
-	vmull.u32	$D2,$R2,${R0}[1]
-	vmull.u32	$D3,$R3,${R0}[1]
-	vmull.u32	$D4,$R4,${R0}[1]
-
-	vmlal.u32	$D0,$R4,${S1}[1]
-	vmlal.u32	$D1,$R0,${R1}[1]
-	vmlal.u32	$D2,$R1,${R1}[1]
-	vmlal.u32	$D3,$R2,${R1}[1]
-	vmlal.u32	$D4,$R3,${R1}[1]
-
-	vmlal.u32	$D0,$R3,${S2}[1]
-	vmlal.u32	$D1,$R4,${S2}[1]
-	vmlal.u32	$D3,$R1,${R2}[1]
-	vmlal.u32	$D2,$R0,${R2}[1]
-	vmlal.u32	$D4,$R2,${R2}[1]
-
-	vmlal.u32	$D0,$R2,${S3}[1]
-	vmlal.u32	$D3,$R0,${R3}[1]
-	vmlal.u32	$D1,$R3,${S3}[1]
-	vmlal.u32	$D2,$R4,${S3}[1]
-	vmlal.u32	$D4,$R1,${R3}[1]
-
-	vmlal.u32	$D3,$R4,${S4}[1]
-	vmlal.u32	$D0,$R1,${S4}[1]
-	vmlal.u32	$D1,$R2,${S4}[1]
-	vmlal.u32	$D2,$R3,${S4}[1]
-	vmlal.u32	$D4,$R0,${R4}[1]
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-	@ and P. Schwabe
-	@
-	@ H0>>+H1>>+H2>>+H3>>+H4
-	@ H3>>+H4>>*5+H0>>+H1
-	@
-	@ Trivia.
-	@
-	@ Result of multiplication of n-bit number by m-bit number is
-	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
-	@ m-bit number multiplied by 2^n is still n+m bits wide.
-	@
-	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
-	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
-	@ one is n+1 bits wide.
-	@
-	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
-	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
-	@ can be 27. However! In cases when their width exceeds 26 bits
-	@ they are limited by 2^26+2^6. This in turn means that *sum*
-	@ of the products with these values can still be viewed as sum
-	@ of 52-bit numbers as long as the amount of addends is not a
-	@ power of 2. For example,
-	@
-	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
-	@
-	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
-	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
-	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
-	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
-	@ which is less than 32 * (2^52) or 2^57. And when processing
-	@ data we are looking at triple as many addends...
-	@
-	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
-	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
-	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
-	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
-	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
-	@ This means that result of reduction have to be compressed upon
-	@ loop wrap-around. This can be done in the process of reduction
-	@ to minimize amount of instructions [as well as amount of
-	@ 128-bit instructions, which benefits low-end processors], but
-	@ one has to watch for H2 (which is narrower than H0) and 5*H4
-	@ not being wider than 58 bits, so that result of right shift
-	@ by 26 bits fits in 32 bits. This is also useful on x86,
-	@ because it allows to use paddd in place for paddq, which
-	@ benefits Atom, where paddq is ridiculously slow.
-
-	vshr.u64	$T0,$D3,#26
-	vmovn.i64	$D3#lo,$D3
-	 vshr.u64	$T1,$D0,#26
-	 vmovn.i64	$D0#lo,$D0
-	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
-	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
-	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
-	 vbic.i32	$D0#lo,#0xfc000000
-
-	vshrn.u64	$T0#lo,$D4,#26
-	vmovn.i64	$D4#lo,$D4
-	 vshr.u64	$T1,$D1,#26
-	 vmovn.i64	$D1#lo,$D1
-	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
-	vbic.i32	$D4#lo,#0xfc000000
-	 vbic.i32	$D1#lo,#0xfc000000
-
-	vadd.i32	$D0#lo,$D0#lo,$T0#lo
-	vshl.u32	$T0#lo,$T0#lo,#2
-	 vshrn.u64	$T1#lo,$D2,#26
-	 vmovn.i64	$D2#lo,$D2
-	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
-	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
-	 vbic.i32	$D2#lo,#0xfc000000
-
-	vshr.u32	$T0#lo,$D0#lo,#26
-	vbic.i32	$D0#lo,#0xfc000000
-	 vshr.u32	$T1#lo,$D3#lo,#26
-	 vbic.i32	$D3#lo,#0xfc000000
-	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
-	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
-
-	subs		$zeros,$zeros,#1
-	beq		.Lsquare_break_neon
-
-	add		$tbl0,$ctx,#(48+0*9*4)
-	add		$tbl1,$ctx,#(48+1*9*4)
-
-	vtrn.32		$R0,$D0#lo		@ r^2:r^1
-	vtrn.32		$R2,$D2#lo
-	vtrn.32		$R3,$D3#lo
-	vtrn.32		$R1,$D1#lo
-	vtrn.32		$R4,$D4#lo
-
-	vshl.u32	$S2,$R2,#2		@ *5
-	vshl.u32	$S3,$R3,#2
-	vshl.u32	$S1,$R1,#2
-	vshl.u32	$S4,$R4,#2
-	vadd.i32	$S2,$S2,$R2
-	vadd.i32	$S1,$S1,$R1
-	vadd.i32	$S3,$S3,$R3
-	vadd.i32	$S4,$S4,$R4
-
-	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
-	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
-	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-	vst1.32		{${S4}[0]},[$tbl0,:32]
-	vst1.32		{${S4}[1]},[$tbl1,:32]
-
-	b		.Lsquare_neon
-
-.align	4
-.Lsquare_break_neon:
-	add		$tbl0,$ctx,#(48+2*4*9)
-	add		$tbl1,$ctx,#(48+3*4*9)
-
-	vmov		$R0,$D0#lo		@ r^4:r^3
-	vshl.u32	$S1,$D1#lo,#2		@ *5
-	vmov		$R1,$D1#lo
-	vshl.u32	$S2,$D2#lo,#2
-	vmov		$R2,$D2#lo
-	vshl.u32	$S3,$D3#lo,#2
-	vmov		$R3,$D3#lo
-	vshl.u32	$S4,$D4#lo,#2
-	vmov		$R4,$D4#lo
-	vadd.i32	$S1,$S1,$D1#lo
-	vadd.i32	$S2,$S2,$D2#lo
-	vadd.i32	$S3,$S3,$D3#lo
-	vadd.i32	$S4,$S4,$D4#lo
-
-	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
-	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
-	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-	vst1.32		{${S4}[0]},[$tbl0]
-	vst1.32		{${S4}[1]},[$tbl1]
-
-.Lno_init_neon:
-	ret				@ bx	lr
-.size	poly1305_init_neon,.-poly1305_init_neon
-
-.type	poly1305_blocks_neon,%function
-.align	5
-poly1305_blocks_neon:
-.Lpoly1305_blocks_neon:
-	ldr	ip,[$ctx,#36]		@ is_base2_26
-
-	cmp	$len,#64
-	blo	.Lpoly1305_blocks
-
-	stmdb	sp!,{r4-r7}
-	vstmdb	sp!,{d8-d15}		@ ABI specification says so
-
-	tst	ip,ip			@ is_base2_26?
-	bne	.Lbase2_26_neon
-
-	stmdb	sp!,{r1-r3,lr}
-	bl	.Lpoly1305_init_neon
-
-	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
-	ldr	r5,[$ctx,#4]
-	ldr	r6,[$ctx,#8]
-	ldr	r7,[$ctx,#12]
-	ldr	ip,[$ctx,#16]
-
-	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
-	mov	r3,r4,lsr#26
-	 veor	$D0#lo,$D0#lo,$D0#lo
-	mov	r4,r5,lsr#20
-	orr	r3,r3,r5,lsl#6
-	 veor	$D1#lo,$D1#lo,$D1#lo
-	mov	r5,r6,lsr#14
-	orr	r4,r4,r6,lsl#12
-	 veor	$D2#lo,$D2#lo,$D2#lo
-	mov	r6,r7,lsr#8
-	orr	r5,r5,r7,lsl#18
-	 veor	$D3#lo,$D3#lo,$D3#lo
-	and	r3,r3,#0x03ffffff
-	orr	r6,r6,ip,lsl#24
-	 veor	$D4#lo,$D4#lo,$D4#lo
-	and	r4,r4,#0x03ffffff
-	mov	r1,#1
-	and	r5,r5,#0x03ffffff
-	str	r1,[$ctx,#36]		@ set is_base2_26
-
-	vmov.32	$D0#lo[0],r2
-	vmov.32	$D1#lo[0],r3
-	vmov.32	$D2#lo[0],r4
-	vmov.32	$D3#lo[0],r5
-	vmov.32	$D4#lo[0],r6
-	adr	$zeros,.Lzeros
-
-	ldmia	sp!,{r1-r3,lr}
-	b	.Lhash_loaded
-
-.align	4
-.Lbase2_26_neon:
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ load hash value
-
-	veor		$D0#lo,$D0#lo,$D0#lo
-	veor		$D1#lo,$D1#lo,$D1#lo
-	veor		$D2#lo,$D2#lo,$D2#lo
-	veor		$D3#lo,$D3#lo,$D3#lo
-	veor		$D4#lo,$D4#lo,$D4#lo
-	vld4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
-	adr		$zeros,.Lzeros
-	vld1.32		{$D4#lo[0]},[$ctx]
-	sub		$ctx,$ctx,#16		@ rewind
-
-.Lhash_loaded:
-	add		$in2,$inp,#32
-	mov		$padbit,$padbit,lsl#24
-	tst		$len,#31
-	beq		.Leven
-
-	vld4.32		{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
-	vmov.32		$H4#lo[0],$padbit
-	sub		$len,$len,#16
-	add		$in2,$inp,#32
-
-# ifdef	__ARMEB__
-	vrev32.8	$H0,$H0
-	vrev32.8	$H3,$H3
-	vrev32.8	$H1,$H1
-	vrev32.8	$H2,$H2
-# endif
-	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
-	vshl.u32	$H3#lo,$H3#lo,#18
-
-	vsri.u32	$H3#lo,$H2#lo,#14
-	vshl.u32	$H2#lo,$H2#lo,#12
-	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi
-
-	vbic.i32	$H3#lo,#0xfc000000
-	vsri.u32	$H2#lo,$H1#lo,#20
-	vshl.u32	$H1#lo,$H1#lo,#6
-
-	vbic.i32	$H2#lo,#0xfc000000
-	vsri.u32	$H1#lo,$H0#lo,#26
-	vadd.i32	$H3#hi,$H3#lo,$D3#lo
-
-	vbic.i32	$H0#lo,#0xfc000000
-	vbic.i32	$H1#lo,#0xfc000000
-	vadd.i32	$H2#hi,$H2#lo,$D2#lo
-
-	vadd.i32	$H0#hi,$H0#lo,$D0#lo
-	vadd.i32	$H1#hi,$H1#lo,$D1#lo
-
-	mov		$tbl1,$zeros
-	add		$tbl0,$ctx,#48
-
-	cmp		$len,$len
-	b		.Long_tail
-
-.align	4
-.Leven:
-	subs		$len,$len,#64
-	it		lo
-	movlo		$in2,$zeros
-
-	vmov.i32	$H4,#1<<24		@ padbit, yes, always
-	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
-	add		$inp,$inp,#64
-	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
-	add		$in2,$in2,#64
-	itt		hi
-	addhi		$tbl1,$ctx,#(48+1*9*4)
-	addhi		$tbl0,$ctx,#(48+3*9*4)
-
-# ifdef	__ARMEB__
-	vrev32.8	$H0,$H0
-	vrev32.8	$H3,$H3
-	vrev32.8	$H1,$H1
-	vrev32.8	$H2,$H2
-# endif
-	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
-	vshl.u32	$H3,$H3,#18
-
-	vsri.u32	$H3,$H2,#14
-	vshl.u32	$H2,$H2,#12
-
-	vbic.i32	$H3,#0xfc000000
-	vsri.u32	$H2,$H1,#20
-	vshl.u32	$H1,$H1,#6
-
-	vbic.i32	$H2,#0xfc000000
-	vsri.u32	$H1,$H0,#26
-
-	vbic.i32	$H0,#0xfc000000
-	vbic.i32	$H1,#0xfc000000
-
-	bls		.Lskip_loop
-
-	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
-	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
-	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-	b		.Loop_neon
-
-.align	5
-.Loop_neon:
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
-	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
-	@   \___________________/
-	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
-	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
-	@   \___________________/ \____________________/
-	@
-	@ Note that we start with inp[2:3]*r^2. This is because it
-	@ doesn't depend on reduction in previous iteration.
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ inp[2:3]*r^2
-
-	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
-	vmull.u32	$D2,$H2#hi,${R0}[1]
-	vadd.i32	$H0#lo,$H0#lo,$D0#lo
-	vmull.u32	$D0,$H0#hi,${R0}[1]
-	vadd.i32	$H3#lo,$H3#lo,$D3#lo
-	vmull.u32	$D3,$H3#hi,${R0}[1]
-	vmlal.u32	$D2,$H1#hi,${R1}[1]
-	vadd.i32	$H1#lo,$H1#lo,$D1#lo
-	vmull.u32	$D1,$H1#hi,${R0}[1]
-
-	vadd.i32	$H4#lo,$H4#lo,$D4#lo
-	vmull.u32	$D4,$H4#hi,${R0}[1]
-	subs		$len,$len,#64
-	vmlal.u32	$D0,$H4#hi,${S1}[1]
-	it		lo
-	movlo		$in2,$zeros
-	vmlal.u32	$D3,$H2#hi,${R1}[1]
-	vld1.32		${S4}[1],[$tbl1,:32]
-	vmlal.u32	$D1,$H0#hi,${R1}[1]
-	vmlal.u32	$D4,$H3#hi,${R1}[1]
-
-	vmlal.u32	$D0,$H3#hi,${S2}[1]
-	vmlal.u32	$D3,$H1#hi,${R2}[1]
-	vmlal.u32	$D4,$H2#hi,${R2}[1]
-	vmlal.u32	$D1,$H4#hi,${S2}[1]
-	vmlal.u32	$D2,$H0#hi,${R2}[1]
-
-	vmlal.u32	$D3,$H0#hi,${R3}[1]
-	vmlal.u32	$D0,$H2#hi,${S3}[1]
-	vmlal.u32	$D4,$H1#hi,${R3}[1]
-	vmlal.u32	$D1,$H3#hi,${S3}[1]
-	vmlal.u32	$D2,$H4#hi,${S3}[1]
-
-	vmlal.u32	$D3,$H4#hi,${S4}[1]
-	vmlal.u32	$D0,$H1#hi,${S4}[1]
-	vmlal.u32	$D4,$H0#hi,${R4}[1]
-	vmlal.u32	$D1,$H2#hi,${S4}[1]
-	vmlal.u32	$D2,$H3#hi,${S4}[1]
-
-	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
-	add		$in2,$in2,#64
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ (hash+inp[0:1])*r^4 and accumulate
-
-	vmlal.u32	$D3,$H3#lo,${R0}[0]
-	vmlal.u32	$D0,$H0#lo,${R0}[0]
-	vmlal.u32	$D4,$H4#lo,${R0}[0]
-	vmlal.u32	$D1,$H1#lo,${R0}[0]
-	vmlal.u32	$D2,$H2#lo,${R0}[0]
-	vld1.32		${S4}[0],[$tbl0,:32]
-
-	vmlal.u32	$D3,$H2#lo,${R1}[0]
-	vmlal.u32	$D0,$H4#lo,${S1}[0]
-	vmlal.u32	$D4,$H3#lo,${R1}[0]
-	vmlal.u32	$D1,$H0#lo,${R1}[0]
-	vmlal.u32	$D2,$H1#lo,${R1}[0]
-
-	vmlal.u32	$D3,$H1#lo,${R2}[0]
-	vmlal.u32	$D0,$H3#lo,${S2}[0]
-	vmlal.u32	$D4,$H2#lo,${R2}[0]
-	vmlal.u32	$D1,$H4#lo,${S2}[0]
-	vmlal.u32	$D2,$H0#lo,${R2}[0]
-
-	vmlal.u32	$D3,$H0#lo,${R3}[0]
-	vmlal.u32	$D0,$H2#lo,${S3}[0]
-	vmlal.u32	$D4,$H1#lo,${R3}[0]
-	vmlal.u32	$D1,$H3#lo,${S3}[0]
-	vmlal.u32	$D3,$H4#lo,${S4}[0]
-
-	vmlal.u32	$D2,$H4#lo,${S3}[0]
-	vmlal.u32	$D0,$H1#lo,${S4}[0]
-	vmlal.u32	$D4,$H0#lo,${R4}[0]
-	vmov.i32	$H4,#1<<24		@ padbit, yes, always
-	vmlal.u32	$D1,$H2#lo,${S4}[0]
-	vmlal.u32	$D2,$H3#lo,${S4}[0]
-
-	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
-	add		$inp,$inp,#64
-# ifdef	__ARMEB__
-	vrev32.8	$H0,$H0
-	vrev32.8	$H1,$H1
-	vrev32.8	$H2,$H2
-	vrev32.8	$H3,$H3
-# endif
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
-	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
-
-	vshr.u64	$T0,$D3,#26
-	vmovn.i64	$D3#lo,$D3
-	 vshr.u64	$T1,$D0,#26
-	 vmovn.i64	$D0#lo,$D0
-	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
-	vbic.i32	$D3#lo,#0xfc000000
-	  vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
-	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
-	  vshl.u32	$H3,$H3,#18
-	 vbic.i32	$D0#lo,#0xfc000000
-
-	vshrn.u64	$T0#lo,$D4,#26
-	vmovn.i64	$D4#lo,$D4
-	 vshr.u64	$T1,$D1,#26
-	 vmovn.i64	$D1#lo,$D1
-	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
-	  vsri.u32	$H3,$H2,#14
-	vbic.i32	$D4#lo,#0xfc000000
-	  vshl.u32	$H2,$H2,#12
-	 vbic.i32	$D1#lo,#0xfc000000
-
-	vadd.i32	$D0#lo,$D0#lo,$T0#lo
-	vshl.u32	$T0#lo,$T0#lo,#2
-	  vbic.i32	$H3,#0xfc000000
-	 vshrn.u64	$T1#lo,$D2,#26
-	 vmovn.i64	$D2#lo,$D2
-	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
-	  vsri.u32	$H2,$H1,#20
-	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
-	  vshl.u32	$H1,$H1,#6
-	 vbic.i32	$D2#lo,#0xfc000000
-	  vbic.i32	$H2,#0xfc000000
-
-	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
-	vmovn.i64	$D0#lo,$D0
-	  vsri.u32	$H1,$H0,#26
-	  vbic.i32	$H0,#0xfc000000
-	 vshr.u32	$T1#lo,$D3#lo,#26
-	 vbic.i32	$D3#lo,#0xfc000000
-	vbic.i32	$D0#lo,#0xfc000000
-	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
-	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
-	  vbic.i32	$H1,#0xfc000000
-
-	bhi		.Loop_neon
-
-.Lskip_loop:
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
-	add		$tbl1,$ctx,#(48+0*9*4)
-	add		$tbl0,$ctx,#(48+1*9*4)
-	adds		$len,$len,#32
-	it		ne
-	movne		$len,#0
-	bne		.Long_tail
-
-	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
-	vadd.i32	$H0#hi,$H0#lo,$D0#lo
-	vadd.i32	$H3#hi,$H3#lo,$D3#lo
-	vadd.i32	$H1#hi,$H1#lo,$D1#lo
-	vadd.i32	$H4#hi,$H4#lo,$D4#lo
-
-.Long_tail:
-	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
-	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2
-
-	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
-	vmull.u32	$D2,$H2#hi,$R0
-	vadd.i32	$H0#lo,$H0#lo,$D0#lo
-	vmull.u32	$D0,$H0#hi,$R0
-	vadd.i32	$H3#lo,$H3#lo,$D3#lo
-	vmull.u32	$D3,$H3#hi,$R0
-	vadd.i32	$H1#lo,$H1#lo,$D1#lo
-	vmull.u32	$D1,$H1#hi,$R0
-	vadd.i32	$H4#lo,$H4#lo,$D4#lo
-	vmull.u32	$D4,$H4#hi,$R0
-
-	vmlal.u32	$D0,$H4#hi,$S1
-	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-	vmlal.u32	$D3,$H2#hi,$R1
-	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-	vmlal.u32	$D1,$H0#hi,$R1
-	vmlal.u32	$D4,$H3#hi,$R1
-	vmlal.u32	$D2,$H1#hi,$R1
-
-	vmlal.u32	$D3,$H1#hi,$R2
-	vld1.32		${S4}[1],[$tbl1,:32]
-	vmlal.u32	$D0,$H3#hi,$S2
-	vld1.32		${S4}[0],[$tbl0,:32]
-	vmlal.u32	$D4,$H2#hi,$R2
-	vmlal.u32	$D1,$H4#hi,$S2
-	vmlal.u32	$D2,$H0#hi,$R2
-
-	vmlal.u32	$D3,$H0#hi,$R3
-	 it		ne
-	 addne		$tbl1,$ctx,#(48+2*9*4)
-	vmlal.u32	$D0,$H2#hi,$S3
-	 it		ne
-	 addne		$tbl0,$ctx,#(48+3*9*4)
-	vmlal.u32	$D4,$H1#hi,$R3
-	vmlal.u32	$D1,$H3#hi,$S3
-	vmlal.u32	$D2,$H4#hi,$S3
-
-	vmlal.u32	$D3,$H4#hi,$S4
-	 vorn		$MASK,$MASK,$MASK	@ all-ones, can be redundant
-	vmlal.u32	$D0,$H1#hi,$S4
-	 vshr.u64	$MASK,$MASK,#38
-	vmlal.u32	$D4,$H0#hi,$R4
-	vmlal.u32	$D1,$H2#hi,$S4
-	vmlal.u32	$D2,$H3#hi,$S4
-
-	beq		.Lshort_tail
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ (hash+inp[0:1])*r^4:r^3 and accumulate
-
-	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
-	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
-
-	vmlal.u32	$D2,$H2#lo,$R0
-	vmlal.u32	$D0,$H0#lo,$R0
-	vmlal.u32	$D3,$H3#lo,$R0
-	vmlal.u32	$D1,$H1#lo,$R0
-	vmlal.u32	$D4,$H4#lo,$R0
-
-	vmlal.u32	$D0,$H4#lo,$S1
-	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-	vmlal.u32	$D3,$H2#lo,$R1
-	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-	vmlal.u32	$D1,$H0#lo,$R1
-	vmlal.u32	$D4,$H3#lo,$R1
-	vmlal.u32	$D2,$H1#lo,$R1
-
-	vmlal.u32	$D3,$H1#lo,$R2
-	vld1.32		${S4}[1],[$tbl1,:32]
-	vmlal.u32	$D0,$H3#lo,$S2
-	vld1.32		${S4}[0],[$tbl0,:32]
-	vmlal.u32	$D4,$H2#lo,$R2
-	vmlal.u32	$D1,$H4#lo,$S2
-	vmlal.u32	$D2,$H0#lo,$R2
-
-	vmlal.u32	$D3,$H0#lo,$R3
-	vmlal.u32	$D0,$H2#lo,$S3
-	vmlal.u32	$D4,$H1#lo,$R3
-	vmlal.u32	$D1,$H3#lo,$S3
-	vmlal.u32	$D2,$H4#lo,$S3
-
-	vmlal.u32	$D3,$H4#lo,$S4
-	 vorn		$MASK,$MASK,$MASK	@ all-ones
-	vmlal.u32	$D0,$H1#lo,$S4
-	 vshr.u64	$MASK,$MASK,#38
-	vmlal.u32	$D4,$H0#lo,$R4
-	vmlal.u32	$D1,$H2#lo,$S4
-	vmlal.u32	$D2,$H3#lo,$S4
-
-.Lshort_tail:
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ horizontal addition
-
-	vadd.i64	$D3#lo,$D3#lo,$D3#hi
-	vadd.i64	$D0#lo,$D0#lo,$D0#hi
-	vadd.i64	$D4#lo,$D4#lo,$D4#hi
-	vadd.i64	$D1#lo,$D1#lo,$D1#hi
-	vadd.i64	$D2#lo,$D2#lo,$D2#hi
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ lazy reduction, but without narrowing
-
-	vshr.u64	$T0,$D3,#26
-	vand.i64	$D3,$D3,$MASK
-	 vshr.u64	$T1,$D0,#26
-	 vand.i64	$D0,$D0,$MASK
-	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
-	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
-
-	vshr.u64	$T0,$D4,#26
-	vand.i64	$D4,$D4,$MASK
-	 vshr.u64	$T1,$D1,#26
-	 vand.i64	$D1,$D1,$MASK
-	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
-
-	vadd.i64	$D0,$D0,$T0
-	vshl.u64	$T0,$T0,#2
-	 vshr.u64	$T1,$D2,#26
-	 vand.i64	$D2,$D2,$MASK
-	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
-	 vadd.i64	$D3,$D3,$T1		@ h2 -> h3
-
-	vshr.u64	$T0,$D0,#26
-	vand.i64	$D0,$D0,$MASK
-	 vshr.u64	$T1,$D3,#26
-	 vand.i64	$D3,$D3,$MASK
-	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
-	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4
-
-	cmp		$len,#0
-	bne		.Leven
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ store hash value
-
-	vst4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
-	vst1.32		{$D4#lo[0]},[$ctx]
-
-	vldmia	sp!,{d8-d15}			@ epilogue
-	ldmia	sp!,{r4-r7}
-	ret					@ bx	lr
-.size	poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.align	5
-.Lzeros:
-.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-#ifndef	__KERNEL__
-.LOPENSSL_armcap:
-# ifdef	_WIN32
-.word	OPENSSL_armcap_P
-# else
-.word	OPENSSL_armcap_P-.Lpoly1305_init
-# endif
-.comm	OPENSSL_armcap_P,4,4
-.hidden	OPENSSL_armcap_P
-#endif
-#endif
-___
-}	}
-$code.=<<___;
-.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
-.align	2
-___
-
-foreach (split("\n",$code)) {
-	s/\`([^\`]*)\`/eval $1/geo;
-
-	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
-	s/\bret\b/bx	lr/go						or
-	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
-
-	print $_,"\n";
-}
-close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/poly1305-glue.c b/arch/arm/crypto/poly1305-glue.c
deleted file mode 100644
index 42d0ebde1ae1..000000000000
--- a/arch/arm/crypto/poly1305-glue.c
+++ /dev/null
@@ -1,125 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
- *
- * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/poly1305.h>
-#include <crypto/internal/simd.h>
-#include <linux/cpufeature.h>
-#include <linux/jump_label.h>
-#include <linux/module.h>
-#include <linux/unaligned.h>
-
-void poly1305_init_arm(void *state, const u8 *key);
-void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
-void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
-void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
-
-void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
-{
-}
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
-
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
-{
-	poly1305_init_arm(&dctx->h, key);
-	dctx->s[0] = get_unaligned_le32(key + 16);
-	dctx->s[1] = get_unaligned_le32(key + 20);
-	dctx->s[2] = get_unaligned_le32(key + 24);
-	dctx->s[3] = get_unaligned_le32(key + 28);
-	dctx->buflen = 0;
-}
-EXPORT_SYMBOL(poly1305_init_arch);
-
-void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
-			  unsigned int nbytes)
-{
-	bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-		       crypto_simd_usable();
-
-	if (unlikely(dctx->buflen)) {
-		u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
-
-		memcpy(dctx->buf + dctx->buflen, src, bytes);
-		src += bytes;
-		nbytes -= bytes;
-		dctx->buflen += bytes;
-
-		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-			poly1305_blocks_arm(&dctx->h, dctx->buf,
-					    POLY1305_BLOCK_SIZE, 1);
-			dctx->buflen = 0;
-		}
-	}
-
-	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
-		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
-
-		if (static_branch_likely(&have_neon) && do_neon) {
-			do {
-				unsigned int todo = min_t(unsigned int, len, SZ_4K);
-
-				kernel_neon_begin();
-				poly1305_blocks_neon(&dctx->h, src, todo, 1);
-				kernel_neon_end();
-
-				len -= todo;
-				src += todo;
-			} while (len);
-		} else {
-			poly1305_blocks_arm(&dctx->h, src, len, 1);
-			src += len;
-		}
-		nbytes %= POLY1305_BLOCK_SIZE;
-	}
-
-	if (unlikely(nbytes)) {
-		dctx->buflen = nbytes;
-		memcpy(dctx->buf, src, nbytes);
-	}
-}
-EXPORT_SYMBOL(poly1305_update_arch);
-
-void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
-{
-	if (unlikely(dctx->buflen)) {
-		dctx->buf[dctx->buflen++] = 1;
-		memset(dctx->buf + dctx->buflen, 0,
-		       POLY1305_BLOCK_SIZE - dctx->buflen);
-		poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
-	}
-
-	poly1305_emit_arm(&dctx->h, dst, dctx->s);
-	*dctx = (struct poly1305_desc_ctx){};
-}
-EXPORT_SYMBOL(poly1305_final_arch);
-
-bool poly1305_is_arch_optimized(void)
-{
-	/* We always can use at least the ARM scalar implementation. */
-	return true;
-}
-EXPORT_SYMBOL(poly1305_is_arch_optimized);
-
-static int __init arm_poly1305_mod_init(void)
-{
-	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
-	    (elf_hwcap & HWCAP_NEON))
-		static_branch_enable(&have_neon);
-	return 0;
-}
-arch_initcall(arm_poly1305_mod_init);
-
-static void __exit arm_poly1305_mod_exit(void)
-{
-}
-module_exit(arm_poly1305_mod_exit);
-
-MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
-MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 007874320937..39ffabd3cf2a 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -5,6 +5,8 @@
 # Copyright (C) 1995-2000 Russell King
 #
 
+obj-y += crypto/
+
 lib-y		:= changebit.o csumipv6.o csumpartial.o               \
 		   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
 		   delay.o delay-loop.o findbit.o memchr.o memcpy.o   \
diff --git a/arch/arm/lib/crypto/.gitignore b/arch/arm/lib/crypto/.gitignore
new file mode 100644
index 000000000000..0d47d4f21c6d
--- /dev/null
+++ b/arch/arm/lib/crypto/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+poly1305-core.S
diff --git a/arch/arm/lib/crypto/Kconfig b/arch/arm/lib/crypto/Kconfig
new file mode 100644
index 000000000000..181f138d563b
--- /dev/null
+++ b/arch/arm/lib/crypto/Kconfig
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config CRYPTO_BLAKE2S_ARM
+	bool "Hash functions: BLAKE2s"
+	select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+	help
+	  BLAKE2s cryptographic hash function (RFC 7693)
+
+	  Architecture: arm
+
+	  This is faster than the generic implementations of BLAKE2s and
+	  BLAKE2b, but slower than the NEON implementation of BLAKE2b.
+	  There is no NEON implementation of BLAKE2s, since NEON doesn't
+	  really help with it.
+
+config CRYPTO_CHACHA20_NEON
+	tristate
+	default CRYPTO_LIB_CHACHA_INTERNAL
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_ARM
+	tristate
+	default CRYPTO_LIB_POLY1305_INTERNAL
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305
diff --git a/arch/arm/lib/crypto/Makefile b/arch/arm/lib/crypto/Makefile
new file mode 100644
index 000000000000..4c042a4c77ed
--- /dev/null
+++ b/arch/arm/lib/crypto/Makefile
@@ -0,0 +1,26 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
+libblake2s-arm-y := blake2s-core.o blake2s-glue.o
+
+obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+chacha-neon-y := chacha-scalar-core.o chacha-glue.o
+chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+
+obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
+poly1305-arm-y := poly1305-core.o poly1305-glue.o
+
+quiet_cmd_perl = PERL    $@
+      cmd_perl = $(PERL) $(<) > $(@)
+
+$(obj)/%-core.S: $(src)/%-armv4.pl
+	$(call cmd,perl)
+
+clean-files += poly1305-core.S
+
+aflags-thumb2-$(CONFIG_THUMB2_KERNEL)  := -U__thumb2__ -D__thumb2__=1
+
+# massage the perlasm code a bit so we only get the NEON routine if we need it
+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
+AFLAGS_poly1305-core.o += $(poly1305-aflags-y) $(aflags-thumb2-y)
diff --git a/arch/arm/lib/crypto/blake2s-core.S b/arch/arm/lib/crypto/blake2s-core.S
new file mode 100644
index 000000000000..df40e46601f1
--- /dev/null
+++ b/arch/arm/lib/crypto/blake2s-core.S
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * BLAKE2s digest algorithm, ARM scalar implementation
+ *
+ * Copyright 2020 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	// Registers used to hold message words temporarily.  There aren't
+	// enough ARM registers to hold the whole message block, so we have to
+	// load the words on-demand.
+	M_0		.req	r12
+	M_1		.req	r14
+
+// The BLAKE2s initialization vector
+.Lblake2s_IV:
+	.word	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+	.word	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+
+.macro __ldrd		a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+	ldrd		\a, \b, [\src, #\offset]
+#else
+	ldr		\a, [\src, #\offset]
+	ldr		\b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd		a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+	strd		\a, \b, [\dst, #\offset]
+#else
+	str		\a, [\dst, #\offset]
+	str		\b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _le32_bswap	a, tmp
+#ifdef __ARMEB__
+	rev_l		\a, \tmp
+#endif
+.endm
+
+.macro _le32_bswap_8x	a, b, c, d, e, f, g, h,  tmp
+	_le32_bswap	\a, \tmp
+	_le32_bswap	\b, \tmp
+	_le32_bswap	\c, \tmp
+	_le32_bswap	\d, \tmp
+	_le32_bswap	\e, \tmp
+	_le32_bswap	\f, \tmp
+	_le32_bswap	\g, \tmp
+	_le32_bswap	\h, \tmp
+.endm
+
+// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
+// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
+// columns/diagonals.  s0-s1 are the word offsets to the message words the first
+// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
+// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
+//
+// Note that to save instructions, the rotations don't happen when the
+// pseudocode says they should, but rather they are delayed until the values are
+// used.  See the comment above _blake2s_round().
+.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3
+
+	ldr		M_0, [sp, #32 + 4 * \s0]
+	ldr		M_1, [sp, #32 + 4 * \s2]
+
+	// a += b + m[blake2s_sigma[r][2*i + 0]];
+	add		\a0, \a0, \b0, ror #brot
+	add		\a1, \a1, \b1, ror #brot
+	add		\a0, \a0, M_0
+	add		\a1, \a1, M_1
+
+	// d = ror32(d ^ a, 16);
+	eor		\d0, \a0, \d0, ror #drot
+	eor		\d1, \a1, \d1, ror #drot
+
+	// c += d;
+	add		\c0, \c0, \d0, ror #16
+	add		\c1, \c1, \d1, ror #16
+
+	// b = ror32(b ^ c, 12);
+	eor		\b0, \c0, \b0, ror #brot
+	eor		\b1, \c1, \b1, ror #brot
+
+	ldr		M_0, [sp, #32 + 4 * \s1]
+	ldr		M_1, [sp, #32 + 4 * \s3]
+
+	// a += b + m[blake2s_sigma[r][2*i + 1]];
+	add		\a0, \a0, \b0, ror #12
+	add		\a1, \a1, \b1, ror #12
+	add		\a0, \a0, M_0
+	add		\a1, \a1, M_1
+
+	// d = ror32(d ^ a, 8);
+	eor		\d0, \a0, \d0, ror#16
+	eor		\d1, \a1, \d1, ror#16
+
+	// c += d;
+	add		\c0, \c0, \d0, ror#8
+	add		\c1, \c1, \d1, ror#8
+
+	// b = ror32(b ^ c, 7);
+	eor		\b0, \c0, \b0, ror#12
+	eor		\b1, \c1, \b1, ror#12
+.endm
+
+// Execute one round of BLAKE2s by updating the state matrix v[0..15].  v[0..9]
+// are in r0..r9.  The stack pointer points to 8 bytes of scratch space for
+// spilling v[8..9], then to v[9..15], then to the message block.  r10-r12 and
+// r14 are free to use.  The macro arguments s0-s15 give the order in which the
+// message words are used in this round.
+//
+// All rotates are performed using the implicit rotate operand accepted by the
+// 'add' and 'eor' instructions.  This is faster than using explicit rotate
+// instructions.  To make this work, we allow the values in the second and last
+// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
+// wrong rotation amount.  The rotation amount is then fixed up just in time
+// when the values are used.  'brot' is the number of bits the values in row 'b'
+// need to be rotated right to arrive at the correct values, and 'drot'
+// similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
+// that they end up as (7, 8) after every round.
+.macro	_blake2s_round	s0, s1, s2, s3, s4, s5, s6, s7, \
+			s8, s9, s10, s11, s12, s13, s14, s15
+
+	// Mix first two columns:
+	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
+	__ldrd		r10, r11, sp, 16	// load v[12] and v[13]
+	_blake2s_quarterround	r0, r4, r8, r10,  r1, r5, r9, r11, \
+				\s0, \s1, \s2, \s3
+	__strd		r8, r9, sp, 0
+	__strd		r10, r11, sp, 16
+
+	// Mix second two columns:
+	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
+	__ldrd		r8, r9, sp, 8		// load v[10] and v[11]
+	__ldrd		r10, r11, sp, 24	// load v[14] and v[15]
+	_blake2s_quarterround	r2, r6, r8, r10,  r3, r7, r9, r11, \
+				\s4, \s5, \s6, \s7
+	str		r10, [sp, #24]		// store v[14]
+	// v[10], v[11], and v[15] are used below, so no need to store them yet.
+
+	.set brot, 7
+	.set drot, 8
+
+	// Mix first two diagonals:
+	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
+	ldr		r10, [sp, #16]		// load v[12]
+	_blake2s_quarterround	r0, r5, r8, r11,  r1, r6, r9, r10, \
+				\s8, \s9, \s10, \s11
+	__strd		r8, r9, sp, 8
+	str		r11, [sp, #28]
+	str		r10, [sp, #16]
+
+	// Mix second two diagonals:
+	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
+	__ldrd		r8, r9, sp, 0		// load v[8] and v[9]
+	__ldrd		r10, r11, sp, 20	// load v[13] and v[14]
+	_blake2s_quarterround	r2, r7, r8, r10,  r3, r4, r9, r11, \
+				\s12, \s13, \s14, \s15
+	__strd		r10, r11, sp, 20
+.endm
+
+//
+// void blake2s_compress(struct blake2s_state *state,
+//			 const u8 *block, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_state are used:
+//	u32 h[8];	(inout)
+//	u32 t[2];	(inout)
+//	u32 f[2];	(in)
+//
+	.align		5
+ENTRY(blake2s_compress)
+	push		{r0-r2,r4-r11,lr}	// keep this an even number
+
+.Lnext_block:
+	// r0 is 'state'
+	// r1 is 'block'
+	// r3 is 'inc'
+
+	// Load and increment the counter t[0..1].
+	__ldrd		r10, r11, r0, 32
+	adds		r10, r10, r3
+	adc		r11, r11, #0
+	__strd		r10, r11, r0, 32
+
+	// _blake2s_round is very short on registers, so copy the message block
+	// to the stack to save a register during the rounds.  This also has the
+	// advantage that misalignment only needs to be dealt with in one place.
+	sub		sp, sp, #64
+	mov		r12, sp
+	tst		r1, #3
+	bne		.Lcopy_block_misaligned
+	ldmia		r1!, {r2-r9}
+	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
+	stmia		r12!, {r2-r9}
+	ldmia		r1!, {r2-r9}
+	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
+	stmia		r12, {r2-r9}
+.Lcopy_block_done:
+	str		r1, [sp, #68]		// Update message pointer
+
+	// Calculate v[8..15].  Push v[9..15] onto the stack, and leave space
+	// for spilling v[8..9].  Leave v[8..9] in r8-r9.
+	mov		r14, r0			// r14 = state
+	adr		r12, .Lblake2s_IV
+	ldmia		r12!, {r8-r9}		// load IV[0..1]
+	__ldrd		r0, r1, r14, 40		// load f[0..1]
+	ldm		r12, {r2-r7}		// load IV[3..7]
+	eor		r4, r4, r10		// v[12] = IV[4] ^ t[0]
+	eor		r5, r5, r11		// v[13] = IV[5] ^ t[1]
+	eor		r6, r6, r0		// v[14] = IV[6] ^ f[0]
+	eor		r7, r7, r1		// v[15] = IV[7] ^ f[1]
+	push		{r2-r7}			// push v[9..15]
+	sub		sp, sp, #8		// leave space for v[8..9]
+
+	// Load h[0..7] == v[0..7].
+	ldm		r14, {r0-r7}
+
+	// Execute the rounds.  Each round is provided the order in which it
+	// needs to use the message words.
+	.set brot, 0
+	.set drot, 0
+	_blake2s_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+	_blake2s_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
+	_blake2s_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
+	_blake2s_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
+	_blake2s_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
+	_blake2s_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
+	_blake2s_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
+	_blake2s_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
+	_blake2s_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
+	_blake2s_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
+
+	// Fold the final state matrix into the hash chaining value:
+	//
+	//	for (i = 0; i < 8; i++)
+	//		h[i] ^= v[i] ^ v[i + 8];
+	//
+	ldr		r14, [sp, #96]		// r14 = &h[0]
+	add		sp, sp, #8		// v[8..9] are already loaded.
+	pop		{r10-r11}		// load v[10..11]
+	eor		r0, r0, r8
+	eor		r1, r1, r9
+	eor		r2, r2, r10
+	eor		r3, r3, r11
+	ldm		r14, {r8-r11}		// load h[0..3]
+	eor		r0, r0, r8
+	eor		r1, r1, r9
+	eor		r2, r2, r10
+	eor		r3, r3, r11
+	stmia		r14!, {r0-r3}		// store new h[0..3]
+	ldm		r14, {r0-r3}		// load old h[4..7]
+	pop		{r8-r11}		// load v[12..15]
+	eor		r0, r0, r4, ror #brot
+	eor		r1, r1, r5, ror #brot
+	eor		r2, r2, r6, ror #brot
+	eor		r3, r3, r7, ror #brot
+	eor		r0, r0, r8, ror #drot
+	eor		r1, r1, r9, ror #drot
+	eor		r2, r2, r10, ror #drot
+	eor		r3, r3, r11, ror #drot
+	  add		sp, sp, #64		// skip copy of message block
+	stm		r14, {r0-r3}		// store new h[4..7]
+
+	// Advance to the next block, if there is one.  Note that if there are
+	// multiple blocks, then 'inc' (the counter increment amount) must be
+	// 64.  So we can simply set it to 64 without re-loading it.
+	ldm		sp, {r0, r1, r2}	// load (state, block, nblocks)
+	mov		r3, #64			// set 'inc'
+	subs		r2, r2, #1		// nblocks--
+	str		r2, [sp, #8]
+	bne		.Lnext_block		// nblocks != 0?
+
+	pop		{r0-r2,r4-r11,pc}
+
+	// The next message block (pointed to by r1) isn't 4-byte aligned, so it
+	// can't be loaded using ldmia.  Copy it to the stack buffer (pointed to
+	// by r12) using an alternative method.  r2-r9 are free to use.
+.Lcopy_block_misaligned:
+	mov		r2, #64
+1:
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	ldr		r3, [r1], #4
+	_le32_bswap	r3, r4
+#else
+	ldrb		r3, [r1, #0]
+	ldrb		r4, [r1, #1]
+	ldrb		r5, [r1, #2]
+	ldrb		r6, [r1, #3]
+	add		r1, r1, #4
+	orr		r3, r3, r4, lsl #8
+	orr		r3, r3, r5, lsl #16
+	orr		r3, r3, r6, lsl #24
+#endif
+	subs		r2, r2, #4
+	str		r3, [r12], #4
+	bne		1b
+	b		.Lcopy_block_done
+ENDPROC(blake2s_compress)
diff --git a/arch/arm/lib/crypto/blake2s-glue.c b/arch/arm/lib/crypto/blake2s-glue.c
new file mode 100644
index 000000000000..0238a70d9581
--- /dev/null
+++ b/arch/arm/lib/crypto/blake2s-glue.c
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <crypto/internal/blake2s.h>
+#include <linux/module.h>
+
+/* defined in blake2s-core.S */
+EXPORT_SYMBOL(blake2s_compress);
diff --git a/arch/arm/lib/crypto/chacha-glue.c b/arch/arm/lib/crypto/chacha-glue.c
new file mode 100644
index 000000000000..12afb40cf1ff
--- /dev/null
+++ b/arch/arm/lib/crypto/chacha-glue.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ChaCha and HChaCha functions (ARM optimized)
+ *
+ * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 Martin Willi
+ */
+
+#include <crypto/chacha.h>
+#include <crypto/internal/simd.h>
+#include <linux/jump_label.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <asm/cputype.h>
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+
+asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+				      int nrounds);
+asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
+				       int nrounds, unsigned int nbytes);
+asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
+asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
+
+asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+			     const u32 *state, int nrounds);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
+
+static inline bool neon_usable(void)
+{
+	return static_branch_likely(&use_neon) && crypto_simd_usable();
+}
+
+static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
+	u8 buf[CHACHA_BLOCK_SIZE];
+
+	while (bytes > CHACHA_BLOCK_SIZE) {
+		unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+		chacha_4block_xor_neon(state, dst, src, nrounds, l);
+		bytes -= l;
+		src += l;
+		dst += l;
+		state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
+	}
+	if (bytes) {
+		const u8 *s = src;
+		u8 *d = dst;
+
+		if (bytes != CHACHA_BLOCK_SIZE)
+			s = d = memcpy(buf, src, bytes);
+		chacha_block_xor_neon(state, d, s, nrounds);
+		if (d != dst)
+			memcpy(dst, buf, bytes);
+		state[12]++;
+	}
+}
+
+void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
+{
+	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
+		hchacha_block_arm(state, stream, nrounds);
+	} else {
+		kernel_neon_begin();
+		hchacha_block_neon(state, stream, nrounds);
+		kernel_neon_end();
+	}
+}
+EXPORT_SYMBOL(hchacha_block_arch);
+
+void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
+		       int nrounds)
+{
+	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
+	    bytes <= CHACHA_BLOCK_SIZE) {
+		chacha_doarm(dst, src, bytes, state, nrounds);
+		state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
+		return;
+	}
+
+	do {
+		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
+
+		kernel_neon_begin();
+		chacha_doneon(state, dst, src, todo, nrounds);
+		kernel_neon_end();
+
+		bytes -= todo;
+		src += todo;
+		dst += todo;
+	} while (bytes);
+}
+EXPORT_SYMBOL(chacha_crypt_arch);
+
+bool chacha_is_arch_optimized(void)
+{
+	/* We always can use at least the ARM scalar implementation. */
+	return true;
+}
+EXPORT_SYMBOL(chacha_is_arch_optimized);
+
+static int __init chacha_arm_mod_init(void)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
+		switch (read_cpuid_part()) {
+		case ARM_CPU_PART_CORTEX_A7:
+		case ARM_CPU_PART_CORTEX_A5:
+			/*
+			 * The Cortex-A7 and Cortex-A5 do not perform well with
+			 * the NEON implementation but do incredibly with the
+			 * scalar one and use less power.
+			 */
+			break;
+		default:
+			static_branch_enable(&use_neon);
+		}
+	}
+	return 0;
+}
+arch_initcall(chacha_arm_mod_init);
+
+static void __exit chacha_arm_mod_exit(void)
+{
+}
+module_exit(chacha_arm_mod_exit);
+
+MODULE_DESCRIPTION("ChaCha and HChaCha functions (ARM optimized)");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/arm/lib/crypto/chacha-neon-core.S b/arch/arm/lib/crypto/chacha-neon-core.S
new file mode 100644
index 000000000000..ddd62b6294a5
--- /dev/null
+++ b/arch/arm/lib/crypto/chacha-neon-core.S
@@ -0,0 +1,643 @@
+/*
+ * ChaCha/HChaCha NEON helper functions
+ *
+ * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Based on:
+ * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+ /*
+  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
+  *
+  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
+  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
+  * (c)  vrev32.16			(16-bit rotations only)
+  * (d)  vtbl.8 + vtbl.8		(multiple of 8 bits rotations only,
+  *					 needs index vector)
+  *
+  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
+  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
+  * cycles of (b) on both Cortex-A7 and Cortex-A53.
+  *
+  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+  * and doesn't need a temporary register.
+  *
+  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
+  * is twice as fast as (a), even when doing (a) on multiple registers
+  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
+  * parallelizes better when temporary registers are scarce.
+  *
+  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+  * (a), so the need to load the rotation table actually makes the vtbl method
+  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
+  * seems to be a good compromise to get a more significant speed boost on some
+  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+  */
+
+#include <linux/linkage.h>
+#include <asm/cache.h>
+
+	.text
+	.fpu		neon
+	.align		5
+
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * The round count is given in r3.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha_permute:
+
+	adr		ip, .Lrol8_table
+	vld1.8		{d10}, [ip, :64]
+
+.Ldoubleround:
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vrev32.16	q3, q3
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vtbl.8		d6, {d6}, d10
+	vtbl.8		d7, {d7}, d10
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+	vext.8		q1, q1, q1, #4
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+	vext.8		q3, q3, q3, #12
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vrev32.16	q3, q3
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #12
+	vsri.u32	q1, q4, #20
+
+	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+	vadd.i32	q0, q0, q1
+	veor		q3, q3, q0
+	vtbl.8		d6, {d6}, d10
+	vtbl.8		d7, {d7}, d10
+
+	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+	vadd.i32	q2, q2, q3
+	veor		q4, q1, q2
+	vshl.u32	q1, q4, #7
+	vsri.u32	q1, q4, #25
+
+	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+	vext.8		q1, q1, q1, #12
+	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+	vext.8		q2, q2, q2, #8
+	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+	vext.8		q3, q3, q3, #4
+
+	subs		r3, r3, #2
+	bne		.Ldoubleround
+
+	bx		lr
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_neon)
+	// r0: Input state matrix, s
+	// r1: 1 data block output, o
+	// r2: 1 data block input, i
+	// r3: nrounds
+	push		{lr}
+
+	// x0..3 = s0..3
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	vmov		q8, q0
+	vmov		q9, q1
+	vmov		q10, q2
+	vmov		q11, q3
+
+	bl		chacha_permute
+
+	add		ip, r2, #0x20
+	vld1.8		{q4-q5}, [r2]
+	vld1.8		{q6-q7}, [ip]
+
+	// o0 = i0 ^ (x0 + s0)
+	vadd.i32	q0, q0, q8
+	veor		q0, q0, q4
+
+	// o1 = i1 ^ (x1 + s1)
+	vadd.i32	q1, q1, q9
+	veor		q1, q1, q5
+
+	// o2 = i2 ^ (x2 + s2)
+	vadd.i32	q2, q2, q10
+	veor		q2, q2, q6
+
+	// o3 = i3 ^ (x3 + s3)
+	vadd.i32	q3, q3, q11
+	veor		q3, q3, q7
+
+	add		ip, r1, #0x20
+	vst1.8		{q0-q1}, [r1]
+	vst1.8		{q2-q3}, [ip]
+
+	pop		{pc}
+ENDPROC(chacha_block_xor_neon)
+
+ENTRY(hchacha_block_neon)
+	// r0: Input state matrix, s
+	// r1: output (8 32-bit words)
+	// r2: nrounds
+	push		{lr}
+
+	vld1.32		{q0-q1}, [r0]!
+	vld1.32		{q2-q3}, [r0]
+
+	mov		r3, r2
+	bl		chacha_permute
+
+	vst1.32		{q0}, [r1]!
+	vst1.32		{q3}, [r1]
+
+	pop		{pc}
+ENDPROC(hchacha_block_neon)
+
+	.align		4
+.Lctrinc:	.word	0, 1, 2, 3
+.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
+
+	.align		5
+ENTRY(chacha_4block_xor_neon)
+	push		{r4, lr}
+	mov		r4, sp			// preserve the stack pointer
+	sub		ip, sp, #0x20		// allocate a 32 byte buffer
+	bic		ip, ip, #0x1f		// aligned to 32 bytes
+	mov		sp, ip
+
+	// r0: Input state matrix, s
+	// r1: 4 data blocks output, o
+	// r2: 4 data blocks input, i
+	// r3: nrounds
+
+	//
+	// This function encrypts four consecutive ChaCha blocks by loading
+	// the state matrix in NEON registers four times. The algorithm performs
+	// each operation on the corresponding word of each state matrix, hence
+	// requires no word shuffling. The words are re-interleaved before the
+	// final addition of the original state and the XORing step.
+	//
+
+	// x0..15[0-3] = s0..15[0-3]
+	add		ip, r0, #0x20
+	vld1.32		{q0-q1}, [r0]
+	vld1.32		{q2-q3}, [ip]
+
+	adr		lr, .Lctrinc
+	vdup.32		q15, d7[1]
+	vdup.32		q14, d7[0]
+	vld1.32		{q4}, [lr, :128]
+	vdup.32		q13, d6[1]
+	vdup.32		q12, d6[0]
+	vdup.32		q11, d5[1]
+	vdup.32		q10, d5[0]
+	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
+	vdup.32		q9, d4[1]
+	vdup.32		q8, d4[0]
+	vdup.32		q7, d3[1]
+	vdup.32		q6, d3[0]
+	vdup.32		q5, d2[1]
+	vdup.32		q4, d2[0]
+	vdup.32		q3, d1[1]
+	vdup.32		q2, d1[0]
+	vdup.32		q1, d0[1]
+	vdup.32		q0, d0[0]
+
+	adr		ip, .Lrol8_table
+	b		1f
+
+.Ldoubleround4:
+	vld1.32		{q8-q9}, [sp, :256]
+1:
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q12, q12, q0
+	veor		q13, q13, q1
+	veor		q14, q14, q2
+	veor		q15, q15, q3
+
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+	vrev32.16	q15, q15
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #12
+	vshl.u32	q5, q9, #12
+	vsri.u32	q4, q8, #20
+	vsri.u32	q5, q9, #20
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #12
+	vshl.u32	q7, q9, #12
+	vsri.u32	q6, q8, #20
+	vsri.u32	q7, q9, #20
+
+	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+	vld1.8		{d16}, [ip, :64]
+	vadd.i32	q0, q0, q4
+	vadd.i32	q1, q1, q5
+	vadd.i32	q2, q2, q6
+	vadd.i32	q3, q3, q7
+
+	veor		q12, q12, q0
+	veor		q13, q13, q1
+	veor		q14, q14, q2
+	veor		q15, q15, q3
+
+	vtbl.8		d24, {d24}, d16
+	vtbl.8		d25, {d25}, d16
+	vtbl.8		d26, {d26}, d16
+	vtbl.8		d27, {d27}, d16
+	vtbl.8		d28, {d28}, d16
+	vtbl.8		d29, {d29}, d16
+	vtbl.8		d30, {d30}, d16
+	vtbl.8		d31, {d31}, d16
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+	vadd.i32	q8, q8, q12
+	vadd.i32	q9, q9, q13
+	vadd.i32	q10, q10, q14
+	vadd.i32	q11, q11, q15
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q4, q8
+	veor		q9, q5, q9
+	vshl.u32	q4, q8, #7
+	vshl.u32	q5, q9, #7
+	vsri.u32	q4, q8, #25
+	vsri.u32	q5, q9, #25
+
+	veor		q8, q6, q10
+	veor		q9, q7, q11
+	vshl.u32	q6, q8, #7
+	vshl.u32	q7, q9, #7
+	vsri.u32	q6, q8, #25
+	vsri.u32	q7, q9, #25
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q15, q15, q0
+	veor		q12, q12, q1
+	veor		q13, q13, q2
+	veor		q14, q14, q3
+
+	vrev32.16	q15, q15
+	vrev32.16	q12, q12
+	vrev32.16	q13, q13
+	vrev32.16	q14, q14
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #12
+	vshl.u32	q4, q9, #12
+	vsri.u32	q7, q8, #20
+	vsri.u32	q4, q9, #20
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #12
+	vshl.u32	q6, q9, #12
+	vsri.u32	q5, q8, #20
+	vsri.u32	q6, q9, #20
+
+	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+	vld1.8		{d16}, [ip, :64]
+	vadd.i32	q0, q0, q5
+	vadd.i32	q1, q1, q6
+	vadd.i32	q2, q2, q7
+	vadd.i32	q3, q3, q4
+
+	veor		q15, q15, q0
+	veor		q12, q12, q1
+	veor		q13, q13, q2
+	veor		q14, q14, q3
+
+	vtbl.8		d30, {d30}, d16
+	vtbl.8		d31, {d31}, d16
+	vtbl.8		d24, {d24}, d16
+	vtbl.8		d25, {d25}, d16
+	vtbl.8		d26, {d26}, d16
+	vtbl.8		d27, {d27}, d16
+	vtbl.8		d28, {d28}, d16
+	vtbl.8		d29, {d29}, d16
+
+	vld1.32		{q8-q9}, [sp, :256]
+
+	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+	vadd.i32	q10, q10, q15
+	vadd.i32	q11, q11, q12
+	vadd.i32	q8, q8, q13
+	vadd.i32	q9, q9, q14
+
+	vst1.32		{q8-q9}, [sp, :256]
+
+	veor		q8, q7, q8
+	veor		q9, q4, q9
+	vshl.u32	q7, q8, #7
+	vshl.u32	q4, q9, #7
+	vsri.u32	q7, q8, #25
+	vsri.u32	q4, q9, #25
+
+	veor		q8, q5, q10
+	veor		q9, q6, q11
+	vshl.u32	q5, q8, #7
+	vshl.u32	q6, q9, #7
+	vsri.u32	q5, q8, #25
+	vsri.u32	q6, q9, #25
+
+	subs		r3, r3, #2
+	bne		.Ldoubleround4
+
+	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+	// x8..9[0-3] are on the stack.
+
+	// Re-interleave the words in the first two rows of each block (x0..7).
+	// Also add the counter values 0-3 to x12[0-3].
+	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
+	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
+	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
+	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
+	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
+	  vadd.u32	q12, q8			// x12 += counter values 0-3
+	vswp		d1, d4
+	vswp		d3, d6
+	  vld1.32	{q8-q9}, [r0]!		// load s0..7
+	vswp		d9, d12
+	vswp		d11, d14
+
+	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+	// after XORing the first 32 bytes.
+	vswp		q1, q4
+
+	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
+	vadd.u32	q0, q0, q8
+	vadd.u32	q2, q2, q8
+	vadd.u32	q4, q4, q8
+	vadd.u32	q3, q3, q8
+
+	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
+	vadd.u32	q1, q1, q9
+	vadd.u32	q6, q6, q9
+	vadd.u32	q5, q5, q9
+	vadd.u32	q7, q7, q9
+
+	// XOR first 32 bytes using keystream from first two rows of first block
+	vld1.8		{q8-q9}, [r2]!
+	veor		q8, q8, q0
+	veor		q9, q9, q1
+	vst1.8		{q8-q9}, [r1]!
+
+	// Re-interleave the words in the last two rows of each block (x8..15).
+	vld1.32		{q8-q9}, [sp, :256]
+	  mov		sp, r4		// restore original stack pointer
+	  ldr		r4, [r4, #8]	// load number of bytes
+	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
+	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
+	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
+	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
+	  vld1.32	{q0-q1}, [r0]	// load s8..15
+	vswp		d25, d28
+	vswp		d27, d30
+	vswp		d17, d20
+	vswp		d19, d22
+
+	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
+	vadd.u32	q8,  q8,  q0
+	vadd.u32	q10, q10, q0
+	vadd.u32	q9,  q9,  q0
+	vadd.u32	q11, q11, q0
+
+	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+	vadd.u32	q12, q12, q1
+	vadd.u32	q14, q14, q1
+	vadd.u32	q13, q13, q1
+	vadd.u32	q15, q15, q1
+
+	// XOR the rest of the data with the keystream
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #96
+	veor		q0, q0, q8
+	veor		q1, q1, q12
+	ble		.Lle96
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
+	veor		q0, q0, q2
+	veor		q1, q1, q6
+	ble		.Lle128
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
+	veor		q0, q0, q10
+	veor		q1, q1, q14
+	ble		.Lle160
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
+	veor		q0, q0, q4
+	veor		q1, q1, q5
+	ble		.Lle192
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
+	veor		q0, q0, q9
+	veor		q1, q1, q13
+	ble		.Lle224
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]!
+	subs		r4, r4, #32
+	veor		q0, q0, q3
+	veor		q1, q1, q7
+	blt		.Llt256
+.Lout:
+	vst1.8		{q0-q1}, [r1]!
+
+	vld1.8		{q0-q1}, [r2]
+	veor		q0, q0, q11
+	veor		q1, q1, q15
+	vst1.8		{q0-q1}, [r1]
+
+	pop		{r4, pc}
+
+.Lle192:
+	vmov		q4, q9
+	vmov		q5, q13
+
+.Lle160:
+	// nothing to do
+
+.Lfinalblock:
+	// Process the final block if processing less than 4 full blocks.
+	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
+	// previous 32 byte output block that still needs to be written at
+	// [r1] in q0-q1.
+	beq		.Lfullblock
+
+.Lpartialblock:
+	adr		lr, .Lpermute + 32
+	add		r2, r2, r4
+	add		lr, lr, r4
+	add		r4, r4, r1
+
+	vld1.8		{q2-q3}, [lr]
+	vld1.8		{q6-q7}, [r2]
+
+	add		r4, r4, #32
+
+	vtbl.8		d4, {q4-q5}, d4
+	vtbl.8		d5, {q4-q5}, d5
+	vtbl.8		d6, {q4-q5}, d6
+	vtbl.8		d7, {q4-q5}, d7
+
+	veor		q6, q6, q2
+	veor		q7, q7, q3
+
+	vst1.8		{q6-q7}, [r4]	// overlapping stores
+	vst1.8		{q0-q1}, [r1]
+	pop		{r4, pc}
+
+.Lfullblock:
+	vmov		q11, q4
+	vmov		q15, q5
+	b		.Lout
+.Lle96:
+	vmov		q4, q2
+	vmov		q5, q6
+	b		.Lfinalblock
+.Lle128:
+	vmov		q4, q10
+	vmov		q5, q14
+	b		.Lfinalblock
+.Lle224:
+	vmov		q4, q3
+	vmov		q5, q7
+	b		.Lfinalblock
+.Llt256:
+	vmov		q4, q11
+	vmov		q5, q15
+	b		.Lpartialblock
+ENDPROC(chacha_4block_xor_neon)
+
+	.align		L1_CACHE_SHIFT
+.Lpermute:
+	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
diff --git a/arch/arm/lib/crypto/chacha-scalar-core.S b/arch/arm/lib/crypto/chacha-scalar-core.S
new file mode 100644
index 000000000000..083fe1ab96d0
--- /dev/null
+++ b/arch/arm/lib/crypto/chacha-scalar-core.S
@@ -0,0 +1,443 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2018 Google, Inc.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Design notes:
+ *
+ * 16 registers would be needed to hold the state matrix, but only 14 are
+ * available because 'sp' and 'pc' cannot be used.  So we spill the elements
+ * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
+ * 'ldrd' and one 'strd' instruction per round.
+ *
+ * All rotates are performed using the implicit rotate operand accepted by the
+ * 'add' and 'eor' instructions.  This is faster than using explicit rotate
+ * instructions.  To make this work, we allow the values in the second and last
+ * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
+ * wrong rotation amount.  The rotation amount is then fixed up just in time
+ * when the values are used.  'brot' is the number of bits the values in row 'b'
+ * need to be rotated right to arrive at the correct values, and 'drot'
+ * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
+ * that they end up as (25, 24) after every round.
+ */
+
+	// ChaCha state registers
+	X0	.req	r0
+	X1	.req	r1
+	X2	.req	r2
+	X3	.req	r3
+	X4	.req	r4
+	X5	.req	r5
+	X6	.req	r6
+	X7	.req	r7
+	X8_X10	.req	r8	// shared by x8 and x10
+	X9_X11	.req	r9	// shared by x9 and x11
+	X12	.req	r10
+	X13	.req	r11
+	X14	.req	r12
+	X15	.req	r14
+
+.macro _le32_bswap_4x	a, b, c, d,  tmp
+#ifdef __ARMEB__
+	rev_l		\a,  \tmp
+	rev_l		\b,  \tmp
+	rev_l		\c,  \tmp
+	rev_l		\d,  \tmp
+#endif
+.endm
+
+.macro __ldrd		a, b, src, offset
+#if __LINUX_ARM_ARCH__ >= 6
+	ldrd		\a, \b, [\src, #\offset]
+#else
+	ldr		\a, [\src, #\offset]
+	ldr		\b, [\src, #\offset + 4]
+#endif
+.endm
+
+.macro __strd		a, b, dst, offset
+#if __LINUX_ARM_ARCH__ >= 6
+	strd		\a, \b, [\dst, #\offset]
+#else
+	str		\a, [\dst, #\offset]
+	str		\b, [\dst, #\offset + 4]
+#endif
+.endm
+
+.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2
+
+	// a += b; d ^= a; d = rol(d, 16);
+	add		\a1, \a1, \b1, ror #brot
+	add		\a2, \a2, \b2, ror #brot
+	eor		\d1, \a1, \d1, ror #drot
+	eor		\d2, \a2, \d2, ror #drot
+	// drot == 32 - 16 == 16
+
+	// c += d; b ^= c; b = rol(b, 12);
+	add		\c1, \c1, \d1, ror #16
+	add		\c2, \c2, \d2, ror #16
+	eor		\b1, \c1, \b1, ror #brot
+	eor		\b2, \c2, \b2, ror #brot
+	// brot == 32 - 12 == 20
+
+	// a += b; d ^= a; d = rol(d, 8);
+	add		\a1, \a1, \b1, ror #20
+	add		\a2, \a2, \b2, ror #20
+	eor		\d1, \a1, \d1, ror #16
+	eor		\d2, \a2, \d2, ror #16
+	// drot == 32 - 8 == 24
+
+	// c += d; b ^= c; b = rol(b, 7);
+	add		\c1, \c1, \d1, ror #24
+	add		\c2, \c2, \d2, ror #24
+	eor		\b1, \c1, \b1, ror #20
+	eor		\b2, \c2, \b2, ror #20
+	// brot == 32 - 7 == 25
+.endm
+
+.macro _doubleround
+
+	// column round
+
+	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
+	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13
+
+	// save (x8, x9); restore (x10, x11)
+	__strd		X8_X10, X9_X11, sp, 0
+	__ldrd		X8_X10, X9_X11, sp, 8
+
+	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
+	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15
+
+	.set brot, 25
+	.set drot, 24
+
+	// diagonal round
+
+	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
+	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12
+
+	// save (x10, x11); restore (x8, x9)
+	__strd		X8_X10, X9_X11, sp, 8
+	__ldrd		X8_X10, X9_X11, sp, 0
+
+	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
+	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
+.endm
+
+.macro _chacha_permute	nrounds
+	.set brot, 0
+	.set drot, 0
+	.rept \nrounds / 2
+	 _doubleround
+	.endr
+.endm
+
+.macro _chacha		nrounds
+
+.Lnext_block\@:
+	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
+	// Registers contain x0-x9,x12-x15.
+
+	// Do the core ChaCha permutation to update x0-x15.
+	_chacha_permute	\nrounds
+
+	add		sp, #8
+	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
+	// Registers contain x0-x9,x12-x15.
+	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
+	push		{X8_X10, X9_X11, X12, X13, X14, X15}
+
+	// Load (OUT, IN, LEN).
+	ldr		r14, [sp, #96]
+	ldr		r12, [sp, #100]
+	ldr		r11, [sp, #104]
+
+	orr		r10, r14, r12
+
+	// Use slow path if fewer than 64 bytes remain.
+	cmp		r11, #64
+	blt		.Lxor_slowpath\@
+
+	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
+	// ARMv6+, since ldmia and stmia (used below) still require alignment.
+	tst		r10, #3
+	bne		.Lxor_slowpath\@
+
+	// Fast path: XOR 64 bytes of aligned data.
+
+	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
+	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+	// x0-x3
+	__ldrd		r8, r9, sp, 32
+	__ldrd		r10, r11, sp, 40
+	add		X0, X0, r8
+	add		X1, X1, r9
+	add		X2, X2, r10
+	add		X3, X3, r11
+	_le32_bswap_4x	X0, X1, X2, X3,  r8
+	ldmia		r12!, {r8-r11}
+	eor		X0, X0, r8
+	eor		X1, X1, r9
+	eor		X2, X2, r10
+	eor		X3, X3, r11
+	stmia		r14!, {X0-X3}
+
+	// x4-x7
+	__ldrd		r8, r9, sp, 48
+	__ldrd		r10, r11, sp, 56
+	add		X4, r8, X4, ror #brot
+	add		X5, r9, X5, ror #brot
+	ldmia		r12!, {X0-X3}
+	add		X6, r10, X6, ror #brot
+	add		X7, r11, X7, ror #brot
+	_le32_bswap_4x	X4, X5, X6, X7,  r8
+	eor		X4, X4, X0
+	eor		X5, X5, X1
+	eor		X6, X6, X2
+	eor		X7, X7, X3
+	stmia		r14!, {X4-X7}
+
+	// x8-x15
+	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
+	__ldrd		r8, r9, sp, 32
+	__ldrd		r10, r11, sp, 40
+	add		r0, r0, r8		// x8
+	add		r1, r1, r9		// x9
+	add		r6, r6, r10		// x10
+	add		r7, r7, r11		// x11
+	_le32_bswap_4x	r0, r1, r6, r7,  r8
+	ldmia		r12!, {r8-r11}
+	eor		r0, r0, r8		// x8
+	eor		r1, r1, r9		// x9
+	eor		r6, r6, r10		// x10
+	eor		r7, r7, r11		// x11
+	stmia		r14!, {r0,r1,r6,r7}
+	ldmia		r12!, {r0,r1,r6,r7}
+	__ldrd		r8, r9, sp, 48
+	__ldrd		r10, r11, sp, 56
+	add		r2, r8, r2, ror #drot	// x12
+	add		r3, r9, r3, ror #drot	// x13
+	add		r4, r10, r4, ror #drot	// x14
+	add		r5, r11, r5, ror #drot	// x15
+	_le32_bswap_4x	r2, r3, r4, r5,  r9
+	  ldr		r9, [sp, #72]		// load LEN
+	eor		r2, r2, r0		// x12
+	eor		r3, r3, r1		// x13
+	eor		r4, r4, r6		// x14
+	eor		r5, r5, r7		// x15
+	  subs		r9, #64			// decrement and check LEN
+	stmia		r14!, {r2-r5}
+
+	beq		.Ldone\@
+
+.Lprepare_for_next_block\@:
+
+	// Stack: x0-x15 OUT IN LEN
+
+	// Increment block counter (x12)
+	add		r8, #1
+
+	// Store updated (OUT, IN, LEN)
+	str		r14, [sp, #64]
+	str		r12, [sp, #68]
+	str		r9, [sp, #72]
+
+	  mov		r14, sp
+
+	// Store updated block counter (x12)
+	str		r8, [sp, #48]
+
+	  sub		sp, #16
+
+	// Reload state and do next block
+	ldmia		r14!, {r0-r11}		// load x0-x11
+	__strd		r10, r11, sp, 8		// store x10-x11 before state
+	ldmia		r14, {r10-r12,r14}	// load x12-x15
+	b		.Lnext_block\@
+
+.Lxor_slowpath\@:
+	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
+	// We handle it by storing the 64 bytes of keystream to the stack, then
+	// XOR-ing the needed portion with the data.
+
+	// Allocate keystream buffer
+	sub		sp, #64
+	mov		r14, sp
+
+	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
+	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
+	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
+
+	// Save keystream for x0-x3
+	__ldrd		r8, r9, sp, 96
+	__ldrd		r10, r11, sp, 104
+	add		X0, X0, r8
+	add		X1, X1, r9
+	add		X2, X2, r10
+	add		X3, X3, r11
+	_le32_bswap_4x	X0, X1, X2, X3,  r8
+	stmia		r14!, {X0-X3}
+
+	// Save keystream for x4-x7
+	__ldrd		r8, r9, sp, 112
+	__ldrd		r10, r11, sp, 120
+	add		X4, r8, X4, ror #brot
+	add		X5, r9, X5, ror #brot
+	add		X6, r10, X6, ror #brot
+	add		X7, r11, X7, ror #brot
+	_le32_bswap_4x	X4, X5, X6, X7,  r8
+	  add		r8, sp, #64
+	stmia		r14!, {X4-X7}
+
+	// Save keystream for x8-x15
+	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
+	__ldrd		r8, r9, sp, 128
+	__ldrd		r10, r11, sp, 136
+	add		r0, r0, r8		// x8
+	add		r1, r1, r9		// x9
+	add		r6, r6, r10		// x10
+	add		r7, r7, r11		// x11
+	_le32_bswap_4x	r0, r1, r6, r7,  r8
+	stmia		r14!, {r0,r1,r6,r7}
+	__ldrd		r8, r9, sp, 144
+	__ldrd		r10, r11, sp, 152
+	add		r2, r8, r2, ror #drot	// x12
+	add		r3, r9, r3, ror #drot	// x13
+	add		r4, r10, r4, ror #drot	// x14
+	add		r5, r11, r5, ror #drot	// x15
+	_le32_bswap_4x	r2, r3, r4, r5,  r9
+	stmia		r14, {r2-r5}
+
+	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
+	// Registers: r8 is block counter, r12 is IN.
+
+	ldr		r9, [sp, #168]		// LEN
+	ldr		r14, [sp, #160]		// OUT
+	cmp		r9, #64
+	  mov		r0, sp
+	movle		r1, r9
+	movgt		r1, #64
+	// r1 is number of bytes to XOR, in range [1, 64]
+
+.if __LINUX_ARM_ARCH__ < 6
+	orr		r2, r12, r14
+	tst		r2, #3			// IN or OUT misaligned?
+	bne		.Lxor_next_byte\@
+.endif
+
+	// XOR a word at a time
+.rept 16
+	subs		r1, #4
+	blt		.Lxor_words_done\@
+	ldr		r2, [r12], #4
+	ldr		r3, [r0], #4
+	eor		r2, r2, r3
+	str		r2, [r14], #4
+.endr
+	b		.Lxor_slowpath_done\@
+.Lxor_words_done\@:
+	ands		r1, r1, #3
+	beq		.Lxor_slowpath_done\@
+
+	// XOR a byte at a time
+.Lxor_next_byte\@:
+	ldrb		r2, [r12], #1
+	ldrb		r3, [r0], #1
+	eor		r2, r2, r3
+	strb		r2, [r14], #1
+	subs		r1, #1
+	bne		.Lxor_next_byte\@
+
+.Lxor_slowpath_done\@:
+	subs		r9, #64
+	add		sp, #96
+	bgt		.Lprepare_for_next_block\@
+
+.Ldone\@:
+.endm	// _chacha
+
+/*
+ * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
+ *		     const u32 *state, int nrounds);
+ */
+ENTRY(chacha_doarm)
+	cmp		r2, #0			// len == 0?
+	reteq		lr
+
+	ldr		ip, [sp]
+	cmp		ip, #12
+
+	push		{r0-r2,r4-r11,lr}
+
+	// Push state x0-x15 onto stack.
+	// Also store an extra copy of x10-x11 just before the state.
+
+	add		X12, r3, #48
+	ldm		X12, {X12,X13,X14,X15}
+	push		{X12,X13,X14,X15}
+	sub		sp, sp, #64
+
+	__ldrd		X8_X10, X9_X11, r3, 40
+	__strd		X8_X10, X9_X11, sp, 8
+	__strd		X8_X10, X9_X11, sp, 56
+	ldm		r3, {X0-X9_X11}
+	__strd		X0, X1, sp, 16
+	__strd		X2, X3, sp, 24
+	__strd		X4, X5, sp, 32
+	__strd		X6, X7, sp, 40
+	__strd		X8_X10, X9_X11, sp, 48
+
+	beq		1f
+	_chacha		20
+
+0:	add		sp, #76
+	pop		{r4-r11, pc}
+
+1:	_chacha		12
+	b		0b
+ENDPROC(chacha_doarm)
+
+/*
+ * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
+ */
+ENTRY(hchacha_block_arm)
+	push		{r1,r4-r11,lr}
+
+	cmp		r2, #12			// ChaCha12 ?
+
+	mov		r14, r0
+	ldmia		r14!, {r0-r11}		// load x0-x11
+	push		{r10-r11}		// store x10-x11 to stack
+	ldm		r14, {r10-r12,r14}	// load x12-x15
+	sub		sp, #8
+
+	beq		1f
+	_chacha_permute	20
+
+	// Skip over (unused0-unused1, x10-x11)
+0:	add		sp, #16
+
+	// Fix up rotations of x12-x15
+	ror		X12, X12, #drot
+	ror		X13, X13, #drot
+	  pop		{r4}			// load 'out'
+	ror		X14, X14, #drot
+	ror		X15, X15, #drot
+
+	// Store (x0-x3,x12-x15) to 'out'
+	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}
+
+	pop		{r4-r11,pc}
+
+1:	_chacha_permute	12
+	b		0b
+ENDPROC(hchacha_block_arm)
diff --git a/arch/arm/lib/crypto/poly1305-armv4.pl b/arch/arm/lib/crypto/poly1305-armv4.pl
new file mode 100644
index 000000000000..6d79498d3115
--- /dev/null
+++ b/arch/arm/lib/crypto/poly1305-armv4.pl
@@ -0,0 +1,1236 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
+#
+# ====================================================================
+# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+# project.
+# ====================================================================
+#
+#			IALU(*)/gcc-4.4		NEON
+#
+# ARM11xx(ARMv6)	7.78/+100%		-
+# Cortex-A5		6.35/+130%		3.00
+# Cortex-A8		6.25/+115%		2.36
+# Cortex-A9		5.10/+95%		2.55
+# Cortex-A15		3.85/+85%		1.25(**)
+# Snapdragon S4		5.70/+100%		1.48(**)
+#
+# (*)	this is for -march=armv6, i.e. with bunch of ldrb loading data;
+# (**)	these are trade-off results, they can be improved by ~8% but at
+#	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
+#	to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
+
+$flavour = shift;
+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
+
+($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
+
+$code.=<<___;
+#ifndef	__KERNEL__
+# include "arm_arch.h"
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
+# define poly1305_init   poly1305_init_arm
+# define poly1305_blocks poly1305_blocks_arm
+# define poly1305_emit   poly1305_emit_arm
+.globl	poly1305_blocks_neon
+#endif
+
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+.text
+
+.globl	poly1305_emit
+.globl	poly1305_blocks
+.globl	poly1305_init
+.type	poly1305_init,%function
+.align	5
+poly1305_init:
+.Lpoly1305_init:
+	stmdb	sp!,{r4-r11}
+
+	eor	r3,r3,r3
+	cmp	$inp,#0
+	str	r3,[$ctx,#0]		@ zero hash value
+	str	r3,[$ctx,#4]
+	str	r3,[$ctx,#8]
+	str	r3,[$ctx,#12]
+	str	r3,[$ctx,#16]
+	str	r3,[$ctx,#36]		@ clear is_base2_26
+	add	$ctx,$ctx,#20
+
+#ifdef	__thumb2__
+	it	eq
+#endif
+	moveq	r0,#0
+	beq	.Lno_key
+
+#if	__ARM_MAX_ARCH__>=7
+	mov	r3,#-1
+	str	r3,[$ctx,#28]		@ impossible key power value
+# ifndef __KERNEL__
+	adr	r11,.Lpoly1305_init
+	ldr	r12,.LOPENSSL_armcap
+# endif
+#endif
+	ldrb	r4,[$inp,#0]
+	mov	r10,#0x0fffffff
+	ldrb	r5,[$inp,#1]
+	and	r3,r10,#-4		@ 0x0ffffffc
+	ldrb	r6,[$inp,#2]
+	ldrb	r7,[$inp,#3]
+	orr	r4,r4,r5,lsl#8
+	ldrb	r5,[$inp,#4]
+	orr	r4,r4,r6,lsl#16
+	ldrb	r6,[$inp,#5]
+	orr	r4,r4,r7,lsl#24
+	ldrb	r7,[$inp,#6]
+	and	r4,r4,r10
+
+#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+# if !defined(_WIN32)
+	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
+# endif
+# if defined(__APPLE__) || defined(_WIN32)
+	ldr	r12,[r12]
+# endif
+#endif
+	ldrb	r8,[$inp,#7]
+	orr	r5,r5,r6,lsl#8
+	ldrb	r6,[$inp,#8]
+	orr	r5,r5,r7,lsl#16
+	ldrb	r7,[$inp,#9]
+	orr	r5,r5,r8,lsl#24
+	ldrb	r8,[$inp,#10]
+	and	r5,r5,r3
+
+#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	tst	r12,#ARMV7_NEON		@ check for NEON
+# ifdef	__thumb2__
+	adr	r9,.Lpoly1305_blocks_neon
+	adr	r11,.Lpoly1305_blocks
+	it	ne
+	movne	r11,r9
+	adr	r12,.Lpoly1305_emit
+	orr	r11,r11,#1		@ thumb-ify addresses
+	orr	r12,r12,#1
+# else
+	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
+	ite	eq
+	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
+	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
+# endif
+#endif
+	ldrb	r9,[$inp,#11]
+	orr	r6,r6,r7,lsl#8
+	ldrb	r7,[$inp,#12]
+	orr	r6,r6,r8,lsl#16
+	ldrb	r8,[$inp,#13]
+	orr	r6,r6,r9,lsl#24
+	ldrb	r9,[$inp,#14]
+	and	r6,r6,r3
+
+	ldrb	r10,[$inp,#15]
+	orr	r7,r7,r8,lsl#8
+	str	r4,[$ctx,#0]
+	orr	r7,r7,r9,lsl#16
+	str	r5,[$ctx,#4]
+	orr	r7,r7,r10,lsl#24
+	str	r6,[$ctx,#8]
+	and	r7,r7,r3
+	str	r7,[$ctx,#12]
+#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	stmia	r2,{r11,r12}		@ fill functions table
+	mov	r0,#1
+#else
+	mov	r0,#0
+#endif
+.Lno_key:
+	ldmia	sp!,{r4-r11}
+#if	__ARM_ARCH__>=5
+	ret				@ bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	poly1305_init,.-poly1305_init
+___
+{
+my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
+my ($s1,$s2,$s3)=($r1,$r2,$r3);
+
+$code.=<<___;
+.type	poly1305_blocks,%function
+.align	5
+poly1305_blocks:
+.Lpoly1305_blocks:
+	stmdb	sp!,{r3-r11,lr}
+
+	ands	$len,$len,#-16
+	beq	.Lno_data
+
+	add	$len,$len,$inp		@ end pointer
+	sub	sp,sp,#32
+
+#if __ARM_ARCH__<7
+	ldmia	$ctx,{$h0-$r3}		@ load context
+	add	$ctx,$ctx,#20
+	str	$len,[sp,#16]		@ offload stuff
+	str	$ctx,[sp,#12]
+#else
+	ldr	lr,[$ctx,#36]		@ is_base2_26
+	ldmia	$ctx!,{$h0-$h4}		@ load hash value
+	str	$len,[sp,#16]		@ offload stuff
+	str	$ctx,[sp,#12]
+
+	adds	$r0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
+	mov	$r1,$h1,lsr#6
+	adcs	$r1,$r1,$h2,lsl#20
+	mov	$r2,$h2,lsr#12
+	adcs	$r2,$r2,$h3,lsl#14
+	mov	$r3,$h3,lsr#18
+	adcs	$r3,$r3,$h4,lsl#8
+	mov	$len,#0
+	teq	lr,#0
+	str	$len,[$ctx,#16]		@ clear is_base2_26
+	adc	$len,$len,$h4,lsr#24
+
+	itttt	ne
+	movne	$h0,$r0			@ choose between radixes
+	movne	$h1,$r1
+	movne	$h2,$r2
+	movne	$h3,$r3
+	ldmia	$ctx,{$r0-$r3}		@ load key
+	it	ne
+	movne	$h4,$len
+#endif
+
+	mov	lr,$inp
+	cmp	$padbit,#0
+	str	$r1,[sp,#20]
+	str	$r2,[sp,#24]
+	str	$r3,[sp,#28]
+	b	.Loop
+
+.align	4
+.Loop:
+#if __ARM_ARCH__<7
+	ldrb	r0,[lr],#16		@ load input
+# ifdef	__thumb2__
+	it	hi
+# endif
+	addhi	$h4,$h4,#1		@ 1<<128
+	ldrb	r1,[lr,#-15]
+	ldrb	r2,[lr,#-14]
+	ldrb	r3,[lr,#-13]
+	orr	r1,r0,r1,lsl#8
+	ldrb	r0,[lr,#-12]
+	orr	r2,r1,r2,lsl#16
+	ldrb	r1,[lr,#-11]
+	orr	r3,r2,r3,lsl#24
+	ldrb	r2,[lr,#-10]
+	adds	$h0,$h0,r3		@ accumulate input
+
+	ldrb	r3,[lr,#-9]
+	orr	r1,r0,r1,lsl#8
+	ldrb	r0,[lr,#-8]
+	orr	r2,r1,r2,lsl#16
+	ldrb	r1,[lr,#-7]
+	orr	r3,r2,r3,lsl#24
+	ldrb	r2,[lr,#-6]
+	adcs	$h1,$h1,r3
+
+	ldrb	r3,[lr,#-5]
+	orr	r1,r0,r1,lsl#8
+	ldrb	r0,[lr,#-4]
+	orr	r2,r1,r2,lsl#16
+	ldrb	r1,[lr,#-3]
+	orr	r3,r2,r3,lsl#24
+	ldrb	r2,[lr,#-2]
+	adcs	$h2,$h2,r3
+
+	ldrb	r3,[lr,#-1]
+	orr	r1,r0,r1,lsl#8
+	str	lr,[sp,#8]		@ offload input pointer
+	orr	r2,r1,r2,lsl#16
+	add	$s1,$r1,$r1,lsr#2
+	orr	r3,r2,r3,lsl#24
+#else
+	ldr	r0,[lr],#16		@ load input
+	it	hi
+	addhi	$h4,$h4,#1		@ padbit
+	ldr	r1,[lr,#-12]
+	ldr	r2,[lr,#-8]
+	ldr	r3,[lr,#-4]
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+	adds	$h0,$h0,r0		@ accumulate input
+	str	lr,[sp,#8]		@ offload input pointer
+	adcs	$h1,$h1,r1
+	add	$s1,$r1,$r1,lsr#2
+	adcs	$h2,$h2,r2
+#endif
+	add	$s2,$r2,$r2,lsr#2
+	adcs	$h3,$h3,r3
+	add	$s3,$r3,$r3,lsr#2
+
+	umull	r2,r3,$h1,$r0
+	 adc	$h4,$h4,#0
+	umull	r0,r1,$h0,$r0
+	umlal	r2,r3,$h4,$s1
+	umlal	r0,r1,$h3,$s1
+	ldr	$r1,[sp,#20]		@ reload $r1
+	umlal	r2,r3,$h2,$s3
+	umlal	r0,r1,$h1,$s3
+	umlal	r2,r3,$h3,$s2
+	umlal	r0,r1,$h2,$s2
+	umlal	r2,r3,$h0,$r1
+	str	r0,[sp,#0]		@ future $h0
+	 mul	r0,$s2,$h4
+	ldr	$r2,[sp,#24]		@ reload $r2
+	adds	r2,r2,r1		@ d1+=d0>>32
+	 eor	r1,r1,r1
+	adc	lr,r3,#0		@ future $h2
+	str	r2,[sp,#4]		@ future $h1
+
+	mul	r2,$s3,$h4
+	eor	r3,r3,r3
+	umlal	r0,r1,$h3,$s3
+	ldr	$r3,[sp,#28]		@ reload $r3
+	umlal	r2,r3,$h3,$r0
+	umlal	r0,r1,$h2,$r0
+	umlal	r2,r3,$h2,$r1
+	umlal	r0,r1,$h1,$r1
+	umlal	r2,r3,$h1,$r2
+	umlal	r0,r1,$h0,$r2
+	umlal	r2,r3,$h0,$r3
+	ldr	$h0,[sp,#0]
+	mul	$h4,$r0,$h4
+	ldr	$h1,[sp,#4]
+
+	adds	$h2,lr,r0		@ d2+=d1>>32
+	ldr	lr,[sp,#8]		@ reload input pointer
+	adc	r1,r1,#0
+	adds	$h3,r2,r1		@ d3+=d2>>32
+	ldr	r0,[sp,#16]		@ reload end pointer
+	adc	r3,r3,#0
+	add	$h4,$h4,r3		@ h4+=d3>>32
+
+	and	r1,$h4,#-4
+	and	$h4,$h4,#3
+	add	r1,r1,r1,lsr#2		@ *=5
+	adds	$h0,$h0,r1
+	adcs	$h1,$h1,#0
+	adcs	$h2,$h2,#0
+	adcs	$h3,$h3,#0
+	adc	$h4,$h4,#0
+
+	cmp	r0,lr			@ done yet?
+	bhi	.Loop
+
+	ldr	$ctx,[sp,#12]
+	add	sp,sp,#32
+	stmdb	$ctx,{$h0-$h4}		@ store the result
+
+.Lno_data:
+#if	__ARM_ARCH__>=5
+	ldmia	sp!,{r3-r11,pc}
+#else
+	ldmia	sp!,{r3-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	poly1305_blocks,.-poly1305_blocks
+___
+}
+{
+my ($ctx,$mac,$nonce)=map("r$_",(0..2));
+my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
+my $g4=$ctx;
+
+$code.=<<___;
+.type	poly1305_emit,%function
+.align	5
+poly1305_emit:
+.Lpoly1305_emit:
+	stmdb	sp!,{r4-r11}
+
+	ldmia	$ctx,{$h0-$h4}
+
+#if __ARM_ARCH__>=7
+	ldr	ip,[$ctx,#36]		@ is_base2_26
+
+	adds	$g0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
+	mov	$g1,$h1,lsr#6
+	adcs	$g1,$g1,$h2,lsl#20
+	mov	$g2,$h2,lsr#12
+	adcs	$g2,$g2,$h3,lsl#14
+	mov	$g3,$h3,lsr#18
+	adcs	$g3,$g3,$h4,lsl#8
+	mov	$g4,#0
+	adc	$g4,$g4,$h4,lsr#24
+
+	tst	ip,ip
+	itttt	ne
+	movne	$h0,$g0
+	movne	$h1,$g1
+	movne	$h2,$g2
+	movne	$h3,$g3
+	it	ne
+	movne	$h4,$g4
+#endif
+
+	adds	$g0,$h0,#5		@ compare to modulus
+	adcs	$g1,$h1,#0
+	adcs	$g2,$h2,#0
+	adcs	$g3,$h3,#0
+	adc	$g4,$h4,#0
+	tst	$g4,#4			@ did it carry/borrow?
+
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	$h0,$g0
+	ldr	$g0,[$nonce,#0]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	$h1,$g1
+	ldr	$g1,[$nonce,#4]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	$h2,$g2
+	ldr	$g2,[$nonce,#8]
+#ifdef	__thumb2__
+	it	ne
+#endif
+	movne	$h3,$g3
+	ldr	$g3,[$nonce,#12]
+
+	adds	$h0,$h0,$g0
+	adcs	$h1,$h1,$g1
+	adcs	$h2,$h2,$g2
+	adc	$h3,$h3,$g3
+
+#if __ARM_ARCH__>=7
+# ifdef __ARMEB__
+	rev	$h0,$h0
+	rev	$h1,$h1
+	rev	$h2,$h2
+	rev	$h3,$h3
+# endif
+	str	$h0,[$mac,#0]
+	str	$h1,[$mac,#4]
+	str	$h2,[$mac,#8]
+	str	$h3,[$mac,#12]
+#else
+	strb	$h0,[$mac,#0]
+	mov	$h0,$h0,lsr#8
+	strb	$h1,[$mac,#4]
+	mov	$h1,$h1,lsr#8
+	strb	$h2,[$mac,#8]
+	mov	$h2,$h2,lsr#8
+	strb	$h3,[$mac,#12]
+	mov	$h3,$h3,lsr#8
+
+	strb	$h0,[$mac,#1]
+	mov	$h0,$h0,lsr#8
+	strb	$h1,[$mac,#5]
+	mov	$h1,$h1,lsr#8
+	strb	$h2,[$mac,#9]
+	mov	$h2,$h2,lsr#8
+	strb	$h3,[$mac,#13]
+	mov	$h3,$h3,lsr#8
+
+	strb	$h0,[$mac,#2]
+	mov	$h0,$h0,lsr#8
+	strb	$h1,[$mac,#6]
+	mov	$h1,$h1,lsr#8
+	strb	$h2,[$mac,#10]
+	mov	$h2,$h2,lsr#8
+	strb	$h3,[$mac,#14]
+	mov	$h3,$h3,lsr#8
+
+	strb	$h0,[$mac,#3]
+	strb	$h1,[$mac,#7]
+	strb	$h2,[$mac,#11]
+	strb	$h3,[$mac,#15]
+#endif
+	ldmia	sp!,{r4-r11}
+#if	__ARM_ARCH__>=5
+	ret				@ bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	poly1305_emit,.-poly1305_emit
+___
+{
+my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
+my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
+my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
+
+my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
+
+$code.=<<___;
+#if	__ARM_MAX_ARCH__>=7
+.fpu	neon
+
+.type	poly1305_init_neon,%function
+.align	5
+poly1305_init_neon:
+.Lpoly1305_init_neon:
+	ldr	r3,[$ctx,#48]		@ first table element
+	cmp	r3,#-1			@ is value impossible?
+	bne	.Lno_init_neon
+
+	ldr	r4,[$ctx,#20]		@ load key base 2^32
+	ldr	r5,[$ctx,#24]
+	ldr	r6,[$ctx,#28]
+	ldr	r7,[$ctx,#32]
+
+	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
+	mov	r3,r4,lsr#26
+	mov	r4,r5,lsr#20
+	orr	r3,r3,r5,lsl#6
+	mov	r5,r6,lsr#14
+	orr	r4,r4,r6,lsl#12
+	mov	r6,r7,lsr#8
+	orr	r5,r5,r7,lsl#18
+	and	r3,r3,#0x03ffffff
+	and	r4,r4,#0x03ffffff
+	and	r5,r5,#0x03ffffff
+
+	vdup.32	$R0,r2			@ r^1 in both lanes
+	add	r2,r3,r3,lsl#2		@ *5
+	vdup.32	$R1,r3
+	add	r3,r4,r4,lsl#2
+	vdup.32	$S1,r2
+	vdup.32	$R2,r4
+	add	r4,r5,r5,lsl#2
+	vdup.32	$S2,r3
+	vdup.32	$R3,r5
+	add	r5,r6,r6,lsl#2
+	vdup.32	$S3,r4
+	vdup.32	$R4,r6
+	vdup.32	$S4,r5
+
+	mov	$zeros,#2		@ counter
+
+.Lsquare_neon:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+
+	vmull.u32	$D0,$R0,${R0}[1]
+	vmull.u32	$D1,$R1,${R0}[1]
+	vmull.u32	$D2,$R2,${R0}[1]
+	vmull.u32	$D3,$R3,${R0}[1]
+	vmull.u32	$D4,$R4,${R0}[1]
+
+	vmlal.u32	$D0,$R4,${S1}[1]
+	vmlal.u32	$D1,$R0,${R1}[1]
+	vmlal.u32	$D2,$R1,${R1}[1]
+	vmlal.u32	$D3,$R2,${R1}[1]
+	vmlal.u32	$D4,$R3,${R1}[1]
+
+	vmlal.u32	$D0,$R3,${S2}[1]
+	vmlal.u32	$D1,$R4,${S2}[1]
+	vmlal.u32	$D3,$R1,${R2}[1]
+	vmlal.u32	$D2,$R0,${R2}[1]
+	vmlal.u32	$D4,$R2,${R2}[1]
+
+	vmlal.u32	$D0,$R2,${S3}[1]
+	vmlal.u32	$D3,$R0,${R3}[1]
+	vmlal.u32	$D1,$R3,${S3}[1]
+	vmlal.u32	$D2,$R4,${S3}[1]
+	vmlal.u32	$D4,$R1,${R3}[1]
+
+	vmlal.u32	$D3,$R4,${S4}[1]
+	vmlal.u32	$D0,$R1,${S4}[1]
+	vmlal.u32	$D1,$R2,${S4}[1]
+	vmlal.u32	$D2,$R3,${S4}[1]
+	vmlal.u32	$D4,$R0,${R4}[1]
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+	@ and P. Schwabe
+	@
+	@ H0>>+H1>>+H2>>+H3>>+H4
+	@ H3>>+H4>>*5+H0>>+H1
+	@
+	@ Trivia.
+	@
+	@ Result of multiplication of n-bit number by m-bit number is
+	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+	@ m-bit number multiplied by 2^n is still n+m bits wide.
+	@
+	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+	@ one is n+1 bits wide.
+	@
+	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+	@ can be 27. However! In cases when their width exceeds 26 bits
+	@ they are limited by 2^26+2^6. This in turn means that *sum*
+	@ of the products with these values can still be viewed as sum
+	@ of 52-bit numbers as long as the amount of addends is not a
+	@ power of 2. For example,
+	@
+	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+	@
+	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
+	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+	@ which is less than 32 * (2^52) or 2^57. And when processing
+	@ data we are looking at triple as many addends...
+	@
+	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
+	@ This means that result of reduction have to be compressed upon
+	@ loop wrap-around. This can be done in the process of reduction
+	@ to minimize amount of instructions [as well as amount of
+	@ 128-bit instructions, which benefits low-end processors], but
+	@ one has to watch for H2 (which is narrower than H0) and 5*H4
+	@ not being wider than 58 bits, so that result of right shift
+	@ by 26 bits fits in 32 bits. This is also useful on x86,
+	@ because it allows to use paddd in place for paddq, which
+	@ benefits Atom, where paddq is ridiculously slow.
+
+	vshr.u64	$T0,$D3,#26
+	vmovn.i64	$D3#lo,$D3
+	 vshr.u64	$T1,$D0,#26
+	 vmovn.i64	$D0#lo,$D0
+	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
+	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
+	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
+	 vbic.i32	$D0#lo,#0xfc000000
+
+	vshrn.u64	$T0#lo,$D4,#26
+	vmovn.i64	$D4#lo,$D4
+	 vshr.u64	$T1,$D1,#26
+	 vmovn.i64	$D1#lo,$D1
+	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
+	vbic.i32	$D4#lo,#0xfc000000
+	 vbic.i32	$D1#lo,#0xfc000000
+
+	vadd.i32	$D0#lo,$D0#lo,$T0#lo
+	vshl.u32	$T0#lo,$T0#lo,#2
+	 vshrn.u64	$T1#lo,$D2,#26
+	 vmovn.i64	$D2#lo,$D2
+	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
+	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
+	 vbic.i32	$D2#lo,#0xfc000000
+
+	vshr.u32	$T0#lo,$D0#lo,#26
+	vbic.i32	$D0#lo,#0xfc000000
+	 vshr.u32	$T1#lo,$D3#lo,#26
+	 vbic.i32	$D3#lo,#0xfc000000
+	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
+	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
+
+	subs		$zeros,$zeros,#1
+	beq		.Lsquare_break_neon
+
+	add		$tbl0,$ctx,#(48+0*9*4)
+	add		$tbl1,$ctx,#(48+1*9*4)
+
+	vtrn.32		$R0,$D0#lo		@ r^2:r^1
+	vtrn.32		$R2,$D2#lo
+	vtrn.32		$R3,$D3#lo
+	vtrn.32		$R1,$D1#lo
+	vtrn.32		$R4,$D4#lo
+
+	vshl.u32	$S2,$R2,#2		@ *5
+	vshl.u32	$S3,$R3,#2
+	vshl.u32	$S1,$R1,#2
+	vshl.u32	$S4,$R4,#2
+	vadd.i32	$S2,$S2,$R2
+	vadd.i32	$S1,$S1,$R1
+	vadd.i32	$S3,$S3,$R3
+	vadd.i32	$S4,$S4,$R4
+
+	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+	vst1.32		{${S4}[0]},[$tbl0,:32]
+	vst1.32		{${S4}[1]},[$tbl1,:32]
+
+	b		.Lsquare_neon
+
+.align	4
+.Lsquare_break_neon:
+	add		$tbl0,$ctx,#(48+2*4*9)
+	add		$tbl1,$ctx,#(48+3*4*9)
+
+	vmov		$R0,$D0#lo		@ r^4:r^3
+	vshl.u32	$S1,$D1#lo,#2		@ *5
+	vmov		$R1,$D1#lo
+	vshl.u32	$S2,$D2#lo,#2
+	vmov		$R2,$D2#lo
+	vshl.u32	$S3,$D3#lo,#2
+	vmov		$R3,$D3#lo
+	vshl.u32	$S4,$D4#lo,#2
+	vmov		$R4,$D4#lo
+	vadd.i32	$S1,$S1,$D1#lo
+	vadd.i32	$S2,$S2,$D2#lo
+	vadd.i32	$S3,$S3,$D3#lo
+	vadd.i32	$S4,$S4,$D4#lo
+
+	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
+	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
+	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+	vst1.32		{${S4}[0]},[$tbl0]
+	vst1.32		{${S4}[1]},[$tbl1]
+
+.Lno_init_neon:
+	ret				@ bx	lr
+.size	poly1305_init_neon,.-poly1305_init_neon
+
+.type	poly1305_blocks_neon,%function
+.align	5
+poly1305_blocks_neon:
+.Lpoly1305_blocks_neon:
+	ldr	ip,[$ctx,#36]		@ is_base2_26
+
+	cmp	$len,#64
+	blo	.Lpoly1305_blocks
+
+	stmdb	sp!,{r4-r7}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+
+	tst	ip,ip			@ is_base2_26?
+	bne	.Lbase2_26_neon
+
+	stmdb	sp!,{r1-r3,lr}
+	bl	.Lpoly1305_init_neon
+
+	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
+	ldr	r5,[$ctx,#4]
+	ldr	r6,[$ctx,#8]
+	ldr	r7,[$ctx,#12]
+	ldr	ip,[$ctx,#16]
+
+	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
+	mov	r3,r4,lsr#26
+	 veor	$D0#lo,$D0#lo,$D0#lo
+	mov	r4,r5,lsr#20
+	orr	r3,r3,r5,lsl#6
+	 veor	$D1#lo,$D1#lo,$D1#lo
+	mov	r5,r6,lsr#14
+	orr	r4,r4,r6,lsl#12
+	 veor	$D2#lo,$D2#lo,$D2#lo
+	mov	r6,r7,lsr#8
+	orr	r5,r5,r7,lsl#18
+	 veor	$D3#lo,$D3#lo,$D3#lo
+	and	r3,r3,#0x03ffffff
+	orr	r6,r6,ip,lsl#24
+	 veor	$D4#lo,$D4#lo,$D4#lo
+	and	r4,r4,#0x03ffffff
+	mov	r1,#1
+	and	r5,r5,#0x03ffffff
+	str	r1,[$ctx,#36]		@ set is_base2_26
+
+	vmov.32	$D0#lo[0],r2
+	vmov.32	$D1#lo[0],r3
+	vmov.32	$D2#lo[0],r4
+	vmov.32	$D3#lo[0],r5
+	vmov.32	$D4#lo[0],r6
+	adr	$zeros,.Lzeros
+
+	ldmia	sp!,{r1-r3,lr}
+	b	.Lhash_loaded
+
+.align	4
+.Lbase2_26_neon:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ load hash value
+
+	veor		$D0#lo,$D0#lo,$D0#lo
+	veor		$D1#lo,$D1#lo,$D1#lo
+	veor		$D2#lo,$D2#lo,$D2#lo
+	veor		$D3#lo,$D3#lo,$D3#lo
+	veor		$D4#lo,$D4#lo,$D4#lo
+	vld4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+	adr		$zeros,.Lzeros
+	vld1.32		{$D4#lo[0]},[$ctx]
+	sub		$ctx,$ctx,#16		@ rewind
+
+.Lhash_loaded:
+	add		$in2,$inp,#32
+	mov		$padbit,$padbit,lsl#24
+	tst		$len,#31
+	beq		.Leven
+
+	vld4.32		{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
+	vmov.32		$H4#lo[0],$padbit
+	sub		$len,$len,#16
+	add		$in2,$inp,#32
+
+# ifdef	__ARMEB__
+	vrev32.8	$H0,$H0
+	vrev32.8	$H3,$H3
+	vrev32.8	$H1,$H1
+	vrev32.8	$H2,$H2
+# endif
+	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
+	vshl.u32	$H3#lo,$H3#lo,#18
+
+	vsri.u32	$H3#lo,$H2#lo,#14
+	vshl.u32	$H2#lo,$H2#lo,#12
+	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi
+
+	vbic.i32	$H3#lo,#0xfc000000
+	vsri.u32	$H2#lo,$H1#lo,#20
+	vshl.u32	$H1#lo,$H1#lo,#6
+
+	vbic.i32	$H2#lo,#0xfc000000
+	vsri.u32	$H1#lo,$H0#lo,#26
+	vadd.i32	$H3#hi,$H3#lo,$D3#lo
+
+	vbic.i32	$H0#lo,#0xfc000000
+	vbic.i32	$H1#lo,#0xfc000000
+	vadd.i32	$H2#hi,$H2#lo,$D2#lo
+
+	vadd.i32	$H0#hi,$H0#lo,$D0#lo
+	vadd.i32	$H1#hi,$H1#lo,$D1#lo
+
+	mov		$tbl1,$zeros
+	add		$tbl0,$ctx,#48
+
+	cmp		$len,$len
+	b		.Long_tail
+
+.align	4
+.Leven:
+	subs		$len,$len,#64
+	it		lo
+	movlo		$in2,$zeros
+
+	vmov.i32	$H4,#1<<24		@ padbit, yes, always
+	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
+	add		$inp,$inp,#64
+	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
+	add		$in2,$in2,#64
+	itt		hi
+	addhi		$tbl1,$ctx,#(48+1*9*4)
+	addhi		$tbl0,$ctx,#(48+3*9*4)
+
+# ifdef	__ARMEB__
+	vrev32.8	$H0,$H0
+	vrev32.8	$H3,$H3
+	vrev32.8	$H1,$H1
+	vrev32.8	$H2,$H2
+# endif
+	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
+	vshl.u32	$H3,$H3,#18
+
+	vsri.u32	$H3,$H2,#14
+	vshl.u32	$H2,$H2,#12
+
+	vbic.i32	$H3,#0xfc000000
+	vsri.u32	$H2,$H1,#20
+	vshl.u32	$H1,$H1,#6
+
+	vbic.i32	$H2,#0xfc000000
+	vsri.u32	$H1,$H0,#26
+
+	vbic.i32	$H0,#0xfc000000
+	vbic.i32	$H1,#0xfc000000
+
+	bls		.Lskip_loop
+
+	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
+	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
+	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+	b		.Loop_neon
+
+.align	5
+.Loop_neon:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+	@   \___________________/
+	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+	@   \___________________/ \____________________/
+	@
+	@ Note that we start with inp[2:3]*r^2. This is because it
+	@ doesn't depend on reduction in previous iteration.
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
+	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
+	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
+	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
+	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ inp[2:3]*r^2
+
+	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
+	vmull.u32	$D2,$H2#hi,${R0}[1]
+	vadd.i32	$H0#lo,$H0#lo,$D0#lo
+	vmull.u32	$D0,$H0#hi,${R0}[1]
+	vadd.i32	$H3#lo,$H3#lo,$D3#lo
+	vmull.u32	$D3,$H3#hi,${R0}[1]
+	vmlal.u32	$D2,$H1#hi,${R1}[1]
+	vadd.i32	$H1#lo,$H1#lo,$D1#lo
+	vmull.u32	$D1,$H1#hi,${R0}[1]
+
+	vadd.i32	$H4#lo,$H4#lo,$D4#lo
+	vmull.u32	$D4,$H4#hi,${R0}[1]
+	subs		$len,$len,#64
+	vmlal.u32	$D0,$H4#hi,${S1}[1]
+	it		lo
+	movlo		$in2,$zeros
+	vmlal.u32	$D3,$H2#hi,${R1}[1]
+	vld1.32		${S4}[1],[$tbl1,:32]
+	vmlal.u32	$D1,$H0#hi,${R1}[1]
+	vmlal.u32	$D4,$H3#hi,${R1}[1]
+
+	vmlal.u32	$D0,$H3#hi,${S2}[1]
+	vmlal.u32	$D3,$H1#hi,${R2}[1]
+	vmlal.u32	$D4,$H2#hi,${R2}[1]
+	vmlal.u32	$D1,$H4#hi,${S2}[1]
+	vmlal.u32	$D2,$H0#hi,${R2}[1]
+
+	vmlal.u32	$D3,$H0#hi,${R3}[1]
+	vmlal.u32	$D0,$H2#hi,${S3}[1]
+	vmlal.u32	$D4,$H1#hi,${R3}[1]
+	vmlal.u32	$D1,$H3#hi,${S3}[1]
+	vmlal.u32	$D2,$H4#hi,${S3}[1]
+
+	vmlal.u32	$D3,$H4#hi,${S4}[1]
+	vmlal.u32	$D0,$H1#hi,${S4}[1]
+	vmlal.u32	$D4,$H0#hi,${R4}[1]
+	vmlal.u32	$D1,$H2#hi,${S4}[1]
+	vmlal.u32	$D2,$H3#hi,${S4}[1]
+
+	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
+	add		$in2,$in2,#64
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ (hash+inp[0:1])*r^4 and accumulate
+
+	vmlal.u32	$D3,$H3#lo,${R0}[0]
+	vmlal.u32	$D0,$H0#lo,${R0}[0]
+	vmlal.u32	$D4,$H4#lo,${R0}[0]
+	vmlal.u32	$D1,$H1#lo,${R0}[0]
+	vmlal.u32	$D2,$H2#lo,${R0}[0]
+	vld1.32		${S4}[0],[$tbl0,:32]
+
+	vmlal.u32	$D3,$H2#lo,${R1}[0]
+	vmlal.u32	$D0,$H4#lo,${S1}[0]
+	vmlal.u32	$D4,$H3#lo,${R1}[0]
+	vmlal.u32	$D1,$H0#lo,${R1}[0]
+	vmlal.u32	$D2,$H1#lo,${R1}[0]
+
+	vmlal.u32	$D3,$H1#lo,${R2}[0]
+	vmlal.u32	$D0,$H3#lo,${S2}[0]
+	vmlal.u32	$D4,$H2#lo,${R2}[0]
+	vmlal.u32	$D1,$H4#lo,${S2}[0]
+	vmlal.u32	$D2,$H0#lo,${R2}[0]
+
+	vmlal.u32	$D3,$H0#lo,${R3}[0]
+	vmlal.u32	$D0,$H2#lo,${S3}[0]
+	vmlal.u32	$D4,$H1#lo,${R3}[0]
+	vmlal.u32	$D1,$H3#lo,${S3}[0]
+	vmlal.u32	$D3,$H4#lo,${S4}[0]
+
+	vmlal.u32	$D2,$H4#lo,${S3}[0]
+	vmlal.u32	$D0,$H1#lo,${S4}[0]
+	vmlal.u32	$D4,$H0#lo,${R4}[0]
+	vmov.i32	$H4,#1<<24		@ padbit, yes, always
+	vmlal.u32	$D1,$H2#lo,${S4}[0]
+	vmlal.u32	$D2,$H3#lo,${S4}[0]
+
+	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
+	add		$inp,$inp,#64
+# ifdef	__ARMEB__
+	vrev32.8	$H0,$H0
+	vrev32.8	$H1,$H1
+	vrev32.8	$H2,$H2
+	vrev32.8	$H3,$H3
+# endif
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
+	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
+
+	vshr.u64	$T0,$D3,#26
+	vmovn.i64	$D3#lo,$D3
+	 vshr.u64	$T1,$D0,#26
+	 vmovn.i64	$D0#lo,$D0
+	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
+	vbic.i32	$D3#lo,#0xfc000000
+	  vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
+	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
+	  vshl.u32	$H3,$H3,#18
+	 vbic.i32	$D0#lo,#0xfc000000
+
+	vshrn.u64	$T0#lo,$D4,#26
+	vmovn.i64	$D4#lo,$D4
+	 vshr.u64	$T1,$D1,#26
+	 vmovn.i64	$D1#lo,$D1
+	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
+	  vsri.u32	$H3,$H2,#14
+	vbic.i32	$D4#lo,#0xfc000000
+	  vshl.u32	$H2,$H2,#12
+	 vbic.i32	$D1#lo,#0xfc000000
+
+	vadd.i32	$D0#lo,$D0#lo,$T0#lo
+	vshl.u32	$T0#lo,$T0#lo,#2
+	  vbic.i32	$H3,#0xfc000000
+	 vshrn.u64	$T1#lo,$D2,#26
+	 vmovn.i64	$D2#lo,$D2
+	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
+	  vsri.u32	$H2,$H1,#20
+	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
+	  vshl.u32	$H1,$H1,#6
+	 vbic.i32	$D2#lo,#0xfc000000
+	  vbic.i32	$H2,#0xfc000000
+
+	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
+	vmovn.i64	$D0#lo,$D0
+	  vsri.u32	$H1,$H0,#26
+	  vbic.i32	$H0,#0xfc000000
+	 vshr.u32	$T1#lo,$D3#lo,#26
+	 vbic.i32	$D3#lo,#0xfc000000
+	vbic.i32	$D0#lo,#0xfc000000
+	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
+	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
+	  vbic.i32	$H1,#0xfc000000
+
+	bhi		.Loop_neon
+
+.Lskip_loop:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+	add		$tbl1,$ctx,#(48+0*9*4)
+	add		$tbl0,$ctx,#(48+1*9*4)
+	adds		$len,$len,#32
+	it		ne
+	movne		$len,#0
+	bne		.Long_tail
+
+	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
+	vadd.i32	$H0#hi,$H0#lo,$D0#lo
+	vadd.i32	$H3#hi,$H3#lo,$D3#lo
+	vadd.i32	$H1#hi,$H1#lo,$D1#lo
+	vadd.i32	$H4#hi,$H4#lo,$D4#lo
+
+.Long_tail:
+	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
+	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2
+
+	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
+	vmull.u32	$D2,$H2#hi,$R0
+	vadd.i32	$H0#lo,$H0#lo,$D0#lo
+	vmull.u32	$D0,$H0#hi,$R0
+	vadd.i32	$H3#lo,$H3#lo,$D3#lo
+	vmull.u32	$D3,$H3#hi,$R0
+	vadd.i32	$H1#lo,$H1#lo,$D1#lo
+	vmull.u32	$D1,$H1#hi,$R0
+	vadd.i32	$H4#lo,$H4#lo,$D4#lo
+	vmull.u32	$D4,$H4#hi,$R0
+
+	vmlal.u32	$D0,$H4#hi,$S1
+	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+	vmlal.u32	$D3,$H2#hi,$R1
+	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+	vmlal.u32	$D1,$H0#hi,$R1
+	vmlal.u32	$D4,$H3#hi,$R1
+	vmlal.u32	$D2,$H1#hi,$R1
+
+	vmlal.u32	$D3,$H1#hi,$R2
+	vld1.32		${S4}[1],[$tbl1,:32]
+	vmlal.u32	$D0,$H3#hi,$S2
+	vld1.32		${S4}[0],[$tbl0,:32]
+	vmlal.u32	$D4,$H2#hi,$R2
+	vmlal.u32	$D1,$H4#hi,$S2
+	vmlal.u32	$D2,$H0#hi,$R2
+
+	vmlal.u32	$D3,$H0#hi,$R3
+	 it		ne
+	 addne		$tbl1,$ctx,#(48+2*9*4)
+	vmlal.u32	$D0,$H2#hi,$S3
+	 it		ne
+	 addne		$tbl0,$ctx,#(48+3*9*4)
+	vmlal.u32	$D4,$H1#hi,$R3
+	vmlal.u32	$D1,$H3#hi,$S3
+	vmlal.u32	$D2,$H4#hi,$S3
+
+	vmlal.u32	$D3,$H4#hi,$S4
+	 vorn		$MASK,$MASK,$MASK	@ all-ones, can be redundant
+	vmlal.u32	$D0,$H1#hi,$S4
+	 vshr.u64	$MASK,$MASK,#38
+	vmlal.u32	$D4,$H0#hi,$R4
+	vmlal.u32	$D1,$H2#hi,$S4
+	vmlal.u32	$D2,$H3#hi,$S4
+
+	beq		.Lshort_tail
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ (hash+inp[0:1])*r^4:r^3 and accumulate
+
+	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
+	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
+
+	vmlal.u32	$D2,$H2#lo,$R0
+	vmlal.u32	$D0,$H0#lo,$R0
+	vmlal.u32	$D3,$H3#lo,$R0
+	vmlal.u32	$D1,$H1#lo,$R0
+	vmlal.u32	$D4,$H4#lo,$R0
+
+	vmlal.u32	$D0,$H4#lo,$S1
+	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
+	vmlal.u32	$D3,$H2#lo,$R1
+	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
+	vmlal.u32	$D1,$H0#lo,$R1
+	vmlal.u32	$D4,$H3#lo,$R1
+	vmlal.u32	$D2,$H1#lo,$R1
+
+	vmlal.u32	$D3,$H1#lo,$R2
+	vld1.32		${S4}[1],[$tbl1,:32]
+	vmlal.u32	$D0,$H3#lo,$S2
+	vld1.32		${S4}[0],[$tbl0,:32]
+	vmlal.u32	$D4,$H2#lo,$R2
+	vmlal.u32	$D1,$H4#lo,$S2
+	vmlal.u32	$D2,$H0#lo,$R2
+
+	vmlal.u32	$D3,$H0#lo,$R3
+	vmlal.u32	$D0,$H2#lo,$S3
+	vmlal.u32	$D4,$H1#lo,$R3
+	vmlal.u32	$D1,$H3#lo,$S3
+	vmlal.u32	$D2,$H4#lo,$S3
+
+	vmlal.u32	$D3,$H4#lo,$S4
+	 vorn		$MASK,$MASK,$MASK	@ all-ones
+	vmlal.u32	$D0,$H1#lo,$S4
+	 vshr.u64	$MASK,$MASK,#38
+	vmlal.u32	$D4,$H0#lo,$R4
+	vmlal.u32	$D1,$H2#lo,$S4
+	vmlal.u32	$D2,$H3#lo,$S4
+
+.Lshort_tail:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ horizontal addition
+
+	vadd.i64	$D3#lo,$D3#lo,$D3#hi
+	vadd.i64	$D0#lo,$D0#lo,$D0#hi
+	vadd.i64	$D4#lo,$D4#lo,$D4#hi
+	vadd.i64	$D1#lo,$D1#lo,$D1#hi
+	vadd.i64	$D2#lo,$D2#lo,$D2#hi
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ lazy reduction, but without narrowing
+
+	vshr.u64	$T0,$D3,#26
+	vand.i64	$D3,$D3,$MASK
+	 vshr.u64	$T1,$D0,#26
+	 vand.i64	$D0,$D0,$MASK
+	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
+	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
+
+	vshr.u64	$T0,$D4,#26
+	vand.i64	$D4,$D4,$MASK
+	 vshr.u64	$T1,$D1,#26
+	 vand.i64	$D1,$D1,$MASK
+	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
+
+	vadd.i64	$D0,$D0,$T0
+	vshl.u64	$T0,$T0,#2
+	 vshr.u64	$T1,$D2,#26
+	 vand.i64	$D2,$D2,$MASK
+	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
+	 vadd.i64	$D3,$D3,$T1		@ h2 -> h3
+
+	vshr.u64	$T0,$D0,#26
+	vand.i64	$D0,$D0,$MASK
+	 vshr.u64	$T1,$D3,#26
+	 vand.i64	$D3,$D3,$MASK
+	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
+	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4
+
+	cmp		$len,#0
+	bne		.Leven
+
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ store hash value
+
+	vst4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
+	vst1.32		{$D4#lo[0]},[$ctx]
+
+	vldmia	sp!,{d8-d15}			@ epilogue
+	ldmia	sp!,{r4-r7}
+	ret					@ bx	lr
+.size	poly1305_blocks_neon,.-poly1305_blocks_neon
+
+.align	5
+.Lzeros:
+.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+#ifndef	__KERNEL__
+.LOPENSSL_armcap:
+# ifdef	_WIN32
+.word	OPENSSL_armcap_P
+# else
+.word	OPENSSL_armcap_P-.Lpoly1305_init
+# endif
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
+#endif
+#endif
+___
+}	}
+$code.=<<___;
+.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
+.align	2
+___
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/\bret\b/bx	lr/go						or
+	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
+
+	print $_,"\n";
+}
+close STDOUT; # enforce flush
diff --git a/arch/arm/lib/crypto/poly1305-glue.c b/arch/arm/lib/crypto/poly1305-glue.c
new file mode 100644
index 000000000000..42d0ebde1ae1
--- /dev/null
+++ b/arch/arm/lib/crypto/poly1305-glue.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
+ *
+ * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/poly1305.h>
+#include <crypto/internal/simd.h>
+#include <linux/cpufeature.h>
+#include <linux/jump_label.h>
+#include <linux/module.h>
+#include <linux/unaligned.h>
+
+void poly1305_init_arm(void *state, const u8 *key);
+void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
+void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
+void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
+
+void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
+{
+}
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
+
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
+{
+	poly1305_init_arm(&dctx->h, key);
+	dctx->s[0] = get_unaligned_le32(key + 16);
+	dctx->s[1] = get_unaligned_le32(key + 20);
+	dctx->s[2] = get_unaligned_le32(key + 24);
+	dctx->s[3] = get_unaligned_le32(key + 28);
+	dctx->buflen = 0;
+}
+EXPORT_SYMBOL(poly1305_init_arch);
+
+void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
+			  unsigned int nbytes)
+{
+	bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+		       crypto_simd_usable();
+
+	if (unlikely(dctx->buflen)) {
+		u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
+
+		memcpy(dctx->buf + dctx->buflen, src, bytes);
+		src += bytes;
+		nbytes -= bytes;
+		dctx->buflen += bytes;
+
+		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
+			poly1305_blocks_arm(&dctx->h, dctx->buf,
+					    POLY1305_BLOCK_SIZE, 1);
+			dctx->buflen = 0;
+		}
+	}
+
+	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
+		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
+
+		if (static_branch_likely(&have_neon) && do_neon) {
+			do {
+				unsigned int todo = min_t(unsigned int, len, SZ_4K);
+
+				kernel_neon_begin();
+				poly1305_blocks_neon(&dctx->h, src, todo, 1);
+				kernel_neon_end();
+
+				len -= todo;
+				src += todo;
+			} while (len);
+		} else {
+			poly1305_blocks_arm(&dctx->h, src, len, 1);
+			src += len;
+		}
+		nbytes %= POLY1305_BLOCK_SIZE;
+	}
+
+	if (unlikely(nbytes)) {
+		dctx->buflen = nbytes;
+		memcpy(dctx->buf, src, nbytes);
+	}
+}
+EXPORT_SYMBOL(poly1305_update_arch);
+
+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
+{
+	if (unlikely(dctx->buflen)) {
+		dctx->buf[dctx->buflen++] = 1;
+		memset(dctx->buf + dctx->buflen, 0,
+		       POLY1305_BLOCK_SIZE - dctx->buflen);
+		poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+	}
+
+	poly1305_emit_arm(&dctx->h, dst, dctx->s);
+	*dctx = (struct poly1305_desc_ctx){};
+}
+EXPORT_SYMBOL(poly1305_final_arch);
+
+bool poly1305_is_arch_optimized(void)
+{
+	/* We always can use at least the ARM scalar implementation. */
+	return true;
+}
+EXPORT_SYMBOL(poly1305_is_arch_optimized);
+
+static int __init arm_poly1305_mod_init(void)
+{
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    (elf_hwcap & HWCAP_NEON))
+		static_branch_enable(&have_neon);
+	return 0;
+}
+arch_initcall(arm_poly1305_mod_init);
+
+static void __exit arm_poly1305_mod_exit(void)
+{
+}
+module_exit(arm_poly1305_mod_exit);
+
+MODULE_DESCRIPTION("Accelerated Poly1305 transform for ARM");
+MODULE_LICENSE("GPL v2");
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 2c6ab80e0cdc..59135009e4f0 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -155,4 +155,10 @@ config CRYPTO_LIB_SHA256
 config CRYPTO_LIB_SM3
 	tristate
 
+if !KMSAN # avoid false positives from assembly
+if ARM
+source "arch/arm/lib/crypto/Kconfig"
+endif
+endif
+
 endmenu