From: Eric Biggers
Date: Thu, 19 Jun 2025 19:19:02 +0000 (-0700)
Subject: lib/crypto: mips: Move arch/mips/lib/crypto/ into lib/crypto/
X-Git-Tag: block-6.17-20250808~34^2~47
X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=7e54e993ab8c98d912f54ad6f46bfcc9dcd65368;p=linux-block.git

lib/crypto: mips: Move arch/mips/lib/crypto/ into lib/crypto/

Move the contents of arch/mips/lib/crypto/ into lib/crypto/mips/.

The new code organization makes a lot more sense for how this code
actually works and is developed.  In particular, it makes it possible to
build each algorithm as a single module, with better inlining and dead
code elimination.  For a more detailed explanation, see the patchset
which did this for the CRC library code:
https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/.
Also see the patchset which did this for SHA-512:
https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/

This is just a preparatory commit, which does the move to get the files
into their new location but keeps them building the same way as before.
Later commits will make the actual improvements to the way the
arch-optimized code is integrated for each algorithm.

Add a gitignore entry for the removed directory arch/mips/lib/crypto/
so that people don't accidentally commit leftover generated files.

Acked-by: Ard Biesheuvel
Reviewed-by: Martin K. Petersen
Reviewed-by: Sohil Mehta
Link: https://lore.kernel.org/r/20250619191908.134235-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers
---

diff --git a/arch/mips/lib/.gitignore b/arch/mips/lib/.gitignore
new file mode 100644
index 000000000000..647d7a922e68
--- /dev/null
+++ b/arch/mips/lib/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile
index 9d75845ef78e..9c024e6d5e54 100644
--- a/arch/mips/lib/Makefile
+++ b/arch/mips/lib/Makefile
@@ -3,8 +3,6 @@
 # Makefile for MIPS-specific library files..
# -obj-y += crypto/ - lib-y += bitops.o csum_partial.o delay.o memcpy.o memset.o \ mips-atomic.o strncpy_user.o \ strnlen_user.o uncached.o diff --git a/arch/mips/lib/crypto/.gitignore b/arch/mips/lib/crypto/.gitignore deleted file mode 100644 index 0d47d4f21c6d..000000000000 --- a/arch/mips/lib/crypto/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -poly1305-core.S diff --git a/arch/mips/lib/crypto/Kconfig b/arch/mips/lib/crypto/Kconfig deleted file mode 100644 index 0670a170c1be..000000000000 --- a/arch/mips/lib/crypto/Kconfig +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_MIPS - tristate - depends on CPU_MIPS32_R2 - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_MIPS - tristate - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/arch/mips/lib/crypto/Makefile b/arch/mips/lib/crypto/Makefile deleted file mode 100644 index 804488c7aded..000000000000 --- a/arch/mips/lib/crypto/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o -chacha-mips-y := chacha-core.o chacha-glue.o -AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots - -obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o -poly1305-mips-y := poly1305-core.o poly1305-glue.o - -perlasm-flavour-$(CONFIG_32BIT) := o32 -perlasm-flavour-$(CONFIG_64BIT) := 64 - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) - -$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE - $(call if_changed,perlasm) - -targets += poly1305-core.S diff --git a/arch/mips/lib/crypto/chacha-core.S b/arch/mips/lib/crypto/chacha-core.S deleted file mode 100644 index 5755f69cfe00..000000000000 --- a/arch/mips/lib/crypto/chacha-core.S +++ /dev/null @@ -1,497 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2016-2018 René van Dorst . All Rights Reserved. - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - */ - -#define MASK_U32 0x3c -#define CHACHA20_BLOCK_SIZE 64 -#define STACK_SIZE 32 - -#define X0 $t0 -#define X1 $t1 -#define X2 $t2 -#define X3 $t3 -#define X4 $t4 -#define X5 $t5 -#define X6 $t6 -#define X7 $t7 -#define X8 $t8 -#define X9 $t9 -#define X10 $v1 -#define X11 $s6 -#define X12 $s5 -#define X13 $s4 -#define X14 $s3 -#define X15 $s2 -/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ -#define T0 $s1 -#define T1 $s0 -#define T(n) T ## n -#define X(n) X ## n - -/* Input arguments */ -#define STATE $a0 -#define OUT $a1 -#define IN $a2 -#define BYTES $a3 - -/* Output argument */ -/* NONCE[0] is kept in a register and not in memory. - * We don't want to touch original value in memory. - * Must be incremented every loop iteration. - */ -#define NONCE_0 $v0 - -/* SAVED_X and SAVED_CA are set in the jump table. - * Use regs which are overwritten on exit else we don't leak clear data. - * They are used to handling the last bytes which are not multiple of 4. 
- */ -#define SAVED_X X15 -#define SAVED_CA $s7 - -#define IS_UNALIGNED $s7 - -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define MSB 0 -#define LSB 3 -#define ROTx rotl -#define ROTR(n) rotr n, 24 -#define CPU_TO_LE32(n) \ - wsbh n; \ - rotr n, 16; -#else -#define MSB 3 -#define LSB 0 -#define ROTx rotr -#define CPU_TO_LE32(n) -#define ROTR(n) -#endif - -#define FOR_EACH_WORD(x) \ - x( 0); \ - x( 1); \ - x( 2); \ - x( 3); \ - x( 4); \ - x( 5); \ - x( 6); \ - x( 7); \ - x( 8); \ - x( 9); \ - x(10); \ - x(11); \ - x(12); \ - x(13); \ - x(14); \ - x(15); - -#define FOR_EACH_WORD_REV(x) \ - x(15); \ - x(14); \ - x(13); \ - x(12); \ - x(11); \ - x(10); \ - x( 9); \ - x( 8); \ - x( 7); \ - x( 6); \ - x( 5); \ - x( 4); \ - x( 3); \ - x( 2); \ - x( 1); \ - x( 0); - -#define PLUS_ONE_0 1 -#define PLUS_ONE_1 2 -#define PLUS_ONE_2 3 -#define PLUS_ONE_3 4 -#define PLUS_ONE_4 5 -#define PLUS_ONE_5 6 -#define PLUS_ONE_6 7 -#define PLUS_ONE_7 8 -#define PLUS_ONE_8 9 -#define PLUS_ONE_9 10 -#define PLUS_ONE_10 11 -#define PLUS_ONE_11 12 -#define PLUS_ONE_12 13 -#define PLUS_ONE_13 14 -#define PLUS_ONE_14 15 -#define PLUS_ONE_15 16 -#define PLUS_ONE(x) PLUS_ONE_ ## x -#define _CONCAT3(a,b,c) a ## b ## c -#define CONCAT3(a,b,c) _CONCAT3(a,b,c) - -#define STORE_UNALIGNED(x) \ -CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ - .if (x != 12); \ - lw T0, (x*4)(STATE); \ - .endif; \ - lwl T1, (x*4)+MSB ## (IN); \ - lwr T1, (x*4)+LSB ## (IN); \ - .if (x == 12); \ - addu X ## x, NONCE_0; \ - .else; \ - addu X ## x, T0; \ - .endif; \ - CPU_TO_LE32(X ## x); \ - xor X ## x, T1; \ - swl X ## x, (x*4)+MSB ## (OUT); \ - swr X ## x, (x*4)+LSB ## (OUT); - -#define STORE_ALIGNED(x) \ -CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ - .if (x != 12); \ - lw T0, (x*4)(STATE); \ - .endif; \ - lw T1, (x*4) ## (IN); \ - .if (x == 12); \ - addu X ## x, NONCE_0; \ - .else; \ - addu X ## x, T0; \ - .endif; \ - CPU_TO_LE32(X ## x); \ - xor X ## x, T1; \ - sw X ## x, (x*4) ## (OUT); - -/* Jump table macro. - * Used for setup and handling the last bytes, which are not multiple of 4. - * X15 is free to store Xn - * Every jumptable entry must be equal in size. - */ -#define JMPTBL_ALIGNED(x) \ -.Lchacha_mips_jmptbl_aligned_ ## x: ; \ - .set noreorder; \ - b .Lchacha_mips_xor_aligned_ ## x ## _b; \ - .if (x == 12); \ - addu SAVED_X, X ## x, NONCE_0; \ - .else; \ - addu SAVED_X, X ## x, SAVED_CA; \ - .endif; \ - .set reorder - -#define JMPTBL_UNALIGNED(x) \ -.Lchacha_mips_jmptbl_unaligned_ ## x: ; \ - .set noreorder; \ - b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ - .if (x == 12); \ - addu SAVED_X, X ## x, NONCE_0; \ - .else; \ - addu SAVED_X, X ## x, SAVED_CA; \ - .endif; \ - .set reorder - -#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ - addu X(A), X(K); \ - addu X(B), X(L); \ - addu X(C), X(M); \ - addu X(D), X(N); \ - xor X(V), X(A); \ - xor X(W), X(B); \ - xor X(Y), X(C); \ - xor X(Z), X(D); \ - rotl X(V), S; \ - rotl X(W), S; \ - rotl X(Y), S; \ - rotl X(Z), S; - -.text -.set reorder -.set noat -.globl chacha_crypt_arch -.ent chacha_crypt_arch -chacha_crypt_arch: - .frame $sp, STACK_SIZE, $ra - - /* Load number of rounds */ - lw $at, 16($sp) - - addiu $sp, -STACK_SIZE - - /* Return bytes = 0. */ - beqz BYTES, .Lchacha_mips_end - - lw NONCE_0, 48(STATE) - - /* Save s0-s7 */ - sw $s0, 0($sp) - sw $s1, 4($sp) - sw $s2, 8($sp) - sw $s3, 12($sp) - sw $s4, 16($sp) - sw $s5, 20($sp) - sw $s6, 24($sp) - sw $s7, 28($sp) - - /* Test IN or OUT is unaligned. 
- * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 - */ - or IS_UNALIGNED, IN, OUT - andi IS_UNALIGNED, 0x3 - - b .Lchacha_rounds_start - -.align 4 -.Loop_chacha_rounds: - addiu IN, CHACHA20_BLOCK_SIZE - addiu OUT, CHACHA20_BLOCK_SIZE - addiu NONCE_0, 1 - -.Lchacha_rounds_start: - lw X0, 0(STATE) - lw X1, 4(STATE) - lw X2, 8(STATE) - lw X3, 12(STATE) - - lw X4, 16(STATE) - lw X5, 20(STATE) - lw X6, 24(STATE) - lw X7, 28(STATE) - lw X8, 32(STATE) - lw X9, 36(STATE) - lw X10, 40(STATE) - lw X11, 44(STATE) - - move X12, NONCE_0 - lw X13, 52(STATE) - lw X14, 56(STATE) - lw X15, 60(STATE) - -.Loop_chacha_xor_rounds: - addiu $at, -2 - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); - bnez $at, .Loop_chacha_xor_rounds - - addiu BYTES, -(CHACHA20_BLOCK_SIZE) - - /* Is data src/dst unaligned? Jump */ - bnez IS_UNALIGNED, .Loop_chacha_unaligned - - /* Set number rounds here to fill delayslot. */ - lw $at, (STACK_SIZE+16)($sp) - - /* BYTES < 0, it has no full block. */ - bltz BYTES, .Lchacha_mips_no_full_block_aligned - - FOR_EACH_WORD_REV(STORE_ALIGNED) - - /* BYTES > 0? Loop again. */ - bgtz BYTES, .Loop_chacha_rounds - - /* Place this here to fill delay slot */ - addiu NONCE_0, 1 - - /* BYTES < 0? Handle last bytes */ - bltz BYTES, .Lchacha_mips_xor_bytes - -.Lchacha_mips_xor_done: - /* Restore used registers */ - lw $s0, 0($sp) - lw $s1, 4($sp) - lw $s2, 8($sp) - lw $s3, 12($sp) - lw $s4, 16($sp) - lw $s5, 20($sp) - lw $s6, 24($sp) - lw $s7, 28($sp) - - /* Write NONCE_0 back to right location in state */ - sw NONCE_0, 48(STATE) - -.Lchacha_mips_end: - addiu $sp, STACK_SIZE - jr $ra - -.Lchacha_mips_no_full_block_aligned: - /* Restore the offset on BYTES */ - addiu BYTES, CHACHA20_BLOCK_SIZE - - /* Get number of full WORDS */ - andi $at, BYTES, MASK_U32 - - /* Load upper half of jump table addr */ - lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) - - /* Calculate lower half jump table offset */ - ins T0, $at, 1, 6 - - /* Add offset to STATE */ - addu T1, STATE, $at - - /* Add lower half jump table addr */ - addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) - - /* Read value from STATE */ - lw SAVED_CA, 0(T1) - - /* Store remaining bytecounter as negative value */ - subu BYTES, $at, BYTES - - jr T0 - - /* Jump table */ - FOR_EACH_WORD(JMPTBL_ALIGNED) - - -.Loop_chacha_unaligned: - /* Set number rounds here to fill delayslot. */ - lw $at, (STACK_SIZE+16)($sp) - - /* BYTES > 0, it has no full block. */ - bltz BYTES, .Lchacha_mips_no_full_block_unaligned - - FOR_EACH_WORD_REV(STORE_UNALIGNED) - - /* BYTES > 0? Loop again. 
*/ - bgtz BYTES, .Loop_chacha_rounds - - /* Write NONCE_0 back to right location in state */ - sw NONCE_0, 48(STATE) - - .set noreorder - /* Fall through to byte handling */ - bgez BYTES, .Lchacha_mips_xor_done -.Lchacha_mips_xor_unaligned_0_b: -.Lchacha_mips_xor_aligned_0_b: - /* Place this here to fill delay slot */ - addiu NONCE_0, 1 - .set reorder - -.Lchacha_mips_xor_bytes: - addu IN, $at - addu OUT, $at - /* First byte */ - lbu T1, 0(IN) - addiu $at, BYTES, 1 - CPU_TO_LE32(SAVED_X) - ROTR(SAVED_X) - xor T1, SAVED_X - sb T1, 0(OUT) - beqz $at, .Lchacha_mips_xor_done - /* Second byte */ - lbu T1, 1(IN) - addiu $at, BYTES, 2 - ROTx SAVED_X, 8 - xor T1, SAVED_X - sb T1, 1(OUT) - beqz $at, .Lchacha_mips_xor_done - /* Third byte */ - lbu T1, 2(IN) - ROTx SAVED_X, 8 - xor T1, SAVED_X - sb T1, 2(OUT) - b .Lchacha_mips_xor_done - -.Lchacha_mips_no_full_block_unaligned: - /* Restore the offset on BYTES */ - addiu BYTES, CHACHA20_BLOCK_SIZE - - /* Get number of full WORDS */ - andi $at, BYTES, MASK_U32 - - /* Load upper half of jump table addr */ - lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) - - /* Calculate lower half jump table offset */ - ins T0, $at, 1, 6 - - /* Add offset to STATE */ - addu T1, STATE, $at - - /* Add lower half jump table addr */ - addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) - - /* Read value from STATE */ - lw SAVED_CA, 0(T1) - - /* Store remaining bytecounter as negative value */ - subu BYTES, $at, BYTES - - jr T0 - - /* Jump table */ - FOR_EACH_WORD(JMPTBL_UNALIGNED) -.end chacha_crypt_arch -.set at - -/* Input arguments - * STATE $a0 - * OUT $a1 - * NROUND $a2 - */ - -#undef X12 -#undef X13 -#undef X14 -#undef X15 - -#define X12 $a3 -#define X13 $at -#define X14 $v0 -#define X15 STATE - -.set noat -.globl hchacha_block_arch -.ent hchacha_block_arch -hchacha_block_arch: - .frame $sp, STACK_SIZE, $ra - - addiu $sp, -STACK_SIZE - - /* Save X11(s6) */ - sw X11, 0($sp) - - lw X0, 0(STATE) - lw X1, 4(STATE) - lw X2, 8(STATE) - lw X3, 12(STATE) - lw X4, 16(STATE) - lw X5, 20(STATE) - lw X6, 24(STATE) - lw X7, 28(STATE) - lw X8, 32(STATE) - lw X9, 36(STATE) - lw X10, 40(STATE) - lw X11, 44(STATE) - lw X12, 48(STATE) - lw X13, 52(STATE) - lw X14, 56(STATE) - lw X15, 60(STATE) - -.Loop_hchacha_xor_rounds: - addiu $a2, -2 - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); - bnez $a2, .Loop_hchacha_xor_rounds - - /* Restore used register */ - lw X11, 0($sp) - - sw X0, 0(OUT) - sw X1, 4(OUT) - sw X2, 8(OUT) - sw X3, 12(OUT) - sw X12, 16(OUT) - sw X13, 20(OUT) - sw X14, 24(OUT) - sw X15, 28(OUT) - - addiu $sp, STACK_SIZE - jr $ra -.end hchacha_block_arch -.set at diff --git a/arch/mips/lib/crypto/chacha-glue.c b/arch/mips/lib/crypto/chacha-glue.c deleted file mode 100644 index 88c097594eb0..000000000000 --- a/arch/mips/lib/crypto/chacha-glue.c +++ /dev/null @@ -1,29 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ChaCha and HChaCha functions (MIPS optimized) - * - * Copyright (C) 2019 Linaro, Ltd. 
- */ - -#include -#include -#include - -asmlinkage void chacha_crypt_arch(struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); -EXPORT_SYMBOL(chacha_crypt_arch); - -asmlinkage void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); -EXPORT_SYMBOL(hchacha_block_arch); - -bool chacha_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/arch/mips/lib/crypto/poly1305-glue.c b/arch/mips/lib/crypto/poly1305-glue.c deleted file mode 100644 index 764a38a65200..000000000000 --- a/arch/mips/lib/crypto/poly1305-glue.c +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS - * - * Copyright (C) 2019 Linaro Ltd. - */ - -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -bool poly1305_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/mips/lib/crypto/poly1305-mips.pl b/arch/mips/lib/crypto/poly1305-mips.pl deleted file mode 100644 index 399f10c3e385..000000000000 --- a/arch/mips/lib/crypto/poly1305-mips.pl +++ /dev/null @@ -1,1273 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL -# project. -# ==================================================================== - -# Poly1305 hash for MIPS. -# -# May 2016 -# -# Numbers are cycles per processed byte with poly1305_blocks alone. -# -# IALU/gcc -# R1x000 ~5.5/+130% (big-endian) -# Octeon II 2.50/+70% (little-endian) -# -# March 2019 -# -# Add 32-bit code path. -# -# October 2019 -# -# Modulo-scheduling reduction allows to omit dependency chain at the -# end of inner loop and improve performance. Also optimize MIPS32R2 -# code path for MIPS 1004K core. Per René von Dorst's suggestions. -# -# IALU/gcc -# R1x000 ~9.8/? (big-endian) -# Octeon II 3.65/+140% (little-endian) -# MT7621/1004K 4.75/? (little-endian) -# -###################################################################### -# There is a number of MIPS ABI in use, O32 and N32/64 are most -# widely used. Then there is a new contender: NUBI. It appears that if -# one picks the latter, it's possible to arrange code in ABI neutral -# manner. Therefore let's stick to NUBI register layout: -# -($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); -($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); -($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); -($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); -# -# The return value is placed in $a0. 
Following coding rules facilitate -# interoperability: -# -# - never ever touch $tp, "thread pointer", former $gp [o32 can be -# excluded from the rule, because it's specified volatile]; -# - copy return value to $t0, former $v0 [or to $a0 if you're adapting -# old code]; -# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; -# -# For reference here is register layout for N32/64 MIPS ABIs: -# -# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); -# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); -# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); -# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); -# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); -# -# -# -###################################################################### - -$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 - -$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; - -if ($flavour =~ /64|n32/i) {{{ -###################################################################### -# 64-bit code path -# - -my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); -my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); - -$code.=<<___; -#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ - defined(_MIPS_ARCH_MIPS64R6)) \\ - && !defined(_MIPS_ARCH_MIPS64R2) -# define _MIPS_ARCH_MIPS64R2 -#endif - -#if defined(_MIPS_ARCH_MIPS64R6) -# define dmultu(rs,rt) -# define mflo(rd,rs,rt) dmulu rd,rs,rt -# define mfhi(rd,rs,rt) dmuhu rd,rs,rt -#else -# define dmultu(rs,rt) dmultu rs,rt -# define mflo(rd,rs,rt) mflo rd -# define mfhi(rd,rs,rt) mfhi rd -#endif - -#ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch -#endif - -#if defined(__MIPSEB__) && !defined(MIPSEB) -# define MIPSEB -#endif - -#ifdef MIPSEB -# define MSB 0 -# define LSB 7 -#else -# define MSB 7 -# define LSB 0 -#endif - -.text -.set noat -.set noreorder - -.align 5 -.globl poly1305_init -.ent poly1305_init -poly1305_init: - .frame $sp,0,$ra - .set reorder - - sd $zero,0($ctx) - sd $zero,8($ctx) - sd $zero,16($ctx) - - beqz $inp,.Lno_key - -#if defined(_MIPS_ARCH_MIPS64R6) - andi $tmp0,$inp,7 # $inp % 8 - dsubu $inp,$inp,$tmp0 # align $inp - sll $tmp0,$tmp0,3 # byte to bit offset - ld $in0,0($inp) - ld $in1,8($inp) - beqz $tmp0,.Laligned_key - ld $tmp2,16($inp) - - subu $tmp1,$zero,$tmp0 -# ifdef MIPSEB - dsllv $in0,$in0,$tmp0 - dsrlv $tmp3,$in1,$tmp1 - dsllv $in1,$in1,$tmp0 - dsrlv $tmp2,$tmp2,$tmp1 -# else - dsrlv $in0,$in0,$tmp0 - dsllv $tmp3,$in1,$tmp1 - dsrlv $in1,$in1,$tmp0 - dsllv $tmp2,$tmp2,$tmp1 -# endif - or $in0,$in0,$tmp3 - or $in1,$in1,$tmp2 -.Laligned_key: -#else - ldl $in0,0+MSB($inp) - ldl $in1,8+MSB($inp) - ldr $in0,0+LSB($inp) - ldr $in1,8+LSB($inp) -#endif -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS64R2) - dsbh $in0,$in0 # byte swap - dsbh $in1,$in1 - dshd $in0,$in0 - dshd $in1,$in1 -# else - ori $tmp0,$zero,0xFF - dsll $tmp2,$tmp0,32 - or $tmp0,$tmp2 # 0x000000FF000000FF - - and $tmp1,$in0,$tmp0 # byte swap - and $tmp3,$in1,$tmp0 - dsrl $tmp2,$in0,24 - dsrl $tmp4,$in1,24 - dsll $tmp1,24 - dsll $tmp3,24 - and $tmp2,$tmp0 - and $tmp4,$tmp0 - dsll $tmp0,8 # 0x0000FF000000FF00 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - and $tmp2,$in0,$tmp0 - and $tmp4,$in1,$tmp0 - dsrl $in0,8 - dsrl $in1,8 - dsll $tmp2,8 - dsll $tmp4,8 - and $in0,$tmp0 - and $in1,$tmp0 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - or $in0,$tmp1 - or $in1,$tmp3 - dsrl $tmp1,$in0,32 - dsrl $tmp3,$in1,32 - dsll $in0,32 - dsll $in1,32 - or $in0,$tmp1 - 
or $in1,$tmp3 -# endif -#endif - li $tmp0,1 - dsll $tmp0,32 # 0x0000000100000000 - daddiu $tmp0,-63 # 0x00000000ffffffc1 - dsll $tmp0,28 # 0x0ffffffc10000000 - daddiu $tmp0,-1 # 0x0ffffffc0fffffff - - and $in0,$tmp0 - daddiu $tmp0,-3 # 0x0ffffffc0ffffffc - and $in1,$tmp0 - - sd $in0,24($ctx) - dsrl $tmp0,$in1,2 - sd $in1,32($ctx) - daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) - sd $tmp0,40($ctx) - -.Lno_key: - li $v0,0 # return 0 - jr $ra -.end poly1305_init -___ -{ -my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; - -my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = - ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); -my ($shr,$shl) = ($s6,$s7); # used on R6 - -$code.=<<___; -.align 5 -.globl poly1305_blocks -.ent poly1305_blocks -poly1305_blocks: - .set noreorder - dsrl $len,4 # number of complete blocks - bnez $len,poly1305_blocks_internal - nop - jr $ra - nop -.end poly1305_blocks - -.align 5 -.ent poly1305_blocks_internal -poly1305_blocks_internal: - .set noreorder -#if defined(_MIPS_ARCH_MIPS64R6) - .frame $sp,8*8,$ra - .mask $SAVED_REGS_MASK|0x000c0000,-8 - dsubu $sp,8*8 - sd $s7,56($sp) - sd $s6,48($sp) -#else - .frame $sp,6*8,$ra - .mask $SAVED_REGS_MASK,-8 - dsubu $sp,6*8 -#endif - sd $s5,40($sp) - sd $s4,32($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue - sd $s3,24($sp) - sd $s2,16($sp) - sd $s1,8($sp) - sd $s0,0($sp) -___ -$code.=<<___; - .set reorder - -#if defined(_MIPS_ARCH_MIPS64R6) - andi $shr,$inp,7 - dsubu $inp,$inp,$shr # align $inp - sll $shr,$shr,3 # byte to bit offset - subu $shl,$zero,$shr -#endif - - ld $h0,0($ctx) # load hash value - ld $h1,8($ctx) - ld $h2,16($ctx) - - ld $r0,24($ctx) # load key - ld $r1,32($ctx) - ld $rs1,40($ctx) - - dsll $len,4 - daddu $len,$inp # end of buffer - b .Loop - -.align 4 -.Loop: -#if defined(_MIPS_ARCH_MIPS64R6) - ld $in0,0($inp) # load input - ld $in1,8($inp) - beqz $shr,.Laligned_inp - - ld $tmp2,16($inp) -# ifdef MIPSEB - dsllv $in0,$in0,$shr - dsrlv $tmp3,$in1,$shl - dsllv $in1,$in1,$shr - dsrlv $tmp2,$tmp2,$shl -# else - dsrlv $in0,$in0,$shr - dsllv $tmp3,$in1,$shl - dsrlv $in1,$in1,$shr - dsllv $tmp2,$tmp2,$shl -# endif - or $in0,$in0,$tmp3 - or $in1,$in1,$tmp2 -.Laligned_inp: -#else - ldl $in0,0+MSB($inp) # load input - ldl $in1,8+MSB($inp) - ldr $in0,0+LSB($inp) - ldr $in1,8+LSB($inp) -#endif - daddiu $inp,16 -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS64R2) - dsbh $in0,$in0 # byte swap - dsbh $in1,$in1 - dshd $in0,$in0 - dshd $in1,$in1 -# else - ori $tmp0,$zero,0xFF - dsll $tmp2,$tmp0,32 - or $tmp0,$tmp2 # 0x000000FF000000FF - - and $tmp1,$in0,$tmp0 # byte swap - and $tmp3,$in1,$tmp0 - dsrl $tmp2,$in0,24 - dsrl $tmp4,$in1,24 - dsll $tmp1,24 - dsll $tmp3,24 - and $tmp2,$tmp0 - and $tmp4,$tmp0 - dsll $tmp0,8 # 0x0000FF000000FF00 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - and $tmp2,$in0,$tmp0 - and $tmp4,$in1,$tmp0 - dsrl $in0,8 - dsrl $in1,8 - dsll $tmp2,8 - dsll $tmp4,8 - and $in0,$tmp0 - and $in1,$tmp0 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - or $in0,$tmp1 - or $in1,$tmp3 - dsrl $tmp1,$in0,32 - dsrl $tmp3,$in1,32 - dsll $in0,32 - dsll $in1,32 - or $in0,$tmp1 - or $in1,$tmp3 -# endif -#endif - dsrl $tmp1,$h2,2 # modulo-scheduled reduction - andi $h2,$h2,3 - dsll $tmp0,$tmp1,2 - - daddu $d0,$h0,$in0 # accumulate input - daddu $tmp1,$tmp0 - sltu $tmp0,$d0,$h0 - daddu $d0,$d0,$tmp1 # ... 
and residue - sltu $tmp1,$d0,$tmp1 - daddu $d1,$h1,$in1 - daddu $tmp0,$tmp1 - sltu $tmp1,$d1,$h1 - daddu $d1,$tmp0 - - dmultu ($r0,$d0) # h0*r0 - daddu $d2,$h2,$padbit - sltu $tmp0,$d1,$tmp0 - mflo ($h0,$r0,$d0) - mfhi ($h1,$r0,$d0) - - dmultu ($rs1,$d1) # h1*5*r1 - daddu $d2,$tmp1 - daddu $d2,$tmp0 - mflo ($tmp0,$rs1,$d1) - mfhi ($tmp1,$rs1,$d1) - - dmultu ($r1,$d0) # h0*r1 - mflo ($tmp2,$r1,$d0) - mfhi ($h2,$r1,$d0) - daddu $h0,$tmp0 - daddu $h1,$tmp1 - sltu $tmp0,$h0,$tmp0 - - dmultu ($r0,$d1) # h1*r0 - daddu $h1,$tmp0 - daddu $h1,$tmp2 - mflo ($tmp0,$r0,$d1) - mfhi ($tmp1,$r0,$d1) - - dmultu ($rs1,$d2) # h2*5*r1 - sltu $tmp2,$h1,$tmp2 - daddu $h2,$tmp2 - mflo ($tmp2,$rs1,$d2) - - dmultu ($r0,$d2) # h2*r0 - daddu $h1,$tmp0 - daddu $h2,$tmp1 - mflo ($tmp3,$r0,$d2) - sltu $tmp0,$h1,$tmp0 - daddu $h2,$tmp0 - - daddu $h1,$tmp2 - sltu $tmp2,$h1,$tmp2 - daddu $h2,$tmp2 - daddu $h2,$tmp3 - - bne $inp,$len,.Loop - - sd $h0,0($ctx) # store hash value - sd $h1,8($ctx) - sd $h2,16($ctx) - - .set noreorder -#if defined(_MIPS_ARCH_MIPS64R6) - ld $s7,56($sp) - ld $s6,48($sp) -#endif - ld $s5,40($sp) # epilogue - ld $s4,32($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue - ld $s3,24($sp) - ld $s2,16($sp) - ld $s1,8($sp) - ld $s0,0($sp) -___ -$code.=<<___; - jr $ra -#if defined(_MIPS_ARCH_MIPS64R6) - daddu $sp,8*8 -#else - daddu $sp,6*8 -#endif -.end poly1305_blocks_internal -___ -} -{ -my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); - -$code.=<<___; -.align 5 -.globl poly1305_emit -.ent poly1305_emit -poly1305_emit: - .frame $sp,0,$ra - .set reorder - - ld $tmp2,16($ctx) - ld $tmp0,0($ctx) - ld $tmp1,8($ctx) - - li $in0,-4 # final reduction - dsrl $in1,$tmp2,2 - and $in0,$tmp2 - andi $tmp2,$tmp2,3 - daddu $in0,$in1 - - daddu $tmp0,$tmp0,$in0 - sltu $in1,$tmp0,$in0 - daddiu $in0,$tmp0,5 # compare to modulus - daddu $tmp1,$tmp1,$in1 - sltiu $tmp3,$in0,5 - sltu $tmp4,$tmp1,$in1 - daddu $in1,$tmp1,$tmp3 - daddu $tmp2,$tmp2,$tmp4 - sltu $tmp3,$in1,$tmp3 - daddu $tmp2,$tmp2,$tmp3 - - dsrl $tmp2,2 # see if it carried/borrowed - dsubu $tmp2,$zero,$tmp2 - - xor $in0,$tmp0 - xor $in1,$tmp1 - and $in0,$tmp2 - and $in1,$tmp2 - xor $in0,$tmp0 - xor $in1,$tmp1 - - lwu $tmp0,0($nonce) # load nonce - lwu $tmp1,4($nonce) - lwu $tmp2,8($nonce) - lwu $tmp3,12($nonce) - dsll $tmp1,32 - dsll $tmp3,32 - or $tmp0,$tmp1 - or $tmp2,$tmp3 - - daddu $in0,$tmp0 # accumulate nonce - daddu $in1,$tmp2 - sltu $tmp0,$in0,$tmp0 - daddu $in1,$tmp0 - - dsrl $tmp0,$in0,8 # write mac value - dsrl $tmp1,$in0,16 - dsrl $tmp2,$in0,24 - sb $in0,0($mac) - dsrl $tmp3,$in0,32 - sb $tmp0,1($mac) - dsrl $tmp0,$in0,40 - sb $tmp1,2($mac) - dsrl $tmp1,$in0,48 - sb $tmp2,3($mac) - dsrl $tmp2,$in0,56 - sb $tmp3,4($mac) - dsrl $tmp3,$in1,8 - sb $tmp0,5($mac) - dsrl $tmp0,$in1,16 - sb $tmp1,6($mac) - dsrl $tmp1,$in1,24 - sb $tmp2,7($mac) - - sb $in1,8($mac) - dsrl $tmp2,$in1,32 - sb $tmp3,9($mac) - dsrl $tmp3,$in1,40 - sb $tmp0,10($mac) - dsrl $tmp0,$in1,48 - sb $tmp1,11($mac) - dsrl $tmp1,$in1,56 - sb $tmp2,12($mac) - sb $tmp3,13($mac) - sb $tmp0,14($mac) - sb $tmp1,15($mac) - - jr $ra -.end poly1305_emit -.rdata -.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" -.align 2 -___ -} -}}} else {{{ -###################################################################### -# 32-bit code path -# - -my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); -my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = - ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); - -$code.=<<___; -#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ - 
defined(_MIPS_ARCH_MIPS32R6)) \\ - && !defined(_MIPS_ARCH_MIPS32R2) -# define _MIPS_ARCH_MIPS32R2 -#endif - -#if defined(_MIPS_ARCH_MIPS32R6) -# define multu(rs,rt) -# define mflo(rd,rs,rt) mulu rd,rs,rt -# define mfhi(rd,rs,rt) muhu rd,rs,rt -#else -# define multu(rs,rt) multu rs,rt -# define mflo(rd,rs,rt) mflo rd -# define mfhi(rd,rs,rt) mfhi rd -#endif - -#ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch -#endif - -#if defined(__MIPSEB__) && !defined(MIPSEB) -# define MIPSEB -#endif - -#ifdef MIPSEB -# define MSB 0 -# define LSB 3 -#else -# define MSB 3 -# define LSB 0 -#endif - -.text -.set noat -.set noreorder - -.align 5 -.globl poly1305_init -.ent poly1305_init -poly1305_init: - .frame $sp,0,$ra - .set reorder - - sw $zero,0($ctx) - sw $zero,4($ctx) - sw $zero,8($ctx) - sw $zero,12($ctx) - sw $zero,16($ctx) - - beqz $inp,.Lno_key - -#if defined(_MIPS_ARCH_MIPS32R6) - andi $tmp0,$inp,3 # $inp % 4 - subu $inp,$inp,$tmp0 # align $inp - sll $tmp0,$tmp0,3 # byte to bit offset - lw $in0,0($inp) - lw $in1,4($inp) - lw $in2,8($inp) - lw $in3,12($inp) - beqz $tmp0,.Laligned_key - - lw $tmp2,16($inp) - subu $tmp1,$zero,$tmp0 -# ifdef MIPSEB - sllv $in0,$in0,$tmp0 - srlv $tmp3,$in1,$tmp1 - sllv $in1,$in1,$tmp0 - or $in0,$in0,$tmp3 - srlv $tmp3,$in2,$tmp1 - sllv $in2,$in2,$tmp0 - or $in1,$in1,$tmp3 - srlv $tmp3,$in3,$tmp1 - sllv $in3,$in3,$tmp0 - or $in2,$in2,$tmp3 - srlv $tmp2,$tmp2,$tmp1 - or $in3,$in3,$tmp2 -# else - srlv $in0,$in0,$tmp0 - sllv $tmp3,$in1,$tmp1 - srlv $in1,$in1,$tmp0 - or $in0,$in0,$tmp3 - sllv $tmp3,$in2,$tmp1 - srlv $in2,$in2,$tmp0 - or $in1,$in1,$tmp3 - sllv $tmp3,$in3,$tmp1 - srlv $in3,$in3,$tmp0 - or $in2,$in2,$tmp3 - sllv $tmp2,$tmp2,$tmp1 - or $in3,$in3,$tmp2 -# endif -.Laligned_key: -#else - lwl $in0,0+MSB($inp) - lwl $in1,4+MSB($inp) - lwl $in2,8+MSB($inp) - lwl $in3,12+MSB($inp) - lwr $in0,0+LSB($inp) - lwr $in1,4+LSB($inp) - lwr $in2,8+LSB($inp) - lwr $in3,12+LSB($inp) -#endif -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS32R2) - wsbh $in0,$in0 # byte swap - wsbh $in1,$in1 - wsbh $in2,$in2 - wsbh $in3,$in3 - rotr $in0,$in0,16 - rotr $in1,$in1,16 - rotr $in2,$in2,16 - rotr $in3,$in3,16 -# else - srl $tmp0,$in0,24 # byte swap - srl $tmp1,$in0,8 - andi $tmp2,$in0,0xFF00 - sll $in0,$in0,24 - andi $tmp1,0xFF00 - sll $tmp2,$tmp2,8 - or $in0,$tmp0 - srl $tmp0,$in1,24 - or $tmp1,$tmp2 - srl $tmp2,$in1,8 - or $in0,$tmp1 - andi $tmp1,$in1,0xFF00 - sll $in1,$in1,24 - andi $tmp2,0xFF00 - sll $tmp1,$tmp1,8 - or $in1,$tmp0 - srl $tmp0,$in2,24 - or $tmp2,$tmp1 - srl $tmp1,$in2,8 - or $in1,$tmp2 - andi $tmp2,$in2,0xFF00 - sll $in2,$in2,24 - andi $tmp1,0xFF00 - sll $tmp2,$tmp2,8 - or $in2,$tmp0 - srl $tmp0,$in3,24 - or $tmp1,$tmp2 - srl $tmp2,$in3,8 - or $in2,$tmp1 - andi $tmp1,$in3,0xFF00 - sll $in3,$in3,24 - andi $tmp2,0xFF00 - sll $tmp1,$tmp1,8 - or $in3,$tmp0 - or $tmp2,$tmp1 - or $in3,$tmp2 -# endif -#endif - lui $tmp0,0x0fff - ori $tmp0,0xffff # 0x0fffffff - and $in0,$in0,$tmp0 - subu $tmp0,3 # 0x0ffffffc - and $in1,$in1,$tmp0 - and $in2,$in2,$tmp0 - and $in3,$in3,$tmp0 - - sw $in0,20($ctx) - sw $in1,24($ctx) - sw $in2,28($ctx) - sw $in3,32($ctx) - - srl $tmp1,$in1,2 - srl $tmp2,$in2,2 - srl $tmp3,$in3,2 - addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) - addu $in2,$in2,$tmp2 - addu $in3,$in3,$tmp3 - sw $in1,36($ctx) - sw $in2,40($ctx) - sw $in3,44($ctx) -.Lno_key: - li $v0,0 - jr $ra -.end poly1305_init -___ -{ -my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0x00fff000" : "0x00ff0000"; - -my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = - ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); -my ($d0,$d1,$d2,$d3) = - ($a4,$a5,$a6,$a7); -my $shr = $t2; # used on R6 -my $one = $t2; # used on R2 - -$code.=<<___; -.globl poly1305_blocks -.align 5 -.ent poly1305_blocks -poly1305_blocks: - .frame $sp,16*4,$ra - .mask $SAVED_REGS_MASK,-4 - .set noreorder - subu $sp, $sp,4*12 - sw $s11,4*11($sp) - sw $s10,4*10($sp) - sw $s9, 4*9($sp) - sw $s8, 4*8($sp) - sw $s7, 4*7($sp) - sw $s6, 4*6($sp) - sw $s5, 4*5($sp) - sw $s4, 4*4($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue - sw $s3, 4*3($sp) - sw $s2, 4*2($sp) - sw $s1, 4*1($sp) - sw $s0, 4*0($sp) -___ -$code.=<<___; - .set reorder - - srl $len,4 # number of complete blocks - li $one,1 - beqz $len,.Labort - -#if defined(_MIPS_ARCH_MIPS32R6) - andi $shr,$inp,3 - subu $inp,$inp,$shr # align $inp - sll $shr,$shr,3 # byte to bit offset -#endif - - lw $h0,0($ctx) # load hash value - lw $h1,4($ctx) - lw $h2,8($ctx) - lw $h3,12($ctx) - lw $h4,16($ctx) - - lw $r0,20($ctx) # load key - lw $r1,24($ctx) - lw $r2,28($ctx) - lw $r3,32($ctx) - lw $rs1,36($ctx) - lw $rs2,40($ctx) - lw $rs3,44($ctx) - - sll $len,4 - addu $len,$len,$inp # end of buffer - b .Loop - -.align 4 -.Loop: -#if defined(_MIPS_ARCH_MIPS32R6) - lw $d0,0($inp) # load input - lw $d1,4($inp) - lw $d2,8($inp) - lw $d3,12($inp) - beqz $shr,.Laligned_inp - - lw $t0,16($inp) - subu $t1,$zero,$shr -# ifdef MIPSEB - sllv $d0,$d0,$shr - srlv $at,$d1,$t1 - sllv $d1,$d1,$shr - or $d0,$d0,$at - srlv $at,$d2,$t1 - sllv $d2,$d2,$shr - or $d1,$d1,$at - srlv $at,$d3,$t1 - sllv $d3,$d3,$shr - or $d2,$d2,$at - srlv $t0,$t0,$t1 - or $d3,$d3,$t0 -# else - srlv $d0,$d0,$shr - sllv $at,$d1,$t1 - srlv $d1,$d1,$shr - or $d0,$d0,$at - sllv $at,$d2,$t1 - srlv $d2,$d2,$shr - or $d1,$d1,$at - sllv $at,$d3,$t1 - srlv $d3,$d3,$shr - or $d2,$d2,$at - sllv $t0,$t0,$t1 - or $d3,$d3,$t0 -# endif -.Laligned_inp: -#else - lwl $d0,0+MSB($inp) # load input - lwl $d1,4+MSB($inp) - lwl $d2,8+MSB($inp) - lwl $d3,12+MSB($inp) - lwr $d0,0+LSB($inp) - lwr $d1,4+LSB($inp) - lwr $d2,8+LSB($inp) - lwr $d3,12+LSB($inp) -#endif -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS32R2) - wsbh $d0,$d0 # byte swap - wsbh $d1,$d1 - wsbh $d2,$d2 - wsbh $d3,$d3 - rotr $d0,$d0,16 - rotr $d1,$d1,16 - rotr $d2,$d2,16 - rotr $d3,$d3,16 -# else - srl $at,$d0,24 # byte swap - srl $t0,$d0,8 - andi $t1,$d0,0xFF00 - sll $d0,$d0,24 - andi $t0,0xFF00 - sll $t1,$t1,8 - or $d0,$at - srl $at,$d1,24 - or $t0,$t1 - srl $t1,$d1,8 - or $d0,$t0 - andi $t0,$d1,0xFF00 - sll $d1,$d1,24 - andi $t1,0xFF00 - sll $t0,$t0,8 - or $d1,$at - srl $at,$d2,24 - or $t1,$t0 - srl $t0,$d2,8 - or $d1,$t1 - andi $t1,$d2,0xFF00 - sll $d2,$d2,24 - andi $t0,0xFF00 - sll $t1,$t1,8 - or $d2,$at - srl $at,$d3,24 - or $t0,$t1 - srl $t1,$d3,8 - or $d2,$t0 - andi $t0,$d3,0xFF00 - sll $d3,$d3,24 - andi $t1,0xFF00 - sll $t0,$t0,8 - or $d3,$at - or $t1,$t0 - or $d3,$t1 -# endif -#endif - srl $t0,$h4,2 # modulo-scheduled reduction - andi $h4,$h4,3 - sll $at,$t0,2 - - addu $d0,$d0,$h0 # accumulate input - addu $t0,$t0,$at - sltu $h0,$d0,$h0 - addu $d0,$d0,$t0 # ... 
and residue - sltu $at,$d0,$t0 - - addu $d1,$d1,$h1 - addu $h0,$h0,$at # carry - sltu $h1,$d1,$h1 - addu $d1,$d1,$h0 - sltu $h0,$d1,$h0 - - addu $d2,$d2,$h2 - addu $h1,$h1,$h0 # carry - sltu $h2,$d2,$h2 - addu $d2,$d2,$h1 - sltu $h1,$d2,$h1 - - addu $d3,$d3,$h3 - addu $h2,$h2,$h1 # carry - sltu $h3,$d3,$h3 - addu $d3,$d3,$h2 - -#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) - multu $r0,$d0 # d0*r0 - sltu $h2,$d3,$h2 - maddu $rs3,$d1 # d1*s3 - addu $h3,$h3,$h2 # carry - maddu $rs2,$d2 # d2*s2 - addu $h4,$h4,$padbit - maddu $rs1,$d3 # d3*s1 - addu $h4,$h4,$h3 - mfhi $at - mflo $h0 - - multu $r1,$d0 # d0*r1 - maddu $r0,$d1 # d1*r0 - maddu $rs3,$d2 # d2*s3 - maddu $rs2,$d3 # d3*s2 - maddu $rs1,$h4 # h4*s1 - maddu $at,$one # hi*1 - mfhi $at - mflo $h1 - - multu $r2,$d0 # d0*r2 - maddu $r1,$d1 # d1*r1 - maddu $r0,$d2 # d2*r0 - maddu $rs3,$d3 # d3*s3 - maddu $rs2,$h4 # h4*s2 - maddu $at,$one # hi*1 - mfhi $at - mflo $h2 - - mul $t0,$r0,$h4 # h4*r0 - - multu $r3,$d0 # d0*r3 - maddu $r2,$d1 # d1*r2 - maddu $r1,$d2 # d2*r1 - maddu $r0,$d3 # d3*r0 - maddu $rs3,$h4 # h4*s3 - maddu $at,$one # hi*1 - mfhi $at - mflo $h3 - - addiu $inp,$inp,16 - - addu $h4,$t0,$at -#else - multu ($r0,$d0) # d0*r0 - mflo ($h0,$r0,$d0) - mfhi ($h1,$r0,$d0) - - sltu $h2,$d3,$h2 - addu $h3,$h3,$h2 # carry - - multu ($rs3,$d1) # d1*s3 - mflo ($at,$rs3,$d1) - mfhi ($t0,$rs3,$d1) - - addu $h4,$h4,$padbit - addiu $inp,$inp,16 - addu $h4,$h4,$h3 - - multu ($rs2,$d2) # d2*s2 - mflo ($a3,$rs2,$d2) - mfhi ($t1,$rs2,$d2) - addu $h0,$h0,$at - addu $h1,$h1,$t0 - multu ($rs1,$d3) # d3*s1 - sltu $at,$h0,$at - addu $h1,$h1,$at - - mflo ($at,$rs1,$d3) - mfhi ($t0,$rs1,$d3) - addu $h0,$h0,$a3 - addu $h1,$h1,$t1 - multu ($r1,$d0) # d0*r1 - sltu $a3,$h0,$a3 - addu $h1,$h1,$a3 - - - mflo ($a3,$r1,$d0) - mfhi ($h2,$r1,$d0) - addu $h0,$h0,$at - addu $h1,$h1,$t0 - multu ($r0,$d1) # d1*r0 - sltu $at,$h0,$at - addu $h1,$h1,$at - - mflo ($at,$r0,$d1) - mfhi ($t0,$r0,$d1) - addu $h1,$h1,$a3 - sltu $a3,$h1,$a3 - multu ($rs3,$d2) # d2*s3 - addu $h2,$h2,$a3 - - mflo ($a3,$rs3,$d2) - mfhi ($t1,$rs3,$d2) - addu $h1,$h1,$at - addu $h2,$h2,$t0 - multu ($rs2,$d3) # d3*s2 - sltu $at,$h1,$at - addu $h2,$h2,$at - - mflo ($at,$rs2,$d3) - mfhi ($t0,$rs2,$d3) - addu $h1,$h1,$a3 - addu $h2,$h2,$t1 - multu ($rs1,$h4) # h4*s1 - sltu $a3,$h1,$a3 - addu $h2,$h2,$a3 - - mflo ($a3,$rs1,$h4) - addu $h1,$h1,$at - addu $h2,$h2,$t0 - multu ($r2,$d0) # d0*r2 - sltu $at,$h1,$at - addu $h2,$h2,$at - - - mflo ($at,$r2,$d0) - mfhi ($h3,$r2,$d0) - addu $h1,$h1,$a3 - sltu $a3,$h1,$a3 - multu ($r1,$d1) # d1*r1 - addu $h2,$h2,$a3 - - mflo ($a3,$r1,$d1) - mfhi ($t1,$r1,$d1) - addu $h2,$h2,$at - sltu $at,$h2,$at - multu ($r0,$d2) # d2*r0 - addu $h3,$h3,$at - - mflo ($at,$r0,$d2) - mfhi ($t0,$r0,$d2) - addu $h2,$h2,$a3 - addu $h3,$h3,$t1 - multu ($rs3,$d3) # d3*s3 - sltu $a3,$h2,$a3 - addu $h3,$h3,$a3 - - mflo ($a3,$rs3,$d3) - mfhi ($t1,$rs3,$d3) - addu $h2,$h2,$at - addu $h3,$h3,$t0 - multu ($rs2,$h4) # h4*s2 - sltu $at,$h2,$at - addu $h3,$h3,$at - - mflo ($at,$rs2,$h4) - addu $h2,$h2,$a3 - addu $h3,$h3,$t1 - multu ($r3,$d0) # d0*r3 - sltu $a3,$h2,$a3 - addu $h3,$h3,$a3 - - - mflo ($a3,$r3,$d0) - mfhi ($t1,$r3,$d0) - addu $h2,$h2,$at - sltu $at,$h2,$at - multu ($r2,$d1) # d1*r2 - addu $h3,$h3,$at - - mflo ($at,$r2,$d1) - mfhi ($t0,$r2,$d1) - addu $h3,$h3,$a3 - sltu $a3,$h3,$a3 - multu ($r0,$d3) # d3*r0 - addu $t1,$t1,$a3 - - mflo ($a3,$r0,$d3) - mfhi ($d3,$r0,$d3) - addu $h3,$h3,$at - addu $t1,$t1,$t0 - multu ($r1,$d2) # d2*r1 - sltu $at,$h3,$at - addu $t1,$t1,$at - 
- mflo ($at,$r1,$d2) - mfhi ($t0,$r1,$d2) - addu $h3,$h3,$a3 - addu $t1,$t1,$d3 - multu ($rs3,$h4) # h4*s3 - sltu $a3,$h3,$a3 - addu $t1,$t1,$a3 - - mflo ($a3,$rs3,$h4) - addu $h3,$h3,$at - addu $t1,$t1,$t0 - multu ($r0,$h4) # h4*r0 - sltu $at,$h3,$at - addu $t1,$t1,$at - - - mflo ($h4,$r0,$h4) - addu $h3,$h3,$a3 - sltu $a3,$h3,$a3 - addu $t1,$t1,$a3 - addu $h4,$h4,$t1 - - li $padbit,1 # if we loop, padbit is 1 -#endif - bne $inp,$len,.Loop - - sw $h0,0($ctx) # store hash value - sw $h1,4($ctx) - sw $h2,8($ctx) - sw $h3,12($ctx) - sw $h4,16($ctx) - - .set noreorder -.Labort: - lw $s11,4*11($sp) - lw $s10,4*10($sp) - lw $s9, 4*9($sp) - lw $s8, 4*8($sp) - lw $s7, 4*7($sp) - lw $s6, 4*6($sp) - lw $s5, 4*5($sp) - lw $s4, 4*4($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue - lw $s3, 4*3($sp) - lw $s2, 4*2($sp) - lw $s1, 4*1($sp) - lw $s0, 4*0($sp) -___ -$code.=<<___; - jr $ra - addu $sp,$sp,4*12 -.end poly1305_blocks -___ -} -{ -my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); - -$code.=<<___; -.align 5 -.globl poly1305_emit -.ent poly1305_emit -poly1305_emit: - .frame $sp,0,$ra - .set reorder - - lw $tmp4,16($ctx) - lw $tmp0,0($ctx) - lw $tmp1,4($ctx) - lw $tmp2,8($ctx) - lw $tmp3,12($ctx) - - li $in0,-4 # final reduction - srl $ctx,$tmp4,2 - and $in0,$in0,$tmp4 - andi $tmp4,$tmp4,3 - addu $ctx,$ctx,$in0 - - addu $tmp0,$tmp0,$ctx - sltu $ctx,$tmp0,$ctx - addiu $in0,$tmp0,5 # compare to modulus - addu $tmp1,$tmp1,$ctx - sltiu $in1,$in0,5 - sltu $ctx,$tmp1,$ctx - addu $in1,$in1,$tmp1 - addu $tmp2,$tmp2,$ctx - sltu $in2,$in1,$tmp1 - sltu $ctx,$tmp2,$ctx - addu $in2,$in2,$tmp2 - addu $tmp3,$tmp3,$ctx - sltu $in3,$in2,$tmp2 - sltu $ctx,$tmp3,$ctx - addu $in3,$in3,$tmp3 - addu $tmp4,$tmp4,$ctx - sltu $ctx,$in3,$tmp3 - addu $ctx,$tmp4 - - srl $ctx,2 # see if it carried/borrowed - subu $ctx,$zero,$ctx - - xor $in0,$tmp0 - xor $in1,$tmp1 - xor $in2,$tmp2 - xor $in3,$tmp3 - and $in0,$ctx - and $in1,$ctx - and $in2,$ctx - and $in3,$ctx - xor $in0,$tmp0 - xor $in1,$tmp1 - xor $in2,$tmp2 - xor $in3,$tmp3 - - lw $tmp0,0($nonce) # load nonce - lw $tmp1,4($nonce) - lw $tmp2,8($nonce) - lw $tmp3,12($nonce) - - addu $in0,$tmp0 # accumulate nonce - sltu $ctx,$in0,$tmp0 - - addu $in1,$tmp1 - sltu $tmp1,$in1,$tmp1 - addu $in1,$ctx - sltu $ctx,$in1,$ctx - addu $ctx,$tmp1 - - addu $in2,$tmp2 - sltu $tmp2,$in2,$tmp2 - addu $in2,$ctx - sltu $ctx,$in2,$ctx - addu $ctx,$tmp2 - - addu $in3,$tmp3 - addu $in3,$ctx - - srl $tmp0,$in0,8 # write mac value - srl $tmp1,$in0,16 - srl $tmp2,$in0,24 - sb $in0, 0($mac) - sb $tmp0,1($mac) - srl $tmp0,$in1,8 - sb $tmp1,2($mac) - srl $tmp1,$in1,16 - sb $tmp2,3($mac) - srl $tmp2,$in1,24 - sb $in1, 4($mac) - sb $tmp0,5($mac) - srl $tmp0,$in2,8 - sb $tmp1,6($mac) - srl $tmp1,$in2,16 - sb $tmp2,7($mac) - srl $tmp2,$in2,24 - sb $in2, 8($mac) - sb $tmp0,9($mac) - srl $tmp0,$in3,8 - sb $tmp1,10($mac) - srl $tmp1,$in3,16 - sb $tmp2,11($mac) - srl $tmp2,$in3,24 - sb $in3, 12($mac) - sb $tmp0,13($mac) - sb $tmp1,14($mac) - sb $tmp2,15($mac) - - jr $ra -.end poly1305_emit -.rdata -.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" -.align 2 -___ -} -}}} - -$output=pop and open STDOUT,">$output"; -print $code; -close STDOUT; diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index fdeb91bf0032..43c44316fbbd 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -196,7 +196,7 @@ if ARM64 source "lib/crypto/arm64/Kconfig" endif if MIPS -source "arch/mips/lib/crypto/Kconfig" +source "lib/crypto/mips/Kconfig" endif if PPC source 
"arch/powerpc/lib/crypto/Kconfig" diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 19e9880c5d5f..f54d2f3edc40 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -109,3 +109,4 @@ libsm3-y := sm3.o obj-$(CONFIG_ARM) += arm/ obj-$(CONFIG_ARM64) += arm64/ +obj-$(CONFIG_MIPS) += mips/ diff --git a/lib/crypto/mips/.gitignore b/lib/crypto/mips/.gitignore new file mode 100644 index 000000000000..0d47d4f21c6d --- /dev/null +++ b/lib/crypto/mips/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig new file mode 100644 index 000000000000..0670a170c1be --- /dev/null +++ b/lib/crypto/mips/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_MIPS + tristate + depends on CPU_MIPS32_R2 + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_MIPS + tristate + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile new file mode 100644 index 000000000000..804488c7aded --- /dev/null +++ b/lib/crypto/mips/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o +chacha-mips-y := chacha-core.o chacha-glue.o +AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots + +obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o +poly1305-mips-y := poly1305-core.o poly1305-glue.o + +perlasm-flavour-$(CONFIG_32BIT) := o32 +perlasm-flavour-$(CONFIG_64BIT) := 64 + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) + +$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE + $(call if_changed,perlasm) + +targets += poly1305-core.S diff --git a/lib/crypto/mips/chacha-core.S b/lib/crypto/mips/chacha-core.S new file mode 100644 index 000000000000..5755f69cfe00 --- /dev/null +++ b/lib/crypto/mips/chacha-core.S @@ -0,0 +1,497 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2016-2018 René van Dorst . All Rights Reserved. + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + */ + +#define MASK_U32 0x3c +#define CHACHA20_BLOCK_SIZE 64 +#define STACK_SIZE 32 + +#define X0 $t0 +#define X1 $t1 +#define X2 $t2 +#define X3 $t3 +#define X4 $t4 +#define X5 $t5 +#define X6 $t6 +#define X7 $t7 +#define X8 $t8 +#define X9 $t9 +#define X10 $v1 +#define X11 $s6 +#define X12 $s5 +#define X13 $s4 +#define X14 $s3 +#define X15 $s2 +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ +#define T0 $s1 +#define T1 $s0 +#define T(n) T ## n +#define X(n) X ## n + +/* Input arguments */ +#define STATE $a0 +#define OUT $a1 +#define IN $a2 +#define BYTES $a3 + +/* Output argument */ +/* NONCE[0] is kept in a register and not in memory. + * We don't want to touch original value in memory. + * Must be incremented every loop iteration. + */ +#define NONCE_0 $v0 + +/* SAVED_X and SAVED_CA are set in the jump table. + * Use regs which are overwritten on exit else we don't leak clear data. + * They are used to handling the last bytes which are not multiple of 4. 
+ */ +#define SAVED_X X15 +#define SAVED_CA $s7 + +#define IS_UNALIGNED $s7 + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define MSB 0 +#define LSB 3 +#define ROTx rotl +#define ROTR(n) rotr n, 24 +#define CPU_TO_LE32(n) \ + wsbh n; \ + rotr n, 16; +#else +#define MSB 3 +#define LSB 0 +#define ROTx rotr +#define CPU_TO_LE32(n) +#define ROTR(n) +#endif + +#define FOR_EACH_WORD(x) \ + x( 0); \ + x( 1); \ + x( 2); \ + x( 3); \ + x( 4); \ + x( 5); \ + x( 6); \ + x( 7); \ + x( 8); \ + x( 9); \ + x(10); \ + x(11); \ + x(12); \ + x(13); \ + x(14); \ + x(15); + +#define FOR_EACH_WORD_REV(x) \ + x(15); \ + x(14); \ + x(13); \ + x(12); \ + x(11); \ + x(10); \ + x( 9); \ + x( 8); \ + x( 7); \ + x( 6); \ + x( 5); \ + x( 4); \ + x( 3); \ + x( 2); \ + x( 1); \ + x( 0); + +#define PLUS_ONE_0 1 +#define PLUS_ONE_1 2 +#define PLUS_ONE_2 3 +#define PLUS_ONE_3 4 +#define PLUS_ONE_4 5 +#define PLUS_ONE_5 6 +#define PLUS_ONE_6 7 +#define PLUS_ONE_7 8 +#define PLUS_ONE_8 9 +#define PLUS_ONE_9 10 +#define PLUS_ONE_10 11 +#define PLUS_ONE_11 12 +#define PLUS_ONE_12 13 +#define PLUS_ONE_13 14 +#define PLUS_ONE_14 15 +#define PLUS_ONE_15 16 +#define PLUS_ONE(x) PLUS_ONE_ ## x +#define _CONCAT3(a,b,c) a ## b ## c +#define CONCAT3(a,b,c) _CONCAT3(a,b,c) + +#define STORE_UNALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lwl T1, (x*4)+MSB ## (IN); \ + lwr T1, (x*4)+LSB ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + swl X ## x, (x*4)+MSB ## (OUT); \ + swr X ## x, (x*4)+LSB ## (OUT); + +#define STORE_ALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lw T1, (x*4) ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + sw X ## x, (x*4) ## (OUT); + +/* Jump table macro. + * Used for setup and handling the last bytes, which are not multiple of 4. + * X15 is free to store Xn + * Every jumptable entry must be equal in size. + */ +#define JMPTBL_ALIGNED(x) \ +.Lchacha_mips_jmptbl_aligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_aligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define JMPTBL_UNALIGNED(x) \ +.Lchacha_mips_jmptbl_unaligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ + addu X(A), X(K); \ + addu X(B), X(L); \ + addu X(C), X(M); \ + addu X(D), X(N); \ + xor X(V), X(A); \ + xor X(W), X(B); \ + xor X(Y), X(C); \ + xor X(Z), X(D); \ + rotl X(V), S; \ + rotl X(W), S; \ + rotl X(Y), S; \ + rotl X(Z), S; + +.text +.set reorder +.set noat +.globl chacha_crypt_arch +.ent chacha_crypt_arch +chacha_crypt_arch: + .frame $sp, STACK_SIZE, $ra + + /* Load number of rounds */ + lw $at, 16($sp) + + addiu $sp, -STACK_SIZE + + /* Return bytes = 0. */ + beqz BYTES, .Lchacha_mips_end + + lw NONCE_0, 48(STATE) + + /* Save s0-s7 */ + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + sw $s4, 16($sp) + sw $s5, 20($sp) + sw $s6, 24($sp) + sw $s7, 28($sp) + + /* Test IN or OUT is unaligned. 
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 + */ + or IS_UNALIGNED, IN, OUT + andi IS_UNALIGNED, 0x3 + + b .Lchacha_rounds_start + +.align 4 +.Loop_chacha_rounds: + addiu IN, CHACHA20_BLOCK_SIZE + addiu OUT, CHACHA20_BLOCK_SIZE + addiu NONCE_0, 1 + +.Lchacha_rounds_start: + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + + move X12, NONCE_0 + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_chacha_xor_rounds: + addiu $at, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $at, .Loop_chacha_xor_rounds + + addiu BYTES, -(CHACHA20_BLOCK_SIZE) + + /* Is data src/dst unaligned? Jump */ + bnez IS_UNALIGNED, .Loop_chacha_unaligned + + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES < 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_aligned + + FOR_EACH_WORD_REV(STORE_ALIGNED) + + /* BYTES > 0? Loop again. */ + bgtz BYTES, .Loop_chacha_rounds + + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + + /* BYTES < 0? Handle last bytes */ + bltz BYTES, .Lchacha_mips_xor_bytes + +.Lchacha_mips_xor_done: + /* Restore used registers */ + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) + lw $s4, 16($sp) + lw $s5, 20($sp) + lw $s6, 24($sp) + lw $s7, 28($sp) + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + +.Lchacha_mips_end: + addiu $sp, STACK_SIZE + jr $ra + +.Lchacha_mips_no_full_block_aligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_ALIGNED) + + +.Loop_chacha_unaligned: + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES > 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_unaligned + + FOR_EACH_WORD_REV(STORE_UNALIGNED) + + /* BYTES > 0? Loop again. 
*/ + bgtz BYTES, .Loop_chacha_rounds + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + + .set noreorder + /* Fall through to byte handling */ + bgez BYTES, .Lchacha_mips_xor_done +.Lchacha_mips_xor_unaligned_0_b: +.Lchacha_mips_xor_aligned_0_b: + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + .set reorder + +.Lchacha_mips_xor_bytes: + addu IN, $at + addu OUT, $at + /* First byte */ + lbu T1, 0(IN) + addiu $at, BYTES, 1 + CPU_TO_LE32(SAVED_X) + ROTR(SAVED_X) + xor T1, SAVED_X + sb T1, 0(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Second byte */ + lbu T1, 1(IN) + addiu $at, BYTES, 2 + ROTx SAVED_X, 8 + xor T1, SAVED_X + sb T1, 1(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Third byte */ + lbu T1, 2(IN) + ROTx SAVED_X, 8 + xor T1, SAVED_X + sb T1, 2(OUT) + b .Lchacha_mips_xor_done + +.Lchacha_mips_no_full_block_unaligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_UNALIGNED) +.end chacha_crypt_arch +.set at + +/* Input arguments + * STATE $a0 + * OUT $a1 + * NROUND $a2 + */ + +#undef X12 +#undef X13 +#undef X14 +#undef X15 + +#define X12 $a3 +#define X13 $at +#define X14 $v0 +#define X15 STATE + +.set noat +.globl hchacha_block_arch +.ent hchacha_block_arch +hchacha_block_arch: + .frame $sp, STACK_SIZE, $ra + + addiu $sp, -STACK_SIZE + + /* Save X11(s6) */ + sw X11, 0($sp) + + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + lw X12, 48(STATE) + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_hchacha_xor_rounds: + addiu $a2, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $a2, .Loop_hchacha_xor_rounds + + /* Restore used register */ + lw X11, 0($sp) + + sw X0, 0(OUT) + sw X1, 4(OUT) + sw X2, 8(OUT) + sw X3, 12(OUT) + sw X12, 16(OUT) + sw X13, 20(OUT) + sw X14, 24(OUT) + sw X15, 28(OUT) + + addiu $sp, STACK_SIZE + jr $ra +.end hchacha_block_arch +.set at diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha-glue.c new file mode 100644 index 000000000000..88c097594eb0 --- /dev/null +++ b/lib/crypto/mips/chacha-glue.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha and HChaCha functions (MIPS optimized) + * + * Copyright (C) 2019 Linaro, Ltd. 
+ */ + +#include +#include +#include + +asmlinkage void chacha_crypt_arch(struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); +EXPORT_SYMBOL(chacha_crypt_arch); + +asmlinkage void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); +EXPORT_SYMBOL(hchacha_block_arch); + +bool chacha_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c new file mode 100644 index 000000000000..764a38a65200 --- /dev/null +++ b/lib/crypto/mips/poly1305-glue.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS + * + * Copyright (C) 2019 Linaro Ltd. + */ + +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl new file mode 100644 index 000000000000..399f10c3e385 --- /dev/null +++ b/lib/crypto/mips/poly1305-mips.pl @@ -0,0 +1,1273 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL +# project. +# ==================================================================== + +# Poly1305 hash for MIPS. +# +# May 2016 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc +# R1x000 ~5.5/+130% (big-endian) +# Octeon II 2.50/+70% (little-endian) +# +# March 2019 +# +# Add 32-bit code path. +# +# October 2019 +# +# Modulo-scheduling reduction allows to omit dependency chain at the +# end of inner loop and improve performance. Also optimize MIPS32R2 +# code path for MIPS 1004K core. Per René von Dorst's suggestions. +# +# IALU/gcc +# R1x000 ~9.8/? (big-endian) +# Octeon II 3.65/+140% (little-endian) +# MT7621/1004K 4.75/? (little-endian) +# +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. 
Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +# +# +###################################################################### + +$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 + +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; + +if ($flavour =~ /64|n32/i) {{{ +###################################################################### +# 64-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ + defined(_MIPS_ARCH_MIPS64R6)) \\ + && !defined(_MIPS_ARCH_MIPS64R2) +# define _MIPS_ARCH_MIPS64R2 +#endif + +#if defined(_MIPS_ARCH_MIPS64R6) +# define dmultu(rs,rt) +# define mflo(rd,rs,rt) dmulu rd,rs,rt +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt +#else +# define dmultu(rs,rt) dmultu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arch +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 7 +#else +# define MSB 7 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $tmp0,$inp,7 # $inp % 8 + dsubu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + ld $in0,0($inp) + ld $in1,8($inp) + beqz $tmp0,.Laligned_key + ld $tmp2,16($inp) + + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + dsllv $in0,$in0,$tmp0 + dsrlv $tmp3,$in1,$tmp1 + dsllv $in1,$in1,$tmp0 + dsrlv $tmp2,$tmp2,$tmp1 +# else + dsrlv $in0,$in0,$tmp0 + dsllv $tmp3,$in1,$tmp1 + dsrlv $in1,$in1,$tmp0 + dsllv $tmp2,$tmp2,$tmp1 +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_key: +#else + ldl $in0,0+MSB($inp) + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + 
or $in1,$tmp3 +# endif +#endif + li $tmp0,1 + dsll $tmp0,32 # 0x0000000100000000 + daddiu $tmp0,-63 # 0x00000000ffffffc1 + dsll $tmp0,28 # 0x0ffffffc10000000 + daddiu $tmp0,-1 # 0x0ffffffc0fffffff + + and $in0,$tmp0 + daddiu $tmp0,-3 # 0x0ffffffc0ffffffc + and $in1,$tmp0 + + sd $in0,24($ctx) + dsrl $tmp0,$in1,2 + sd $in1,32($ctx) + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $v0,0 # return 0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; + +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); +my ($shr,$shl) = ($s6,$s7); # used on R6 + +$code.=<<___; +.align 5 +.globl poly1305_blocks +.ent poly1305_blocks +poly1305_blocks: + .set noreorder + dsrl $len,4 # number of complete blocks + bnez $len,poly1305_blocks_internal + nop + jr $ra + nop +.end poly1305_blocks + +.align 5 +.ent poly1305_blocks_internal +poly1305_blocks_internal: + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + .frame $sp,8*8,$ra + .mask $SAVED_REGS_MASK|0x000c0000,-8 + dsubu $sp,8*8 + sd $s7,56($sp) + sd $s6,48($sp) +#else + .frame $sp,6*8,$ra + .mask $SAVED_REGS_MASK,-8 + dsubu $sp,6*8 +#endif + sd $s5,40($sp) + sd $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sd $s3,24($sp) + sd $s2,16($sp) + sd $s1,8($sp) + sd $s0,0($sp) +___ +$code.=<<___; + .set reorder + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $shr,$inp,7 + dsubu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset + subu $shl,$zero,$shr +#endif + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $rs1,40($ctx) + + dsll $len,4 + daddu $len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) # load input + ld $in1,8($inp) + beqz $shr,.Laligned_inp + + ld $tmp2,16($inp) +# ifdef MIPSEB + dsllv $in0,$in0,$shr + dsrlv $tmp3,$in1,$shl + dsllv $in1,$in1,$shr + dsrlv $tmp2,$tmp2,$shl +# else + dsrlv $in0,$in0,$shr + dsllv $tmp3,$in1,$shl + dsrlv $in1,$in1,$shr + dsllv $tmp2,$tmp2,$shl +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_inp: +#else + ldl $in0,0+MSB($inp) # load input + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif + daddiu $inp,16 +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + dsrl $tmp1,$h2,2 # modulo-scheduled reduction + andi $h2,$h2,3 + dsll $tmp0,$tmp1,2 + + daddu $d0,$h0,$in0 # accumulate input + daddu $tmp1,$tmp0 + sltu $tmp0,$d0,$h0 + daddu $d0,$d0,$tmp1 # ... 
and residue + sltu $tmp1,$d0,$tmp1 + daddu $d1,$h1,$in1 + daddu $tmp0,$tmp1 + sltu $tmp1,$d1,$h1 + daddu $d1,$tmp0 + + dmultu ($r0,$d0) # h0*r0 + daddu $d2,$h2,$padbit + sltu $tmp0,$d1,$tmp0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + dmultu ($rs1,$d1) # h1*5*r1 + daddu $d2,$tmp1 + daddu $d2,$tmp0 + mflo ($tmp0,$rs1,$d1) + mfhi ($tmp1,$rs1,$d1) + + dmultu ($r1,$d0) # h0*r1 + mflo ($tmp2,$r1,$d0) + mfhi ($h2,$r1,$d0) + daddu $h0,$tmp0 + daddu $h1,$tmp1 + sltu $tmp0,$h0,$tmp0 + + dmultu ($r0,$d1) # h1*r0 + daddu $h1,$tmp0 + daddu $h1,$tmp2 + mflo ($tmp0,$r0,$d1) + mfhi ($tmp1,$r0,$d1) + + dmultu ($rs1,$d2) # h2*5*r1 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + mflo ($tmp2,$rs1,$d2) + + dmultu ($r0,$d2) # h2*r0 + daddu $h1,$tmp0 + daddu $h2,$tmp1 + mflo ($tmp3,$r0,$d2) + sltu $tmp0,$h1,$tmp0 + daddu $h2,$tmp0 + + daddu $h1,$tmp2 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + daddu $h2,$tmp3 + + bne $inp,$len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + ld $s7,56($sp) + ld $s6,48($sp) +#endif + ld $s5,40($sp) # epilogue + ld $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue + ld $s3,24($sp) + ld $s2,16($sp) + ld $s1,8($sp) + ld $s0,0($sp) +___ +$code.=<<___; + jr $ra +#if defined(_MIPS_ARCH_MIPS64R6) + daddu $sp,8*8 +#else + daddu $sp,6*8 +#endif +.end poly1305_blocks_internal +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + ld $tmp2,16($ctx) + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + + li $in0,-4 # final reduction + dsrl $in1,$tmp2,2 + and $in0,$tmp2 + andi $tmp2,$tmp2,3 + daddu $in0,$in1 + + daddu $tmp0,$tmp0,$in0 + sltu $in1,$tmp0,$in0 + daddiu $in0,$tmp0,5 # compare to modulus + daddu $tmp1,$tmp1,$in1 + sltiu $tmp3,$in0,5 + sltu $tmp4,$tmp1,$in1 + daddu $in1,$tmp1,$tmp3 + daddu $tmp2,$tmp2,$tmp4 + sltu $tmp3,$in1,$tmp3 + daddu $tmp2,$tmp2,$tmp3 + + dsrl $tmp2,2 # see if it carried/borrowed + dsubu $tmp2,$zero,$tmp2 + + xor $in0,$tmp0 + xor $in1,$tmp1 + and $in0,$tmp2 + and $in1,$tmp2 + xor $in0,$tmp0 + xor $in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + dsll $tmp1,32 + dsll $tmp3,32 + or $tmp0,$tmp1 + or $tmp2,$tmp3 + + daddu $in0,$tmp0 # accumulate nonce + daddu $in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + daddu $in1,$tmp0 + + dsrl $tmp0,$in0,8 # write mac value + dsrl $tmp1,$in0,16 + dsrl $tmp2,$in0,24 + sb $in0,0($mac) + dsrl $tmp3,$in0,32 + sb $tmp0,1($mac) + dsrl $tmp0,$in0,40 + sb $tmp1,2($mac) + dsrl $tmp1,$in0,48 + sb $tmp2,3($mac) + dsrl $tmp2,$in0,56 + sb $tmp3,4($mac) + dsrl $tmp3,$in1,8 + sb $tmp0,5($mac) + dsrl $tmp0,$in1,16 + sb $tmp1,6($mac) + dsrl $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + dsrl $tmp2,$in1,32 + sb $tmp3,9($mac) + dsrl $tmp3,$in1,40 + sb $tmp0,10($mac) + dsrl $tmp0,$in1,48 + sb $tmp1,11($mac) + dsrl $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} else {{{ +###################################################################### +# 32-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = + ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ + 
defined(_MIPS_ARCH_MIPS32R6)) \\ + && !defined(_MIPS_ARCH_MIPS32R2) +# define _MIPS_ARCH_MIPS32R2 +#endif + +#if defined(_MIPS_ARCH_MIPS32R6) +# define multu(rs,rt) +# define mflo(rd,rs,rt) mulu rd,rs,rt +# define mfhi(rd,rs,rt) muhu rd,rs,rt +#else +# define multu(rs,rt) multu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arch +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 3 +#else +# define MSB 3 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sw $zero,0($ctx) + sw $zero,4($ctx) + sw $zero,8($ctx) + sw $zero,12($ctx) + sw $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $tmp0,$inp,3 # $inp % 4 + subu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + lw $in0,0($inp) + lw $in1,4($inp) + lw $in2,8($inp) + lw $in3,12($inp) + beqz $tmp0,.Laligned_key + + lw $tmp2,16($inp) + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + sllv $in0,$in0,$tmp0 + srlv $tmp3,$in1,$tmp1 + sllv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + srlv $tmp3,$in2,$tmp1 + sllv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + srlv $tmp3,$in3,$tmp1 + sllv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + srlv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# else + srlv $in0,$in0,$tmp0 + sllv $tmp3,$in1,$tmp1 + srlv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + sllv $tmp3,$in2,$tmp1 + srlv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + sllv $tmp3,$in3,$tmp1 + srlv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + sllv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# endif +.Laligned_key: +#else + lwl $in0,0+MSB($inp) + lwl $in1,4+MSB($inp) + lwl $in2,8+MSB($inp) + lwl $in3,12+MSB($inp) + lwr $in0,0+LSB($inp) + lwr $in1,4+LSB($inp) + lwr $in2,8+LSB($inp) + lwr $in3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $in0,$in0 # byte swap + wsbh $in1,$in1 + wsbh $in2,$in2 + wsbh $in3,$in3 + rotr $in0,$in0,16 + rotr $in1,$in1,16 + rotr $in2,$in2,16 + rotr $in3,$in3,16 +# else + srl $tmp0,$in0,24 # byte swap + srl $tmp1,$in0,8 + andi $tmp2,$in0,0xFF00 + sll $in0,$in0,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in0,$tmp0 + srl $tmp0,$in1,24 + or $tmp1,$tmp2 + srl $tmp2,$in1,8 + or $in0,$tmp1 + andi $tmp1,$in1,0xFF00 + sll $in1,$in1,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in1,$tmp0 + srl $tmp0,$in2,24 + or $tmp2,$tmp1 + srl $tmp1,$in2,8 + or $in1,$tmp2 + andi $tmp2,$in2,0xFF00 + sll $in2,$in2,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in2,$tmp0 + srl $tmp0,$in3,24 + or $tmp1,$tmp2 + srl $tmp2,$in3,8 + or $in2,$tmp1 + andi $tmp1,$in3,0xFF00 + sll $in3,$in3,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in3,$tmp0 + or $tmp2,$tmp1 + or $in3,$tmp2 +# endif +#endif + lui $tmp0,0x0fff + ori $tmp0,0xffff # 0x0fffffff + and $in0,$in0,$tmp0 + subu $tmp0,3 # 0x0ffffffc + and $in1,$in1,$tmp0 + and $in2,$in2,$tmp0 + and $in3,$in3,$tmp0 + + sw $in0,20($ctx) + sw $in1,24($ctx) + sw $in2,28($ctx) + sw $in3,32($ctx) + + srl $tmp1,$in1,2 + srl $tmp2,$in2,2 + srl $tmp3,$in3,2 + addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) + addu $in2,$in2,$tmp2 + addu $in3,$in3,$tmp3 + sw $in1,36($ctx) + sw $in2,40($ctx) + sw $in3,44($ctx) +.Lno_key: + li $v0,0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0x00fff000" : "0x00ff0000"; + +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); +my ($d0,$d1,$d2,$d3) = + ($a4,$a5,$a6,$a7); +my $shr = $t2; # used on R6 +my $one = $t2; # used on R2 + +$code.=<<___; +.globl poly1305_blocks +.align 5 +.ent poly1305_blocks +poly1305_blocks: + .frame $sp,16*4,$ra + .mask $SAVED_REGS_MASK,-4 + .set noreorder + subu $sp, $sp,4*12 + sw $s11,4*11($sp) + sw $s10,4*10($sp) + sw $s9, 4*9($sp) + sw $s8, 4*8($sp) + sw $s7, 4*7($sp) + sw $s6, 4*6($sp) + sw $s5, 4*5($sp) + sw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sw $s3, 4*3($sp) + sw $s2, 4*2($sp) + sw $s1, 4*1($sp) + sw $s0, 4*0($sp) +___ +$code.=<<___; + .set reorder + + srl $len,4 # number of complete blocks + li $one,1 + beqz $len,.Labort + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $shr,$inp,3 + subu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset +#endif + + lw $h0,0($ctx) # load hash value + lw $h1,4($ctx) + lw $h2,8($ctx) + lw $h3,12($ctx) + lw $h4,16($ctx) + + lw $r0,20($ctx) # load key + lw $r1,24($ctx) + lw $r2,28($ctx) + lw $r3,32($ctx) + lw $rs1,36($ctx) + lw $rs2,40($ctx) + lw $rs3,44($ctx) + + sll $len,4 + addu $len,$len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS32R6) + lw $d0,0($inp) # load input + lw $d1,4($inp) + lw $d2,8($inp) + lw $d3,12($inp) + beqz $shr,.Laligned_inp + + lw $t0,16($inp) + subu $t1,$zero,$shr +# ifdef MIPSEB + sllv $d0,$d0,$shr + srlv $at,$d1,$t1 + sllv $d1,$d1,$shr + or $d0,$d0,$at + srlv $at,$d2,$t1 + sllv $d2,$d2,$shr + or $d1,$d1,$at + srlv $at,$d3,$t1 + sllv $d3,$d3,$shr + or $d2,$d2,$at + srlv $t0,$t0,$t1 + or $d3,$d3,$t0 +# else + srlv $d0,$d0,$shr + sllv $at,$d1,$t1 + srlv $d1,$d1,$shr + or $d0,$d0,$at + sllv $at,$d2,$t1 + srlv $d2,$d2,$shr + or $d1,$d1,$at + sllv $at,$d3,$t1 + srlv $d3,$d3,$shr + or $d2,$d2,$at + sllv $t0,$t0,$t1 + or $d3,$d3,$t0 +# endif +.Laligned_inp: +#else + lwl $d0,0+MSB($inp) # load input + lwl $d1,4+MSB($inp) + lwl $d2,8+MSB($inp) + lwl $d3,12+MSB($inp) + lwr $d0,0+LSB($inp) + lwr $d1,4+LSB($inp) + lwr $d2,8+LSB($inp) + lwr $d3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $d0,$d0 # byte swap + wsbh $d1,$d1 + wsbh $d2,$d2 + wsbh $d3,$d3 + rotr $d0,$d0,16 + rotr $d1,$d1,16 + rotr $d2,$d2,16 + rotr $d3,$d3,16 +# else + srl $at,$d0,24 # byte swap + srl $t0,$d0,8 + andi $t1,$d0,0xFF00 + sll $d0,$d0,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d0,$at + srl $at,$d1,24 + or $t0,$t1 + srl $t1,$d1,8 + or $d0,$t0 + andi $t0,$d1,0xFF00 + sll $d1,$d1,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d1,$at + srl $at,$d2,24 + or $t1,$t0 + srl $t0,$d2,8 + or $d1,$t1 + andi $t1,$d2,0xFF00 + sll $d2,$d2,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d2,$at + srl $at,$d3,24 + or $t0,$t1 + srl $t1,$d3,8 + or $d2,$t0 + andi $t0,$d3,0xFF00 + sll $d3,$d3,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d3,$at + or $t1,$t0 + or $d3,$t1 +# endif +#endif + srl $t0,$h4,2 # modulo-scheduled reduction + andi $h4,$h4,3 + sll $at,$t0,2 + + addu $d0,$d0,$h0 # accumulate input + addu $t0,$t0,$at + sltu $h0,$d0,$h0 + addu $d0,$d0,$t0 # ... 
and residue + sltu $at,$d0,$t0 + + addu $d1,$d1,$h1 + addu $h0,$h0,$at # carry + sltu $h1,$d1,$h1 + addu $d1,$d1,$h0 + sltu $h0,$d1,$h0 + + addu $d2,$d2,$h2 + addu $h1,$h1,$h0 # carry + sltu $h2,$d2,$h2 + addu $d2,$d2,$h1 + sltu $h1,$d2,$h1 + + addu $d3,$d3,$h3 + addu $h2,$h2,$h1 # carry + sltu $h3,$d3,$h3 + addu $d3,$d3,$h2 + +#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) + multu $r0,$d0 # d0*r0 + sltu $h2,$d3,$h2 + maddu $rs3,$d1 # d1*s3 + addu $h3,$h3,$h2 # carry + maddu $rs2,$d2 # d2*s2 + addu $h4,$h4,$padbit + maddu $rs1,$d3 # d3*s1 + addu $h4,$h4,$h3 + mfhi $at + mflo $h0 + + multu $r1,$d0 # d0*r1 + maddu $r0,$d1 # d1*r0 + maddu $rs3,$d2 # d2*s3 + maddu $rs2,$d3 # d3*s2 + maddu $rs1,$h4 # h4*s1 + maddu $at,$one # hi*1 + mfhi $at + mflo $h1 + + multu $r2,$d0 # d0*r2 + maddu $r1,$d1 # d1*r1 + maddu $r0,$d2 # d2*r0 + maddu $rs3,$d3 # d3*s3 + maddu $rs2,$h4 # h4*s2 + maddu $at,$one # hi*1 + mfhi $at + mflo $h2 + + mul $t0,$r0,$h4 # h4*r0 + + multu $r3,$d0 # d0*r3 + maddu $r2,$d1 # d1*r2 + maddu $r1,$d2 # d2*r1 + maddu $r0,$d3 # d3*r0 + maddu $rs3,$h4 # h4*s3 + maddu $at,$one # hi*1 + mfhi $at + mflo $h3 + + addiu $inp,$inp,16 + + addu $h4,$t0,$at +#else + multu ($r0,$d0) # d0*r0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + sltu $h2,$d3,$h2 + addu $h3,$h3,$h2 # carry + + multu ($rs3,$d1) # d1*s3 + mflo ($at,$rs3,$d1) + mfhi ($t0,$rs3,$d1) + + addu $h4,$h4,$padbit + addiu $inp,$inp,16 + addu $h4,$h4,$h3 + + multu ($rs2,$d2) # d2*s2 + mflo ($a3,$rs2,$d2) + mfhi ($t1,$rs2,$d2) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($rs1,$d3) # d3*s1 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$rs1,$d3) + mfhi ($t0,$rs1,$d3) + addu $h0,$h0,$a3 + addu $h1,$h1,$t1 + multu ($r1,$d0) # d0*r1 + sltu $a3,$h0,$a3 + addu $h1,$h1,$a3 + + + mflo ($a3,$r1,$d0) + mfhi ($h2,$r1,$d0) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($r0,$d1) # d1*r0 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$r0,$d1) + mfhi ($t0,$r0,$d1) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($rs3,$d2) # d2*s3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs3,$d2) + mfhi ($t1,$rs3,$d2) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($rs2,$d3) # d3*s2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + mflo ($at,$rs2,$d3) + mfhi ($t0,$rs2,$d3) + addu $h1,$h1,$a3 + addu $h2,$h2,$t1 + multu ($rs1,$h4) # h4*s1 + sltu $a3,$h1,$a3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs1,$h4) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($r2,$d0) # d0*r2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + + mflo ($at,$r2,$d0) + mfhi ($h3,$r2,$d0) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($r1,$d1) # d1*r1 + addu $h2,$h2,$a3 + + mflo ($a3,$r1,$d1) + mfhi ($t1,$r1,$d1) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r0,$d2) # d2*r0 + addu $h3,$h3,$at + + mflo ($at,$r0,$d2) + mfhi ($t0,$r0,$d2) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($rs3,$d3) # d3*s3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + mflo ($a3,$rs3,$d3) + mfhi ($t1,$rs3,$d3) + addu $h2,$h2,$at + addu $h3,$h3,$t0 + multu ($rs2,$h4) # h4*s2 + sltu $at,$h2,$at + addu $h3,$h3,$at + + mflo ($at,$rs2,$h4) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($r3,$d0) # d0*r3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + + mflo ($a3,$r3,$d0) + mfhi ($t1,$r3,$d0) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r2,$d1) # d1*r2 + addu $h3,$h3,$at + + mflo ($at,$r2,$d1) + mfhi ($t0,$r2,$d1) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + multu ($r0,$d3) # d3*r0 + addu $t1,$t1,$a3 + + mflo ($a3,$r0,$d3) + mfhi ($d3,$r0,$d3) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r1,$d2) # d2*r1 + sltu $at,$h3,$at + addu $t1,$t1,$at + 
+ mflo ($at,$r1,$d2) + mfhi ($t0,$r1,$d2) + addu $h3,$h3,$a3 + addu $t1,$t1,$d3 + multu ($rs3,$h4) # h4*s3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + + mflo ($a3,$rs3,$h4) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r0,$h4) # h4*r0 + sltu $at,$h3,$at + addu $t1,$t1,$at + + + mflo ($h4,$r0,$h4) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + addu $h4,$h4,$t1 + + li $padbit,1 # if we loop, padbit is 1 +#endif + bne $inp,$len,.Loop + + sw $h0,0($ctx) # store hash value + sw $h1,4($ctx) + sw $h2,8($ctx) + sw $h3,12($ctx) + sw $h4,16($ctx) + + .set noreorder +.Labort: + lw $s11,4*11($sp) + lw $s10,4*10($sp) + lw $s9, 4*9($sp) + lw $s8, 4*8($sp) + lw $s7, 4*7($sp) + lw $s6, 4*6($sp) + lw $s5, 4*5($sp) + lw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + lw $s3, 4*3($sp) + lw $s2, 4*2($sp) + lw $s1, 4*1($sp) + lw $s0, 4*0($sp) +___ +$code.=<<___; + jr $ra + addu $sp,$sp,4*12 +.end poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + lw $tmp4,16($ctx) + lw $tmp0,0($ctx) + lw $tmp1,4($ctx) + lw $tmp2,8($ctx) + lw $tmp3,12($ctx) + + li $in0,-4 # final reduction + srl $ctx,$tmp4,2 + and $in0,$in0,$tmp4 + andi $tmp4,$tmp4,3 + addu $ctx,$ctx,$in0 + + addu $tmp0,$tmp0,$ctx + sltu $ctx,$tmp0,$ctx + addiu $in0,$tmp0,5 # compare to modulus + addu $tmp1,$tmp1,$ctx + sltiu $in1,$in0,5 + sltu $ctx,$tmp1,$ctx + addu $in1,$in1,$tmp1 + addu $tmp2,$tmp2,$ctx + sltu $in2,$in1,$tmp1 + sltu $ctx,$tmp2,$ctx + addu $in2,$in2,$tmp2 + addu $tmp3,$tmp3,$ctx + sltu $in3,$in2,$tmp2 + sltu $ctx,$tmp3,$ctx + addu $in3,$in3,$tmp3 + addu $tmp4,$tmp4,$ctx + sltu $ctx,$in3,$tmp3 + addu $ctx,$tmp4 + + srl $ctx,2 # see if it carried/borrowed + subu $ctx,$zero,$ctx + + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + and $in0,$ctx + and $in1,$ctx + and $in2,$ctx + and $in3,$ctx + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + + lw $tmp0,0($nonce) # load nonce + lw $tmp1,4($nonce) + lw $tmp2,8($nonce) + lw $tmp3,12($nonce) + + addu $in0,$tmp0 # accumulate nonce + sltu $ctx,$in0,$tmp0 + + addu $in1,$tmp1 + sltu $tmp1,$in1,$tmp1 + addu $in1,$ctx + sltu $ctx,$in1,$ctx + addu $ctx,$tmp1 + + addu $in2,$tmp2 + sltu $tmp2,$in2,$tmp2 + addu $in2,$ctx + sltu $ctx,$in2,$ctx + addu $ctx,$tmp2 + + addu $in3,$tmp3 + addu $in3,$ctx + + srl $tmp0,$in0,8 # write mac value + srl $tmp1,$in0,16 + srl $tmp2,$in0,24 + sb $in0, 0($mac) + sb $tmp0,1($mac) + srl $tmp0,$in1,8 + sb $tmp1,2($mac) + srl $tmp1,$in1,16 + sb $tmp2,3($mac) + srl $tmp2,$in1,24 + sb $in1, 4($mac) + sb $tmp0,5($mac) + srl $tmp0,$in2,8 + sb $tmp1,6($mac) + srl $tmp1,$in2,16 + sb $tmp2,7($mac) + srl $tmp2,$in2,24 + sb $in2, 8($mac) + sb $tmp0,9($mac) + srl $tmp0,$in3,8 + sb $tmp1,10($mac) + srl $tmp1,$in3,16 + sb $tmp2,11($mac) + srl $tmp2,$in3,24 + sb $in3, 12($mac) + sb $tmp0,13($mac) + sb $tmp1,14($mac) + sb $tmp2,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} + +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT;
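
(Not part of the diff above; for context only.) A minimal sketch of how the entry points exported by the new chacha-glue.c are driven. It uses only the prototypes shown in the patch (chacha_is_arch_optimized(), chacha_crypt_arch(), hchacha_block_arch()); struct chacha_state, CHACHA_BLOCK_SIZE and HCHACHA_OUT_WORDS are assumed to come from <crypto/chacha.h>, and the hypothetical caller below is illustrative only, not something added by this series.

	/* Hypothetical kernel-side caller; assumes <crypto/chacha.h> definitions. */
	#include <crypto/chacha.h>
	#include <linux/string.h>

	static void chacha_mips_sketch(void)
	{
		struct chacha_state state;        /* constants/key/counter/nonce words */
		u32 subkey[HCHACHA_OUT_WORDS];    /* HChaCha output (XChaCha subkey) */
		u8 src[CHACHA_BLOCK_SIZE], dst[CHACHA_BLOCK_SIZE];

		/* Real callers fill the state from key and IV; zeroed here for brevity. */
		memset(&state, 0, sizeof(state));
		memset(src, 0, sizeof(src));

		if (!chacha_is_arch_optimized())
			return;

		/* One 64-byte block through the MIPS ChaCha implementation (20 rounds). */
		chacha_crypt_arch(&state, dst, src, sizeof(src), 20);

		/* HChaCha permutation on a (separately prepared) state. */
		hchacha_block_arch(&state, subkey, 20);
	}

In-tree users normally reach this code through the generic ChaCha library wrappers in <crypto/chacha.h> rather than calling the *_arch symbols directly; the direct calls above are only to make the calling convention implied by the glue prototypes visible.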