From: Eric Biggers
Date: Thu, 19 Jun 2025 19:19:02 +0000 (-0700)
Subject: lib/crypto: mips: Move arch/mips/lib/crypto/ into lib/crypto/
X-Git-Tag: block-6.17-20250808~34^2~47
X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=7e54e993ab8c98d912f54ad6f46bfcc9dcd65368;p=linux-block.git

lib/crypto: mips: Move arch/mips/lib/crypto/ into lib/crypto/

Move the contents of arch/mips/lib/crypto/ into lib/crypto/mips/.

The new code organization makes a lot more sense for how this code
actually works and is developed.  In particular, it makes it possible to
build each algorithm as a single module, with better inlining and dead
code elimination.  For a more detailed explanation, see the patchset
which did this for the CRC library code:
https://lore.kernel.org/r/20250607200454.73587-1-ebiggers@kernel.org/.
Also see the patchset which did this for SHA-512:
https://lore.kernel.org/linux-crypto/20250616014019.415791-1-ebiggers@kernel.org/

This is just a preparatory commit, which does the move to get the files
into their new location but keeps them building the same way as before.
Later commits will make the actual improvements to the way the
arch-optimized code is integrated for each algorithm.

Add a gitignore entry for the removed directory arch/mips/lib/crypto/
so that people don't accidentally commit leftover generated files.

Acked-by: Ard Biesheuvel
Reviewed-by: Martin K. Petersen
Reviewed-by: Sohil Mehta
Link: https://lore.kernel.org/r/20250619191908.134235-4-ebiggers@kernel.org
Signed-off-by: Eric Biggers
---

diff --git a/arch/mips/lib/.gitignore b/arch/mips/lib/.gitignore
new file mode 100644
index 000000000000..647d7a922e68
--- /dev/null
+++ b/arch/mips/lib/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# This now-removed directory used to contain generated files.
+/crypto/
diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile
index 9d75845ef78e..9c024e6d5e54 100644
--- a/arch/mips/lib/Makefile
+++ b/arch/mips/lib/Makefile
@@ -3,8 +3,6 @@
 # Makefile for MIPS-specific library files..
# -obj-y += crypto/ - lib-y += bitops.o csum_partial.o delay.o memcpy.o memset.o \ mips-atomic.o strncpy_user.o \ strnlen_user.o uncached.o diff --git a/arch/mips/lib/crypto/.gitignore b/arch/mips/lib/crypto/.gitignore deleted file mode 100644 index 0d47d4f21c6d..000000000000 --- a/arch/mips/lib/crypto/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -poly1305-core.S diff --git a/arch/mips/lib/crypto/Kconfig b/arch/mips/lib/crypto/Kconfig deleted file mode 100644 index 0670a170c1be..000000000000 --- a/arch/mips/lib/crypto/Kconfig +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -config CRYPTO_CHACHA_MIPS - tristate - depends on CPU_MIPS32_R2 - default CRYPTO_LIB_CHACHA - select CRYPTO_ARCH_HAVE_LIB_CHACHA - -config CRYPTO_POLY1305_MIPS - tristate - default CRYPTO_LIB_POLY1305 - select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/arch/mips/lib/crypto/Makefile b/arch/mips/lib/crypto/Makefile deleted file mode 100644 index 804488c7aded..000000000000 --- a/arch/mips/lib/crypto/Makefile +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only - -obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o -chacha-mips-y := chacha-core.o chacha-glue.o -AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots - -obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o -poly1305-mips-y := poly1305-core.o poly1305-glue.o - -perlasm-flavour-$(CONFIG_32BIT) := o32 -perlasm-flavour-$(CONFIG_64BIT) := 64 - -quiet_cmd_perlasm = PERLASM $@ - cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) - -$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE - $(call if_changed,perlasm) - -targets += poly1305-core.S diff --git a/arch/mips/lib/crypto/chacha-core.S b/arch/mips/lib/crypto/chacha-core.S deleted file mode 100644 index 5755f69cfe00..000000000000 --- a/arch/mips/lib/crypto/chacha-core.S +++ /dev/null @@ -1,497 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Copyright (C) 2016-2018 René van Dorst . All Rights Reserved. - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - */ - -#define MASK_U32 0x3c -#define CHACHA20_BLOCK_SIZE 64 -#define STACK_SIZE 32 - -#define X0 $t0 -#define X1 $t1 -#define X2 $t2 -#define X3 $t3 -#define X4 $t4 -#define X5 $t5 -#define X6 $t6 -#define X7 $t7 -#define X8 $t8 -#define X9 $t9 -#define X10 $v1 -#define X11 $s6 -#define X12 $s5 -#define X13 $s4 -#define X14 $s3 -#define X15 $s2 -/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ -#define T0 $s1 -#define T1 $s0 -#define T(n) T ## n -#define X(n) X ## n - -/* Input arguments */ -#define STATE $a0 -#define OUT $a1 -#define IN $a2 -#define BYTES $a3 - -/* Output argument */ -/* NONCE[0] is kept in a register and not in memory. - * We don't want to touch original value in memory. - * Must be incremented every loop iteration. - */ -#define NONCE_0 $v0 - -/* SAVED_X and SAVED_CA are set in the jump table. - * Use regs which are overwritten on exit else we don't leak clear data. - * They are used to handling the last bytes which are not multiple of 4. 
- */ -#define SAVED_X X15 -#define SAVED_CA $s7 - -#define IS_UNALIGNED $s7 - -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define MSB 0 -#define LSB 3 -#define ROTx rotl -#define ROTR(n) rotr n, 24 -#define CPU_TO_LE32(n) \ - wsbh n; \ - rotr n, 16; -#else -#define MSB 3 -#define LSB 0 -#define ROTx rotr -#define CPU_TO_LE32(n) -#define ROTR(n) -#endif - -#define FOR_EACH_WORD(x) \ - x( 0); \ - x( 1); \ - x( 2); \ - x( 3); \ - x( 4); \ - x( 5); \ - x( 6); \ - x( 7); \ - x( 8); \ - x( 9); \ - x(10); \ - x(11); \ - x(12); \ - x(13); \ - x(14); \ - x(15); - -#define FOR_EACH_WORD_REV(x) \ - x(15); \ - x(14); \ - x(13); \ - x(12); \ - x(11); \ - x(10); \ - x( 9); \ - x( 8); \ - x( 7); \ - x( 6); \ - x( 5); \ - x( 4); \ - x( 3); \ - x( 2); \ - x( 1); \ - x( 0); - -#define PLUS_ONE_0 1 -#define PLUS_ONE_1 2 -#define PLUS_ONE_2 3 -#define PLUS_ONE_3 4 -#define PLUS_ONE_4 5 -#define PLUS_ONE_5 6 -#define PLUS_ONE_6 7 -#define PLUS_ONE_7 8 -#define PLUS_ONE_8 9 -#define PLUS_ONE_9 10 -#define PLUS_ONE_10 11 -#define PLUS_ONE_11 12 -#define PLUS_ONE_12 13 -#define PLUS_ONE_13 14 -#define PLUS_ONE_14 15 -#define PLUS_ONE_15 16 -#define PLUS_ONE(x) PLUS_ONE_ ## x -#define _CONCAT3(a,b,c) a ## b ## c -#define CONCAT3(a,b,c) _CONCAT3(a,b,c) - -#define STORE_UNALIGNED(x) \ -CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ - .if (x != 12); \ - lw T0, (x*4)(STATE); \ - .endif; \ - lwl T1, (x*4)+MSB ## (IN); \ - lwr T1, (x*4)+LSB ## (IN); \ - .if (x == 12); \ - addu X ## x, NONCE_0; \ - .else; \ - addu X ## x, T0; \ - .endif; \ - CPU_TO_LE32(X ## x); \ - xor X ## x, T1; \ - swl X ## x, (x*4)+MSB ## (OUT); \ - swr X ## x, (x*4)+LSB ## (OUT); - -#define STORE_ALIGNED(x) \ -CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ - .if (x != 12); \ - lw T0, (x*4)(STATE); \ - .endif; \ - lw T1, (x*4) ## (IN); \ - .if (x == 12); \ - addu X ## x, NONCE_0; \ - .else; \ - addu X ## x, T0; \ - .endif; \ - CPU_TO_LE32(X ## x); \ - xor X ## x, T1; \ - sw X ## x, (x*4) ## (OUT); - -/* Jump table macro. - * Used for setup and handling the last bytes, which are not multiple of 4. - * X15 is free to store Xn - * Every jumptable entry must be equal in size. - */ -#define JMPTBL_ALIGNED(x) \ -.Lchacha_mips_jmptbl_aligned_ ## x: ; \ - .set noreorder; \ - b .Lchacha_mips_xor_aligned_ ## x ## _b; \ - .if (x == 12); \ - addu SAVED_X, X ## x, NONCE_0; \ - .else; \ - addu SAVED_X, X ## x, SAVED_CA; \ - .endif; \ - .set reorder - -#define JMPTBL_UNALIGNED(x) \ -.Lchacha_mips_jmptbl_unaligned_ ## x: ; \ - .set noreorder; \ - b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ - .if (x == 12); \ - addu SAVED_X, X ## x, NONCE_0; \ - .else; \ - addu SAVED_X, X ## x, SAVED_CA; \ - .endif; \ - .set reorder - -#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ - addu X(A), X(K); \ - addu X(B), X(L); \ - addu X(C), X(M); \ - addu X(D), X(N); \ - xor X(V), X(A); \ - xor X(W), X(B); \ - xor X(Y), X(C); \ - xor X(Z), X(D); \ - rotl X(V), S; \ - rotl X(W), S; \ - rotl X(Y), S; \ - rotl X(Z), S; - -.text -.set reorder -.set noat -.globl chacha_crypt_arch -.ent chacha_crypt_arch -chacha_crypt_arch: - .frame $sp, STACK_SIZE, $ra - - /* Load number of rounds */ - lw $at, 16($sp) - - addiu $sp, -STACK_SIZE - - /* Return bytes = 0. */ - beqz BYTES, .Lchacha_mips_end - - lw NONCE_0, 48(STATE) - - /* Save s0-s7 */ - sw $s0, 0($sp) - sw $s1, 4($sp) - sw $s2, 8($sp) - sw $s3, 12($sp) - sw $s4, 16($sp) - sw $s5, 20($sp) - sw $s6, 24($sp) - sw $s7, 28($sp) - - /* Test IN or OUT is unaligned. 
- * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 - */ - or IS_UNALIGNED, IN, OUT - andi IS_UNALIGNED, 0x3 - - b .Lchacha_rounds_start - -.align 4 -.Loop_chacha_rounds: - addiu IN, CHACHA20_BLOCK_SIZE - addiu OUT, CHACHA20_BLOCK_SIZE - addiu NONCE_0, 1 - -.Lchacha_rounds_start: - lw X0, 0(STATE) - lw X1, 4(STATE) - lw X2, 8(STATE) - lw X3, 12(STATE) - - lw X4, 16(STATE) - lw X5, 20(STATE) - lw X6, 24(STATE) - lw X7, 28(STATE) - lw X8, 32(STATE) - lw X9, 36(STATE) - lw X10, 40(STATE) - lw X11, 44(STATE) - - move X12, NONCE_0 - lw X13, 52(STATE) - lw X14, 56(STATE) - lw X15, 60(STATE) - -.Loop_chacha_xor_rounds: - addiu $at, -2 - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); - bnez $at, .Loop_chacha_xor_rounds - - addiu BYTES, -(CHACHA20_BLOCK_SIZE) - - /* Is data src/dst unaligned? Jump */ - bnez IS_UNALIGNED, .Loop_chacha_unaligned - - /* Set number rounds here to fill delayslot. */ - lw $at, (STACK_SIZE+16)($sp) - - /* BYTES < 0, it has no full block. */ - bltz BYTES, .Lchacha_mips_no_full_block_aligned - - FOR_EACH_WORD_REV(STORE_ALIGNED) - - /* BYTES > 0? Loop again. */ - bgtz BYTES, .Loop_chacha_rounds - - /* Place this here to fill delay slot */ - addiu NONCE_0, 1 - - /* BYTES < 0? Handle last bytes */ - bltz BYTES, .Lchacha_mips_xor_bytes - -.Lchacha_mips_xor_done: - /* Restore used registers */ - lw $s0, 0($sp) - lw $s1, 4($sp) - lw $s2, 8($sp) - lw $s3, 12($sp) - lw $s4, 16($sp) - lw $s5, 20($sp) - lw $s6, 24($sp) - lw $s7, 28($sp) - - /* Write NONCE_0 back to right location in state */ - sw NONCE_0, 48(STATE) - -.Lchacha_mips_end: - addiu $sp, STACK_SIZE - jr $ra - -.Lchacha_mips_no_full_block_aligned: - /* Restore the offset on BYTES */ - addiu BYTES, CHACHA20_BLOCK_SIZE - - /* Get number of full WORDS */ - andi $at, BYTES, MASK_U32 - - /* Load upper half of jump table addr */ - lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) - - /* Calculate lower half jump table offset */ - ins T0, $at, 1, 6 - - /* Add offset to STATE */ - addu T1, STATE, $at - - /* Add lower half jump table addr */ - addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) - - /* Read value from STATE */ - lw SAVED_CA, 0(T1) - - /* Store remaining bytecounter as negative value */ - subu BYTES, $at, BYTES - - jr T0 - - /* Jump table */ - FOR_EACH_WORD(JMPTBL_ALIGNED) - - -.Loop_chacha_unaligned: - /* Set number rounds here to fill delayslot. */ - lw $at, (STACK_SIZE+16)($sp) - - /* BYTES > 0, it has no full block. */ - bltz BYTES, .Lchacha_mips_no_full_block_unaligned - - FOR_EACH_WORD_REV(STORE_UNALIGNED) - - /* BYTES > 0? Loop again. 
*/ - bgtz BYTES, .Loop_chacha_rounds - - /* Write NONCE_0 back to right location in state */ - sw NONCE_0, 48(STATE) - - .set noreorder - /* Fall through to byte handling */ - bgez BYTES, .Lchacha_mips_xor_done -.Lchacha_mips_xor_unaligned_0_b: -.Lchacha_mips_xor_aligned_0_b: - /* Place this here to fill delay slot */ - addiu NONCE_0, 1 - .set reorder - -.Lchacha_mips_xor_bytes: - addu IN, $at - addu OUT, $at - /* First byte */ - lbu T1, 0(IN) - addiu $at, BYTES, 1 - CPU_TO_LE32(SAVED_X) - ROTR(SAVED_X) - xor T1, SAVED_X - sb T1, 0(OUT) - beqz $at, .Lchacha_mips_xor_done - /* Second byte */ - lbu T1, 1(IN) - addiu $at, BYTES, 2 - ROTx SAVED_X, 8 - xor T1, SAVED_X - sb T1, 1(OUT) - beqz $at, .Lchacha_mips_xor_done - /* Third byte */ - lbu T1, 2(IN) - ROTx SAVED_X, 8 - xor T1, SAVED_X - sb T1, 2(OUT) - b .Lchacha_mips_xor_done - -.Lchacha_mips_no_full_block_unaligned: - /* Restore the offset on BYTES */ - addiu BYTES, CHACHA20_BLOCK_SIZE - - /* Get number of full WORDS */ - andi $at, BYTES, MASK_U32 - - /* Load upper half of jump table addr */ - lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) - - /* Calculate lower half jump table offset */ - ins T0, $at, 1, 6 - - /* Add offset to STATE */ - addu T1, STATE, $at - - /* Add lower half jump table addr */ - addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) - - /* Read value from STATE */ - lw SAVED_CA, 0(T1) - - /* Store remaining bytecounter as negative value */ - subu BYTES, $at, BYTES - - jr T0 - - /* Jump table */ - FOR_EACH_WORD(JMPTBL_UNALIGNED) -.end chacha_crypt_arch -.set at - -/* Input arguments - * STATE $a0 - * OUT $a1 - * NROUND $a2 - */ - -#undef X12 -#undef X13 -#undef X14 -#undef X15 - -#define X12 $a3 -#define X13 $at -#define X14 $v0 -#define X15 STATE - -.set noat -.globl hchacha_block_arch -.ent hchacha_block_arch -hchacha_block_arch: - .frame $sp, STACK_SIZE, $ra - - addiu $sp, -STACK_SIZE - - /* Save X11(s6) */ - sw X11, 0($sp) - - lw X0, 0(STATE) - lw X1, 4(STATE) - lw X2, 8(STATE) - lw X3, 12(STATE) - lw X4, 16(STATE) - lw X5, 20(STATE) - lw X6, 24(STATE) - lw X7, 28(STATE) - lw X8, 32(STATE) - lw X9, 36(STATE) - lw X10, 40(STATE) - lw X11, 44(STATE) - lw X12, 48(STATE) - lw X13, 52(STATE) - lw X14, 56(STATE) - lw X15, 60(STATE) - -.Loop_hchacha_xor_rounds: - addiu $a2, -2 - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); - AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); - AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); - AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); - AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); - bnez $a2, .Loop_hchacha_xor_rounds - - /* Restore used register */ - lw X11, 0($sp) - - sw X0, 0(OUT) - sw X1, 4(OUT) - sw X2, 8(OUT) - sw X3, 12(OUT) - sw X12, 16(OUT) - sw X13, 20(OUT) - sw X14, 24(OUT) - sw X15, 28(OUT) - - addiu $sp, STACK_SIZE - jr $ra -.end hchacha_block_arch -.set at diff --git a/arch/mips/lib/crypto/chacha-glue.c b/arch/mips/lib/crypto/chacha-glue.c deleted file mode 100644 index 88c097594eb0..000000000000 --- a/arch/mips/lib/crypto/chacha-glue.c +++ /dev/null @@ -1,29 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ChaCha and HChaCha functions (MIPS optimized) - * - * Copyright (C) 2019 Linaro, Ltd. 
- */ - -#include -#include -#include - -asmlinkage void chacha_crypt_arch(struct chacha_state *state, - u8 *dst, const u8 *src, - unsigned int bytes, int nrounds); -EXPORT_SYMBOL(chacha_crypt_arch); - -asmlinkage void hchacha_block_arch(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); -EXPORT_SYMBOL(hchacha_block_arch); - -bool chacha_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(chacha_is_arch_optimized); - -MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); diff --git a/arch/mips/lib/crypto/poly1305-glue.c b/arch/mips/lib/crypto/poly1305-glue.c deleted file mode 100644 index 764a38a65200..000000000000 --- a/arch/mips/lib/crypto/poly1305-glue.c +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS - * - * Copyright (C) 2019 Linaro Ltd. - */ - -#include -#include -#include -#include -#include - -asmlinkage void poly1305_block_init_arch( - struct poly1305_block_state *state, - const u8 raw_key[POLY1305_BLOCK_SIZE]); -EXPORT_SYMBOL_GPL(poly1305_block_init_arch); -asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, - const u8 *src, u32 len, u32 hibit); -EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, - u8 digest[POLY1305_DIGEST_SIZE], - const u32 nonce[4]); -EXPORT_SYMBOL_GPL(poly1305_emit_arch); - -bool poly1305_is_arch_optimized(void) -{ - return true; -} -EXPORT_SYMBOL(poly1305_is_arch_optimized); - -MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); -MODULE_LICENSE("GPL v2"); diff --git a/arch/mips/lib/crypto/poly1305-mips.pl b/arch/mips/lib/crypto/poly1305-mips.pl deleted file mode 100644 index 399f10c3e385..000000000000 --- a/arch/mips/lib/crypto/poly1305-mips.pl +++ /dev/null @@ -1,1273 +0,0 @@ -#!/usr/bin/env perl -# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause -# -# ==================================================================== -# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL -# project. -# ==================================================================== - -# Poly1305 hash for MIPS. -# -# May 2016 -# -# Numbers are cycles per processed byte with poly1305_blocks alone. -# -# IALU/gcc -# R1x000 ~5.5/+130% (big-endian) -# Octeon II 2.50/+70% (little-endian) -# -# March 2019 -# -# Add 32-bit code path. -# -# October 2019 -# -# Modulo-scheduling reduction allows to omit dependency chain at the -# end of inner loop and improve performance. Also optimize MIPS32R2 -# code path for MIPS 1004K core. Per René von Dorst's suggestions. -# -# IALU/gcc -# R1x000 ~9.8/? (big-endian) -# Octeon II 3.65/+140% (little-endian) -# MT7621/1004K 4.75/? (little-endian) -# -###################################################################### -# There is a number of MIPS ABI in use, O32 and N32/64 are most -# widely used. Then there is a new contender: NUBI. It appears that if -# one picks the latter, it's possible to arrange code in ABI neutral -# manner. Therefore let's stick to NUBI register layout: -# -($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); -($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); -($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); -($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); -# -# The return value is placed in $a0. 
Following coding rules facilitate -# interoperability: -# -# - never ever touch $tp, "thread pointer", former $gp [o32 can be -# excluded from the rule, because it's specified volatile]; -# - copy return value to $t0, former $v0 [or to $a0 if you're adapting -# old code]; -# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; -# -# For reference here is register layout for N32/64 MIPS ABIs: -# -# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); -# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); -# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); -# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); -# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); -# -# -# -###################################################################### - -$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 - -$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; - -if ($flavour =~ /64|n32/i) {{{ -###################################################################### -# 64-bit code path -# - -my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); -my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); - -$code.=<<___; -#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ - defined(_MIPS_ARCH_MIPS64R6)) \\ - && !defined(_MIPS_ARCH_MIPS64R2) -# define _MIPS_ARCH_MIPS64R2 -#endif - -#if defined(_MIPS_ARCH_MIPS64R6) -# define dmultu(rs,rt) -# define mflo(rd,rs,rt) dmulu rd,rs,rt -# define mfhi(rd,rs,rt) dmuhu rd,rs,rt -#else -# define dmultu(rs,rt) dmultu rs,rt -# define mflo(rd,rs,rt) mflo rd -# define mfhi(rd,rs,rt) mfhi rd -#endif - -#ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch -#endif - -#if defined(__MIPSEB__) && !defined(MIPSEB) -# define MIPSEB -#endif - -#ifdef MIPSEB -# define MSB 0 -# define LSB 7 -#else -# define MSB 7 -# define LSB 0 -#endif - -.text -.set noat -.set noreorder - -.align 5 -.globl poly1305_init -.ent poly1305_init -poly1305_init: - .frame $sp,0,$ra - .set reorder - - sd $zero,0($ctx) - sd $zero,8($ctx) - sd $zero,16($ctx) - - beqz $inp,.Lno_key - -#if defined(_MIPS_ARCH_MIPS64R6) - andi $tmp0,$inp,7 # $inp % 8 - dsubu $inp,$inp,$tmp0 # align $inp - sll $tmp0,$tmp0,3 # byte to bit offset - ld $in0,0($inp) - ld $in1,8($inp) - beqz $tmp0,.Laligned_key - ld $tmp2,16($inp) - - subu $tmp1,$zero,$tmp0 -# ifdef MIPSEB - dsllv $in0,$in0,$tmp0 - dsrlv $tmp3,$in1,$tmp1 - dsllv $in1,$in1,$tmp0 - dsrlv $tmp2,$tmp2,$tmp1 -# else - dsrlv $in0,$in0,$tmp0 - dsllv $tmp3,$in1,$tmp1 - dsrlv $in1,$in1,$tmp0 - dsllv $tmp2,$tmp2,$tmp1 -# endif - or $in0,$in0,$tmp3 - or $in1,$in1,$tmp2 -.Laligned_key: -#else - ldl $in0,0+MSB($inp) - ldl $in1,8+MSB($inp) - ldr $in0,0+LSB($inp) - ldr $in1,8+LSB($inp) -#endif -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS64R2) - dsbh $in0,$in0 # byte swap - dsbh $in1,$in1 - dshd $in0,$in0 - dshd $in1,$in1 -# else - ori $tmp0,$zero,0xFF - dsll $tmp2,$tmp0,32 - or $tmp0,$tmp2 # 0x000000FF000000FF - - and $tmp1,$in0,$tmp0 # byte swap - and $tmp3,$in1,$tmp0 - dsrl $tmp2,$in0,24 - dsrl $tmp4,$in1,24 - dsll $tmp1,24 - dsll $tmp3,24 - and $tmp2,$tmp0 - and $tmp4,$tmp0 - dsll $tmp0,8 # 0x0000FF000000FF00 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - and $tmp2,$in0,$tmp0 - and $tmp4,$in1,$tmp0 - dsrl $in0,8 - dsrl $in1,8 - dsll $tmp2,8 - dsll $tmp4,8 - and $in0,$tmp0 - and $in1,$tmp0 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - or $in0,$tmp1 - or $in1,$tmp3 - dsrl $tmp1,$in0,32 - dsrl $tmp3,$in1,32 - dsll $in0,32 - dsll $in1,32 - or $in0,$tmp1 - 
or $in1,$tmp3 -# endif -#endif - li $tmp0,1 - dsll $tmp0,32 # 0x0000000100000000 - daddiu $tmp0,-63 # 0x00000000ffffffc1 - dsll $tmp0,28 # 0x0ffffffc10000000 - daddiu $tmp0,-1 # 0x0ffffffc0fffffff - - and $in0,$tmp0 - daddiu $tmp0,-3 # 0x0ffffffc0ffffffc - and $in1,$tmp0 - - sd $in0,24($ctx) - dsrl $tmp0,$in1,2 - sd $in1,32($ctx) - daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) - sd $tmp0,40($ctx) - -.Lno_key: - li $v0,0 # return 0 - jr $ra -.end poly1305_init -___ -{ -my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; - -my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = - ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); -my ($shr,$shl) = ($s6,$s7); # used on R6 - -$code.=<<___; -.align 5 -.globl poly1305_blocks -.ent poly1305_blocks -poly1305_blocks: - .set noreorder - dsrl $len,4 # number of complete blocks - bnez $len,poly1305_blocks_internal - nop - jr $ra - nop -.end poly1305_blocks - -.align 5 -.ent poly1305_blocks_internal -poly1305_blocks_internal: - .set noreorder -#if defined(_MIPS_ARCH_MIPS64R6) - .frame $sp,8*8,$ra - .mask $SAVED_REGS_MASK|0x000c0000,-8 - dsubu $sp,8*8 - sd $s7,56($sp) - sd $s6,48($sp) -#else - .frame $sp,6*8,$ra - .mask $SAVED_REGS_MASK,-8 - dsubu $sp,6*8 -#endif - sd $s5,40($sp) - sd $s4,32($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue - sd $s3,24($sp) - sd $s2,16($sp) - sd $s1,8($sp) - sd $s0,0($sp) -___ -$code.=<<___; - .set reorder - -#if defined(_MIPS_ARCH_MIPS64R6) - andi $shr,$inp,7 - dsubu $inp,$inp,$shr # align $inp - sll $shr,$shr,3 # byte to bit offset - subu $shl,$zero,$shr -#endif - - ld $h0,0($ctx) # load hash value - ld $h1,8($ctx) - ld $h2,16($ctx) - - ld $r0,24($ctx) # load key - ld $r1,32($ctx) - ld $rs1,40($ctx) - - dsll $len,4 - daddu $len,$inp # end of buffer - b .Loop - -.align 4 -.Loop: -#if defined(_MIPS_ARCH_MIPS64R6) - ld $in0,0($inp) # load input - ld $in1,8($inp) - beqz $shr,.Laligned_inp - - ld $tmp2,16($inp) -# ifdef MIPSEB - dsllv $in0,$in0,$shr - dsrlv $tmp3,$in1,$shl - dsllv $in1,$in1,$shr - dsrlv $tmp2,$tmp2,$shl -# else - dsrlv $in0,$in0,$shr - dsllv $tmp3,$in1,$shl - dsrlv $in1,$in1,$shr - dsllv $tmp2,$tmp2,$shl -# endif - or $in0,$in0,$tmp3 - or $in1,$in1,$tmp2 -.Laligned_inp: -#else - ldl $in0,0+MSB($inp) # load input - ldl $in1,8+MSB($inp) - ldr $in0,0+LSB($inp) - ldr $in1,8+LSB($inp) -#endif - daddiu $inp,16 -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS64R2) - dsbh $in0,$in0 # byte swap - dsbh $in1,$in1 - dshd $in0,$in0 - dshd $in1,$in1 -# else - ori $tmp0,$zero,0xFF - dsll $tmp2,$tmp0,32 - or $tmp0,$tmp2 # 0x000000FF000000FF - - and $tmp1,$in0,$tmp0 # byte swap - and $tmp3,$in1,$tmp0 - dsrl $tmp2,$in0,24 - dsrl $tmp4,$in1,24 - dsll $tmp1,24 - dsll $tmp3,24 - and $tmp2,$tmp0 - and $tmp4,$tmp0 - dsll $tmp0,8 # 0x0000FF000000FF00 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - and $tmp2,$in0,$tmp0 - and $tmp4,$in1,$tmp0 - dsrl $in0,8 - dsrl $in1,8 - dsll $tmp2,8 - dsll $tmp4,8 - and $in0,$tmp0 - and $in1,$tmp0 - or $tmp1,$tmp2 - or $tmp3,$tmp4 - or $in0,$tmp1 - or $in1,$tmp3 - dsrl $tmp1,$in0,32 - dsrl $tmp3,$in1,32 - dsll $in0,32 - dsll $in1,32 - or $in0,$tmp1 - or $in1,$tmp3 -# endif -#endif - dsrl $tmp1,$h2,2 # modulo-scheduled reduction - andi $h2,$h2,3 - dsll $tmp0,$tmp1,2 - - daddu $d0,$h0,$in0 # accumulate input - daddu $tmp1,$tmp0 - sltu $tmp0,$d0,$h0 - daddu $d0,$d0,$tmp1 # ... 
and residue - sltu $tmp1,$d0,$tmp1 - daddu $d1,$h1,$in1 - daddu $tmp0,$tmp1 - sltu $tmp1,$d1,$h1 - daddu $d1,$tmp0 - - dmultu ($r0,$d0) # h0*r0 - daddu $d2,$h2,$padbit - sltu $tmp0,$d1,$tmp0 - mflo ($h0,$r0,$d0) - mfhi ($h1,$r0,$d0) - - dmultu ($rs1,$d1) # h1*5*r1 - daddu $d2,$tmp1 - daddu $d2,$tmp0 - mflo ($tmp0,$rs1,$d1) - mfhi ($tmp1,$rs1,$d1) - - dmultu ($r1,$d0) # h0*r1 - mflo ($tmp2,$r1,$d0) - mfhi ($h2,$r1,$d0) - daddu $h0,$tmp0 - daddu $h1,$tmp1 - sltu $tmp0,$h0,$tmp0 - - dmultu ($r0,$d1) # h1*r0 - daddu $h1,$tmp0 - daddu $h1,$tmp2 - mflo ($tmp0,$r0,$d1) - mfhi ($tmp1,$r0,$d1) - - dmultu ($rs1,$d2) # h2*5*r1 - sltu $tmp2,$h1,$tmp2 - daddu $h2,$tmp2 - mflo ($tmp2,$rs1,$d2) - - dmultu ($r0,$d2) # h2*r0 - daddu $h1,$tmp0 - daddu $h2,$tmp1 - mflo ($tmp3,$r0,$d2) - sltu $tmp0,$h1,$tmp0 - daddu $h2,$tmp0 - - daddu $h1,$tmp2 - sltu $tmp2,$h1,$tmp2 - daddu $h2,$tmp2 - daddu $h2,$tmp3 - - bne $inp,$len,.Loop - - sd $h0,0($ctx) # store hash value - sd $h1,8($ctx) - sd $h2,16($ctx) - - .set noreorder -#if defined(_MIPS_ARCH_MIPS64R6) - ld $s7,56($sp) - ld $s6,48($sp) -#endif - ld $s5,40($sp) # epilogue - ld $s4,32($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue - ld $s3,24($sp) - ld $s2,16($sp) - ld $s1,8($sp) - ld $s0,0($sp) -___ -$code.=<<___; - jr $ra -#if defined(_MIPS_ARCH_MIPS64R6) - daddu $sp,8*8 -#else - daddu $sp,6*8 -#endif -.end poly1305_blocks_internal -___ -} -{ -my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); - -$code.=<<___; -.align 5 -.globl poly1305_emit -.ent poly1305_emit -poly1305_emit: - .frame $sp,0,$ra - .set reorder - - ld $tmp2,16($ctx) - ld $tmp0,0($ctx) - ld $tmp1,8($ctx) - - li $in0,-4 # final reduction - dsrl $in1,$tmp2,2 - and $in0,$tmp2 - andi $tmp2,$tmp2,3 - daddu $in0,$in1 - - daddu $tmp0,$tmp0,$in0 - sltu $in1,$tmp0,$in0 - daddiu $in0,$tmp0,5 # compare to modulus - daddu $tmp1,$tmp1,$in1 - sltiu $tmp3,$in0,5 - sltu $tmp4,$tmp1,$in1 - daddu $in1,$tmp1,$tmp3 - daddu $tmp2,$tmp2,$tmp4 - sltu $tmp3,$in1,$tmp3 - daddu $tmp2,$tmp2,$tmp3 - - dsrl $tmp2,2 # see if it carried/borrowed - dsubu $tmp2,$zero,$tmp2 - - xor $in0,$tmp0 - xor $in1,$tmp1 - and $in0,$tmp2 - and $in1,$tmp2 - xor $in0,$tmp0 - xor $in1,$tmp1 - - lwu $tmp0,0($nonce) # load nonce - lwu $tmp1,4($nonce) - lwu $tmp2,8($nonce) - lwu $tmp3,12($nonce) - dsll $tmp1,32 - dsll $tmp3,32 - or $tmp0,$tmp1 - or $tmp2,$tmp3 - - daddu $in0,$tmp0 # accumulate nonce - daddu $in1,$tmp2 - sltu $tmp0,$in0,$tmp0 - daddu $in1,$tmp0 - - dsrl $tmp0,$in0,8 # write mac value - dsrl $tmp1,$in0,16 - dsrl $tmp2,$in0,24 - sb $in0,0($mac) - dsrl $tmp3,$in0,32 - sb $tmp0,1($mac) - dsrl $tmp0,$in0,40 - sb $tmp1,2($mac) - dsrl $tmp1,$in0,48 - sb $tmp2,3($mac) - dsrl $tmp2,$in0,56 - sb $tmp3,4($mac) - dsrl $tmp3,$in1,8 - sb $tmp0,5($mac) - dsrl $tmp0,$in1,16 - sb $tmp1,6($mac) - dsrl $tmp1,$in1,24 - sb $tmp2,7($mac) - - sb $in1,8($mac) - dsrl $tmp2,$in1,32 - sb $tmp3,9($mac) - dsrl $tmp3,$in1,40 - sb $tmp0,10($mac) - dsrl $tmp0,$in1,48 - sb $tmp1,11($mac) - dsrl $tmp1,$in1,56 - sb $tmp2,12($mac) - sb $tmp3,13($mac) - sb $tmp0,14($mac) - sb $tmp1,15($mac) - - jr $ra -.end poly1305_emit -.rdata -.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" -.align 2 -___ -} -}}} else {{{ -###################################################################### -# 32-bit code path -# - -my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); -my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = - ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); - -$code.=<<___; -#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ - 
defined(_MIPS_ARCH_MIPS32R6)) \\ - && !defined(_MIPS_ARCH_MIPS32R2) -# define _MIPS_ARCH_MIPS32R2 -#endif - -#if defined(_MIPS_ARCH_MIPS32R6) -# define multu(rs,rt) -# define mflo(rd,rs,rt) mulu rd,rs,rt -# define mfhi(rd,rs,rt) muhu rd,rs,rt -#else -# define multu(rs,rt) multu rs,rt -# define mflo(rd,rs,rt) mflo rd -# define mfhi(rd,rs,rt) mfhi rd -#endif - -#ifdef __KERNEL__ -# define poly1305_init poly1305_block_init_arch -# define poly1305_blocks poly1305_blocks_arch -# define poly1305_emit poly1305_emit_arch -#endif - -#if defined(__MIPSEB__) && !defined(MIPSEB) -# define MIPSEB -#endif - -#ifdef MIPSEB -# define MSB 0 -# define LSB 3 -#else -# define MSB 3 -# define LSB 0 -#endif - -.text -.set noat -.set noreorder - -.align 5 -.globl poly1305_init -.ent poly1305_init -poly1305_init: - .frame $sp,0,$ra - .set reorder - - sw $zero,0($ctx) - sw $zero,4($ctx) - sw $zero,8($ctx) - sw $zero,12($ctx) - sw $zero,16($ctx) - - beqz $inp,.Lno_key - -#if defined(_MIPS_ARCH_MIPS32R6) - andi $tmp0,$inp,3 # $inp % 4 - subu $inp,$inp,$tmp0 # align $inp - sll $tmp0,$tmp0,3 # byte to bit offset - lw $in0,0($inp) - lw $in1,4($inp) - lw $in2,8($inp) - lw $in3,12($inp) - beqz $tmp0,.Laligned_key - - lw $tmp2,16($inp) - subu $tmp1,$zero,$tmp0 -# ifdef MIPSEB - sllv $in0,$in0,$tmp0 - srlv $tmp3,$in1,$tmp1 - sllv $in1,$in1,$tmp0 - or $in0,$in0,$tmp3 - srlv $tmp3,$in2,$tmp1 - sllv $in2,$in2,$tmp0 - or $in1,$in1,$tmp3 - srlv $tmp3,$in3,$tmp1 - sllv $in3,$in3,$tmp0 - or $in2,$in2,$tmp3 - srlv $tmp2,$tmp2,$tmp1 - or $in3,$in3,$tmp2 -# else - srlv $in0,$in0,$tmp0 - sllv $tmp3,$in1,$tmp1 - srlv $in1,$in1,$tmp0 - or $in0,$in0,$tmp3 - sllv $tmp3,$in2,$tmp1 - srlv $in2,$in2,$tmp0 - or $in1,$in1,$tmp3 - sllv $tmp3,$in3,$tmp1 - srlv $in3,$in3,$tmp0 - or $in2,$in2,$tmp3 - sllv $tmp2,$tmp2,$tmp1 - or $in3,$in3,$tmp2 -# endif -.Laligned_key: -#else - lwl $in0,0+MSB($inp) - lwl $in1,4+MSB($inp) - lwl $in2,8+MSB($inp) - lwl $in3,12+MSB($inp) - lwr $in0,0+LSB($inp) - lwr $in1,4+LSB($inp) - lwr $in2,8+LSB($inp) - lwr $in3,12+LSB($inp) -#endif -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS32R2) - wsbh $in0,$in0 # byte swap - wsbh $in1,$in1 - wsbh $in2,$in2 - wsbh $in3,$in3 - rotr $in0,$in0,16 - rotr $in1,$in1,16 - rotr $in2,$in2,16 - rotr $in3,$in3,16 -# else - srl $tmp0,$in0,24 # byte swap - srl $tmp1,$in0,8 - andi $tmp2,$in0,0xFF00 - sll $in0,$in0,24 - andi $tmp1,0xFF00 - sll $tmp2,$tmp2,8 - or $in0,$tmp0 - srl $tmp0,$in1,24 - or $tmp1,$tmp2 - srl $tmp2,$in1,8 - or $in0,$tmp1 - andi $tmp1,$in1,0xFF00 - sll $in1,$in1,24 - andi $tmp2,0xFF00 - sll $tmp1,$tmp1,8 - or $in1,$tmp0 - srl $tmp0,$in2,24 - or $tmp2,$tmp1 - srl $tmp1,$in2,8 - or $in1,$tmp2 - andi $tmp2,$in2,0xFF00 - sll $in2,$in2,24 - andi $tmp1,0xFF00 - sll $tmp2,$tmp2,8 - or $in2,$tmp0 - srl $tmp0,$in3,24 - or $tmp1,$tmp2 - srl $tmp2,$in3,8 - or $in2,$tmp1 - andi $tmp1,$in3,0xFF00 - sll $in3,$in3,24 - andi $tmp2,0xFF00 - sll $tmp1,$tmp1,8 - or $in3,$tmp0 - or $tmp2,$tmp1 - or $in3,$tmp2 -# endif -#endif - lui $tmp0,0x0fff - ori $tmp0,0xffff # 0x0fffffff - and $in0,$in0,$tmp0 - subu $tmp0,3 # 0x0ffffffc - and $in1,$in1,$tmp0 - and $in2,$in2,$tmp0 - and $in3,$in3,$tmp0 - - sw $in0,20($ctx) - sw $in1,24($ctx) - sw $in2,28($ctx) - sw $in3,32($ctx) - - srl $tmp1,$in1,2 - srl $tmp2,$in2,2 - srl $tmp3,$in3,2 - addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) - addu $in2,$in2,$tmp2 - addu $in3,$in3,$tmp3 - sw $in1,36($ctx) - sw $in2,40($ctx) - sw $in3,44($ctx) -.Lno_key: - li $v0,0 - jr $ra -.end poly1305_init -___ -{ -my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0x00fff000" : "0x00ff0000"; - -my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = - ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); -my ($d0,$d1,$d2,$d3) = - ($a4,$a5,$a6,$a7); -my $shr = $t2; # used on R6 -my $one = $t2; # used on R2 - -$code.=<<___; -.globl poly1305_blocks -.align 5 -.ent poly1305_blocks -poly1305_blocks: - .frame $sp,16*4,$ra - .mask $SAVED_REGS_MASK,-4 - .set noreorder - subu $sp, $sp,4*12 - sw $s11,4*11($sp) - sw $s10,4*10($sp) - sw $s9, 4*9($sp) - sw $s8, 4*8($sp) - sw $s7, 4*7($sp) - sw $s6, 4*6($sp) - sw $s5, 4*5($sp) - sw $s4, 4*4($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue - sw $s3, 4*3($sp) - sw $s2, 4*2($sp) - sw $s1, 4*1($sp) - sw $s0, 4*0($sp) -___ -$code.=<<___; - .set reorder - - srl $len,4 # number of complete blocks - li $one,1 - beqz $len,.Labort - -#if defined(_MIPS_ARCH_MIPS32R6) - andi $shr,$inp,3 - subu $inp,$inp,$shr # align $inp - sll $shr,$shr,3 # byte to bit offset -#endif - - lw $h0,0($ctx) # load hash value - lw $h1,4($ctx) - lw $h2,8($ctx) - lw $h3,12($ctx) - lw $h4,16($ctx) - - lw $r0,20($ctx) # load key - lw $r1,24($ctx) - lw $r2,28($ctx) - lw $r3,32($ctx) - lw $rs1,36($ctx) - lw $rs2,40($ctx) - lw $rs3,44($ctx) - - sll $len,4 - addu $len,$len,$inp # end of buffer - b .Loop - -.align 4 -.Loop: -#if defined(_MIPS_ARCH_MIPS32R6) - lw $d0,0($inp) # load input - lw $d1,4($inp) - lw $d2,8($inp) - lw $d3,12($inp) - beqz $shr,.Laligned_inp - - lw $t0,16($inp) - subu $t1,$zero,$shr -# ifdef MIPSEB - sllv $d0,$d0,$shr - srlv $at,$d1,$t1 - sllv $d1,$d1,$shr - or $d0,$d0,$at - srlv $at,$d2,$t1 - sllv $d2,$d2,$shr - or $d1,$d1,$at - srlv $at,$d3,$t1 - sllv $d3,$d3,$shr - or $d2,$d2,$at - srlv $t0,$t0,$t1 - or $d3,$d3,$t0 -# else - srlv $d0,$d0,$shr - sllv $at,$d1,$t1 - srlv $d1,$d1,$shr - or $d0,$d0,$at - sllv $at,$d2,$t1 - srlv $d2,$d2,$shr - or $d1,$d1,$at - sllv $at,$d3,$t1 - srlv $d3,$d3,$shr - or $d2,$d2,$at - sllv $t0,$t0,$t1 - or $d3,$d3,$t0 -# endif -.Laligned_inp: -#else - lwl $d0,0+MSB($inp) # load input - lwl $d1,4+MSB($inp) - lwl $d2,8+MSB($inp) - lwl $d3,12+MSB($inp) - lwr $d0,0+LSB($inp) - lwr $d1,4+LSB($inp) - lwr $d2,8+LSB($inp) - lwr $d3,12+LSB($inp) -#endif -#ifdef MIPSEB -# if defined(_MIPS_ARCH_MIPS32R2) - wsbh $d0,$d0 # byte swap - wsbh $d1,$d1 - wsbh $d2,$d2 - wsbh $d3,$d3 - rotr $d0,$d0,16 - rotr $d1,$d1,16 - rotr $d2,$d2,16 - rotr $d3,$d3,16 -# else - srl $at,$d0,24 # byte swap - srl $t0,$d0,8 - andi $t1,$d0,0xFF00 - sll $d0,$d0,24 - andi $t0,0xFF00 - sll $t1,$t1,8 - or $d0,$at - srl $at,$d1,24 - or $t0,$t1 - srl $t1,$d1,8 - or $d0,$t0 - andi $t0,$d1,0xFF00 - sll $d1,$d1,24 - andi $t1,0xFF00 - sll $t0,$t0,8 - or $d1,$at - srl $at,$d2,24 - or $t1,$t0 - srl $t0,$d2,8 - or $d1,$t1 - andi $t1,$d2,0xFF00 - sll $d2,$d2,24 - andi $t0,0xFF00 - sll $t1,$t1,8 - or $d2,$at - srl $at,$d3,24 - or $t0,$t1 - srl $t1,$d3,8 - or $d2,$t0 - andi $t0,$d3,0xFF00 - sll $d3,$d3,24 - andi $t1,0xFF00 - sll $t0,$t0,8 - or $d3,$at - or $t1,$t0 - or $d3,$t1 -# endif -#endif - srl $t0,$h4,2 # modulo-scheduled reduction - andi $h4,$h4,3 - sll $at,$t0,2 - - addu $d0,$d0,$h0 # accumulate input - addu $t0,$t0,$at - sltu $h0,$d0,$h0 - addu $d0,$d0,$t0 # ... 
and residue - sltu $at,$d0,$t0 - - addu $d1,$d1,$h1 - addu $h0,$h0,$at # carry - sltu $h1,$d1,$h1 - addu $d1,$d1,$h0 - sltu $h0,$d1,$h0 - - addu $d2,$d2,$h2 - addu $h1,$h1,$h0 # carry - sltu $h2,$d2,$h2 - addu $d2,$d2,$h1 - sltu $h1,$d2,$h1 - - addu $d3,$d3,$h3 - addu $h2,$h2,$h1 # carry - sltu $h3,$d3,$h3 - addu $d3,$d3,$h2 - -#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) - multu $r0,$d0 # d0*r0 - sltu $h2,$d3,$h2 - maddu $rs3,$d1 # d1*s3 - addu $h3,$h3,$h2 # carry - maddu $rs2,$d2 # d2*s2 - addu $h4,$h4,$padbit - maddu $rs1,$d3 # d3*s1 - addu $h4,$h4,$h3 - mfhi $at - mflo $h0 - - multu $r1,$d0 # d0*r1 - maddu $r0,$d1 # d1*r0 - maddu $rs3,$d2 # d2*s3 - maddu $rs2,$d3 # d3*s2 - maddu $rs1,$h4 # h4*s1 - maddu $at,$one # hi*1 - mfhi $at - mflo $h1 - - multu $r2,$d0 # d0*r2 - maddu $r1,$d1 # d1*r1 - maddu $r0,$d2 # d2*r0 - maddu $rs3,$d3 # d3*s3 - maddu $rs2,$h4 # h4*s2 - maddu $at,$one # hi*1 - mfhi $at - mflo $h2 - - mul $t0,$r0,$h4 # h4*r0 - - multu $r3,$d0 # d0*r3 - maddu $r2,$d1 # d1*r2 - maddu $r1,$d2 # d2*r1 - maddu $r0,$d3 # d3*r0 - maddu $rs3,$h4 # h4*s3 - maddu $at,$one # hi*1 - mfhi $at - mflo $h3 - - addiu $inp,$inp,16 - - addu $h4,$t0,$at -#else - multu ($r0,$d0) # d0*r0 - mflo ($h0,$r0,$d0) - mfhi ($h1,$r0,$d0) - - sltu $h2,$d3,$h2 - addu $h3,$h3,$h2 # carry - - multu ($rs3,$d1) # d1*s3 - mflo ($at,$rs3,$d1) - mfhi ($t0,$rs3,$d1) - - addu $h4,$h4,$padbit - addiu $inp,$inp,16 - addu $h4,$h4,$h3 - - multu ($rs2,$d2) # d2*s2 - mflo ($a3,$rs2,$d2) - mfhi ($t1,$rs2,$d2) - addu $h0,$h0,$at - addu $h1,$h1,$t0 - multu ($rs1,$d3) # d3*s1 - sltu $at,$h0,$at - addu $h1,$h1,$at - - mflo ($at,$rs1,$d3) - mfhi ($t0,$rs1,$d3) - addu $h0,$h0,$a3 - addu $h1,$h1,$t1 - multu ($r1,$d0) # d0*r1 - sltu $a3,$h0,$a3 - addu $h1,$h1,$a3 - - - mflo ($a3,$r1,$d0) - mfhi ($h2,$r1,$d0) - addu $h0,$h0,$at - addu $h1,$h1,$t0 - multu ($r0,$d1) # d1*r0 - sltu $at,$h0,$at - addu $h1,$h1,$at - - mflo ($at,$r0,$d1) - mfhi ($t0,$r0,$d1) - addu $h1,$h1,$a3 - sltu $a3,$h1,$a3 - multu ($rs3,$d2) # d2*s3 - addu $h2,$h2,$a3 - - mflo ($a3,$rs3,$d2) - mfhi ($t1,$rs3,$d2) - addu $h1,$h1,$at - addu $h2,$h2,$t0 - multu ($rs2,$d3) # d3*s2 - sltu $at,$h1,$at - addu $h2,$h2,$at - - mflo ($at,$rs2,$d3) - mfhi ($t0,$rs2,$d3) - addu $h1,$h1,$a3 - addu $h2,$h2,$t1 - multu ($rs1,$h4) # h4*s1 - sltu $a3,$h1,$a3 - addu $h2,$h2,$a3 - - mflo ($a3,$rs1,$h4) - addu $h1,$h1,$at - addu $h2,$h2,$t0 - multu ($r2,$d0) # d0*r2 - sltu $at,$h1,$at - addu $h2,$h2,$at - - - mflo ($at,$r2,$d0) - mfhi ($h3,$r2,$d0) - addu $h1,$h1,$a3 - sltu $a3,$h1,$a3 - multu ($r1,$d1) # d1*r1 - addu $h2,$h2,$a3 - - mflo ($a3,$r1,$d1) - mfhi ($t1,$r1,$d1) - addu $h2,$h2,$at - sltu $at,$h2,$at - multu ($r0,$d2) # d2*r0 - addu $h3,$h3,$at - - mflo ($at,$r0,$d2) - mfhi ($t0,$r0,$d2) - addu $h2,$h2,$a3 - addu $h3,$h3,$t1 - multu ($rs3,$d3) # d3*s3 - sltu $a3,$h2,$a3 - addu $h3,$h3,$a3 - - mflo ($a3,$rs3,$d3) - mfhi ($t1,$rs3,$d3) - addu $h2,$h2,$at - addu $h3,$h3,$t0 - multu ($rs2,$h4) # h4*s2 - sltu $at,$h2,$at - addu $h3,$h3,$at - - mflo ($at,$rs2,$h4) - addu $h2,$h2,$a3 - addu $h3,$h3,$t1 - multu ($r3,$d0) # d0*r3 - sltu $a3,$h2,$a3 - addu $h3,$h3,$a3 - - - mflo ($a3,$r3,$d0) - mfhi ($t1,$r3,$d0) - addu $h2,$h2,$at - sltu $at,$h2,$at - multu ($r2,$d1) # d1*r2 - addu $h3,$h3,$at - - mflo ($at,$r2,$d1) - mfhi ($t0,$r2,$d1) - addu $h3,$h3,$a3 - sltu $a3,$h3,$a3 - multu ($r0,$d3) # d3*r0 - addu $t1,$t1,$a3 - - mflo ($a3,$r0,$d3) - mfhi ($d3,$r0,$d3) - addu $h3,$h3,$at - addu $t1,$t1,$t0 - multu ($r1,$d2) # d2*r1 - sltu $at,$h3,$at - addu $t1,$t1,$at - 
- mflo ($at,$r1,$d2) - mfhi ($t0,$r1,$d2) - addu $h3,$h3,$a3 - addu $t1,$t1,$d3 - multu ($rs3,$h4) # h4*s3 - sltu $a3,$h3,$a3 - addu $t1,$t1,$a3 - - mflo ($a3,$rs3,$h4) - addu $h3,$h3,$at - addu $t1,$t1,$t0 - multu ($r0,$h4) # h4*r0 - sltu $at,$h3,$at - addu $t1,$t1,$at - - - mflo ($h4,$r0,$h4) - addu $h3,$h3,$a3 - sltu $a3,$h3,$a3 - addu $t1,$t1,$a3 - addu $h4,$h4,$t1 - - li $padbit,1 # if we loop, padbit is 1 -#endif - bne $inp,$len,.Loop - - sw $h0,0($ctx) # store hash value - sw $h1,4($ctx) - sw $h2,8($ctx) - sw $h3,12($ctx) - sw $h4,16($ctx) - - .set noreorder -.Labort: - lw $s11,4*11($sp) - lw $s10,4*10($sp) - lw $s9, 4*9($sp) - lw $s8, 4*8($sp) - lw $s7, 4*7($sp) - lw $s6, 4*6($sp) - lw $s5, 4*5($sp) - lw $s4, 4*4($sp) -___ -$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue - lw $s3, 4*3($sp) - lw $s2, 4*2($sp) - lw $s1, 4*1($sp) - lw $s0, 4*0($sp) -___ -$code.=<<___; - jr $ra - addu $sp,$sp,4*12 -.end poly1305_blocks -___ -} -{ -my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); - -$code.=<<___; -.align 5 -.globl poly1305_emit -.ent poly1305_emit -poly1305_emit: - .frame $sp,0,$ra - .set reorder - - lw $tmp4,16($ctx) - lw $tmp0,0($ctx) - lw $tmp1,4($ctx) - lw $tmp2,8($ctx) - lw $tmp3,12($ctx) - - li $in0,-4 # final reduction - srl $ctx,$tmp4,2 - and $in0,$in0,$tmp4 - andi $tmp4,$tmp4,3 - addu $ctx,$ctx,$in0 - - addu $tmp0,$tmp0,$ctx - sltu $ctx,$tmp0,$ctx - addiu $in0,$tmp0,5 # compare to modulus - addu $tmp1,$tmp1,$ctx - sltiu $in1,$in0,5 - sltu $ctx,$tmp1,$ctx - addu $in1,$in1,$tmp1 - addu $tmp2,$tmp2,$ctx - sltu $in2,$in1,$tmp1 - sltu $ctx,$tmp2,$ctx - addu $in2,$in2,$tmp2 - addu $tmp3,$tmp3,$ctx - sltu $in3,$in2,$tmp2 - sltu $ctx,$tmp3,$ctx - addu $in3,$in3,$tmp3 - addu $tmp4,$tmp4,$ctx - sltu $ctx,$in3,$tmp3 - addu $ctx,$tmp4 - - srl $ctx,2 # see if it carried/borrowed - subu $ctx,$zero,$ctx - - xor $in0,$tmp0 - xor $in1,$tmp1 - xor $in2,$tmp2 - xor $in3,$tmp3 - and $in0,$ctx - and $in1,$ctx - and $in2,$ctx - and $in3,$ctx - xor $in0,$tmp0 - xor $in1,$tmp1 - xor $in2,$tmp2 - xor $in3,$tmp3 - - lw $tmp0,0($nonce) # load nonce - lw $tmp1,4($nonce) - lw $tmp2,8($nonce) - lw $tmp3,12($nonce) - - addu $in0,$tmp0 # accumulate nonce - sltu $ctx,$in0,$tmp0 - - addu $in1,$tmp1 - sltu $tmp1,$in1,$tmp1 - addu $in1,$ctx - sltu $ctx,$in1,$ctx - addu $ctx,$tmp1 - - addu $in2,$tmp2 - sltu $tmp2,$in2,$tmp2 - addu $in2,$ctx - sltu $ctx,$in2,$ctx - addu $ctx,$tmp2 - - addu $in3,$tmp3 - addu $in3,$ctx - - srl $tmp0,$in0,8 # write mac value - srl $tmp1,$in0,16 - srl $tmp2,$in0,24 - sb $in0, 0($mac) - sb $tmp0,1($mac) - srl $tmp0,$in1,8 - sb $tmp1,2($mac) - srl $tmp1,$in1,16 - sb $tmp2,3($mac) - srl $tmp2,$in1,24 - sb $in1, 4($mac) - sb $tmp0,5($mac) - srl $tmp0,$in2,8 - sb $tmp1,6($mac) - srl $tmp1,$in2,16 - sb $tmp2,7($mac) - srl $tmp2,$in2,24 - sb $in2, 8($mac) - sb $tmp0,9($mac) - srl $tmp0,$in3,8 - sb $tmp1,10($mac) - srl $tmp1,$in3,16 - sb $tmp2,11($mac) - srl $tmp2,$in3,24 - sb $in3, 12($mac) - sb $tmp0,13($mac) - sb $tmp1,14($mac) - sb $tmp2,15($mac) - - jr $ra -.end poly1305_emit -.rdata -.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" -.align 2 -___ -} -}}} - -$output=pop and open STDOUT,">$output"; -print $code; -close STDOUT; diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index fdeb91bf0032..43c44316fbbd 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -196,7 +196,7 @@ if ARM64 source "lib/crypto/arm64/Kconfig" endif if MIPS -source "arch/mips/lib/crypto/Kconfig" +source "lib/crypto/mips/Kconfig" endif if PPC source 
"arch/powerpc/lib/crypto/Kconfig" diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 19e9880c5d5f..f54d2f3edc40 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -109,3 +109,4 @@ libsm3-y := sm3.o obj-$(CONFIG_ARM) += arm/ obj-$(CONFIG_ARM64) += arm64/ +obj-$(CONFIG_MIPS) += mips/ diff --git a/lib/crypto/mips/.gitignore b/lib/crypto/mips/.gitignore new file mode 100644 index 000000000000..0d47d4f21c6d --- /dev/null +++ b/lib/crypto/mips/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +poly1305-core.S diff --git a/lib/crypto/mips/Kconfig b/lib/crypto/mips/Kconfig new file mode 100644 index 000000000000..0670a170c1be --- /dev/null +++ b/lib/crypto/mips/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config CRYPTO_CHACHA_MIPS + tristate + depends on CPU_MIPS32_R2 + default CRYPTO_LIB_CHACHA + select CRYPTO_ARCH_HAVE_LIB_CHACHA + +config CRYPTO_POLY1305_MIPS + tristate + default CRYPTO_LIB_POLY1305 + select CRYPTO_ARCH_HAVE_LIB_POLY1305 diff --git a/lib/crypto/mips/Makefile b/lib/crypto/mips/Makefile new file mode 100644 index 000000000000..804488c7aded --- /dev/null +++ b/lib/crypto/mips/Makefile @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o +chacha-mips-y := chacha-core.o chacha-glue.o +AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots + +obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o +poly1305-mips-y := poly1305-core.o poly1305-glue.o + +perlasm-flavour-$(CONFIG_32BIT) := o32 +perlasm-flavour-$(CONFIG_64BIT) := 64 + +quiet_cmd_perlasm = PERLASM $@ + cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@) + +$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE + $(call if_changed,perlasm) + +targets += poly1305-core.S diff --git a/lib/crypto/mips/chacha-core.S b/lib/crypto/mips/chacha-core.S new file mode 100644 index 000000000000..5755f69cfe00 --- /dev/null +++ b/lib/crypto/mips/chacha-core.S @@ -0,0 +1,497 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2016-2018 René van Dorst . All Rights Reserved. + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + */ + +#define MASK_U32 0x3c +#define CHACHA20_BLOCK_SIZE 64 +#define STACK_SIZE 32 + +#define X0 $t0 +#define X1 $t1 +#define X2 $t2 +#define X3 $t3 +#define X4 $t4 +#define X5 $t5 +#define X6 $t6 +#define X7 $t7 +#define X8 $t8 +#define X9 $t9 +#define X10 $v1 +#define X11 $s6 +#define X12 $s5 +#define X13 $s4 +#define X14 $s3 +#define X15 $s2 +/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */ +#define T0 $s1 +#define T1 $s0 +#define T(n) T ## n +#define X(n) X ## n + +/* Input arguments */ +#define STATE $a0 +#define OUT $a1 +#define IN $a2 +#define BYTES $a3 + +/* Output argument */ +/* NONCE[0] is kept in a register and not in memory. + * We don't want to touch original value in memory. + * Must be incremented every loop iteration. + */ +#define NONCE_0 $v0 + +/* SAVED_X and SAVED_CA are set in the jump table. + * Use regs which are overwritten on exit else we don't leak clear data. + * They are used to handling the last bytes which are not multiple of 4. 
+ */ +#define SAVED_X X15 +#define SAVED_CA $s7 + +#define IS_UNALIGNED $s7 + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define MSB 0 +#define LSB 3 +#define ROTx rotl +#define ROTR(n) rotr n, 24 +#define CPU_TO_LE32(n) \ + wsbh n; \ + rotr n, 16; +#else +#define MSB 3 +#define LSB 0 +#define ROTx rotr +#define CPU_TO_LE32(n) +#define ROTR(n) +#endif + +#define FOR_EACH_WORD(x) \ + x( 0); \ + x( 1); \ + x( 2); \ + x( 3); \ + x( 4); \ + x( 5); \ + x( 6); \ + x( 7); \ + x( 8); \ + x( 9); \ + x(10); \ + x(11); \ + x(12); \ + x(13); \ + x(14); \ + x(15); + +#define FOR_EACH_WORD_REV(x) \ + x(15); \ + x(14); \ + x(13); \ + x(12); \ + x(11); \ + x(10); \ + x( 9); \ + x( 8); \ + x( 7); \ + x( 6); \ + x( 5); \ + x( 4); \ + x( 3); \ + x( 2); \ + x( 1); \ + x( 0); + +#define PLUS_ONE_0 1 +#define PLUS_ONE_1 2 +#define PLUS_ONE_2 3 +#define PLUS_ONE_3 4 +#define PLUS_ONE_4 5 +#define PLUS_ONE_5 6 +#define PLUS_ONE_6 7 +#define PLUS_ONE_7 8 +#define PLUS_ONE_8 9 +#define PLUS_ONE_9 10 +#define PLUS_ONE_10 11 +#define PLUS_ONE_11 12 +#define PLUS_ONE_12 13 +#define PLUS_ONE_13 14 +#define PLUS_ONE_14 15 +#define PLUS_ONE_15 16 +#define PLUS_ONE(x) PLUS_ONE_ ## x +#define _CONCAT3(a,b,c) a ## b ## c +#define CONCAT3(a,b,c) _CONCAT3(a,b,c) + +#define STORE_UNALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lwl T1, (x*4)+MSB ## (IN); \ + lwr T1, (x*4)+LSB ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + swl X ## x, (x*4)+MSB ## (OUT); \ + swr X ## x, (x*4)+LSB ## (OUT); + +#define STORE_ALIGNED(x) \ +CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \ + .if (x != 12); \ + lw T0, (x*4)(STATE); \ + .endif; \ + lw T1, (x*4) ## (IN); \ + .if (x == 12); \ + addu X ## x, NONCE_0; \ + .else; \ + addu X ## x, T0; \ + .endif; \ + CPU_TO_LE32(X ## x); \ + xor X ## x, T1; \ + sw X ## x, (x*4) ## (OUT); + +/* Jump table macro. + * Used for setup and handling the last bytes, which are not multiple of 4. + * X15 is free to store Xn + * Every jumptable entry must be equal in size. + */ +#define JMPTBL_ALIGNED(x) \ +.Lchacha_mips_jmptbl_aligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_aligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define JMPTBL_UNALIGNED(x) \ +.Lchacha_mips_jmptbl_unaligned_ ## x: ; \ + .set noreorder; \ + b .Lchacha_mips_xor_unaligned_ ## x ## _b; \ + .if (x == 12); \ + addu SAVED_X, X ## x, NONCE_0; \ + .else; \ + addu SAVED_X, X ## x, SAVED_CA; \ + .endif; \ + .set reorder + +#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \ + addu X(A), X(K); \ + addu X(B), X(L); \ + addu X(C), X(M); \ + addu X(D), X(N); \ + xor X(V), X(A); \ + xor X(W), X(B); \ + xor X(Y), X(C); \ + xor X(Z), X(D); \ + rotl X(V), S; \ + rotl X(W), S; \ + rotl X(Y), S; \ + rotl X(Z), S; + +.text +.set reorder +.set noat +.globl chacha_crypt_arch +.ent chacha_crypt_arch +chacha_crypt_arch: + .frame $sp, STACK_SIZE, $ra + + /* Load number of rounds */ + lw $at, 16($sp) + + addiu $sp, -STACK_SIZE + + /* Return bytes = 0. */ + beqz BYTES, .Lchacha_mips_end + + lw NONCE_0, 48(STATE) + + /* Save s0-s7 */ + sw $s0, 0($sp) + sw $s1, 4($sp) + sw $s2, 8($sp) + sw $s3, 12($sp) + sw $s4, 16($sp) + sw $s5, 20($sp) + sw $s6, 24($sp) + sw $s7, 28($sp) + + /* Test IN or OUT is unaligned. 
+ * IS_UNALIGNED = ( IN | OUT ) & 0x00000003 + */ + or IS_UNALIGNED, IN, OUT + andi IS_UNALIGNED, 0x3 + + b .Lchacha_rounds_start + +.align 4 +.Loop_chacha_rounds: + addiu IN, CHACHA20_BLOCK_SIZE + addiu OUT, CHACHA20_BLOCK_SIZE + addiu NONCE_0, 1 + +.Lchacha_rounds_start: + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + + move X12, NONCE_0 + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_chacha_xor_rounds: + addiu $at, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $at, .Loop_chacha_xor_rounds + + addiu BYTES, -(CHACHA20_BLOCK_SIZE) + + /* Is data src/dst unaligned? Jump */ + bnez IS_UNALIGNED, .Loop_chacha_unaligned + + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES < 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_aligned + + FOR_EACH_WORD_REV(STORE_ALIGNED) + + /* BYTES > 0? Loop again. */ + bgtz BYTES, .Loop_chacha_rounds + + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + + /* BYTES < 0? Handle last bytes */ + bltz BYTES, .Lchacha_mips_xor_bytes + +.Lchacha_mips_xor_done: + /* Restore used registers */ + lw $s0, 0($sp) + lw $s1, 4($sp) + lw $s2, 8($sp) + lw $s3, 12($sp) + lw $s4, 16($sp) + lw $s5, 20($sp) + lw $s6, 24($sp) + lw $s7, 28($sp) + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + +.Lchacha_mips_end: + addiu $sp, STACK_SIZE + jr $ra + +.Lchacha_mips_no_full_block_aligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_ALIGNED) + + +.Loop_chacha_unaligned: + /* Set number rounds here to fill delayslot. */ + lw $at, (STACK_SIZE+16)($sp) + + /* BYTES > 0, it has no full block. */ + bltz BYTES, .Lchacha_mips_no_full_block_unaligned + + FOR_EACH_WORD_REV(STORE_UNALIGNED) + + /* BYTES > 0? Loop again. 
*/ + bgtz BYTES, .Loop_chacha_rounds + + /* Write NONCE_0 back to right location in state */ + sw NONCE_0, 48(STATE) + + .set noreorder + /* Fall through to byte handling */ + bgez BYTES, .Lchacha_mips_xor_done +.Lchacha_mips_xor_unaligned_0_b: +.Lchacha_mips_xor_aligned_0_b: + /* Place this here to fill delay slot */ + addiu NONCE_0, 1 + .set reorder + +.Lchacha_mips_xor_bytes: + addu IN, $at + addu OUT, $at + /* First byte */ + lbu T1, 0(IN) + addiu $at, BYTES, 1 + CPU_TO_LE32(SAVED_X) + ROTR(SAVED_X) + xor T1, SAVED_X + sb T1, 0(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Second byte */ + lbu T1, 1(IN) + addiu $at, BYTES, 2 + ROTx SAVED_X, 8 + xor T1, SAVED_X + sb T1, 1(OUT) + beqz $at, .Lchacha_mips_xor_done + /* Third byte */ + lbu T1, 2(IN) + ROTx SAVED_X, 8 + xor T1, SAVED_X + sb T1, 2(OUT) + b .Lchacha_mips_xor_done + +.Lchacha_mips_no_full_block_unaligned: + /* Restore the offset on BYTES */ + addiu BYTES, CHACHA20_BLOCK_SIZE + + /* Get number of full WORDS */ + andi $at, BYTES, MASK_U32 + + /* Load upper half of jump table addr */ + lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0) + + /* Calculate lower half jump table offset */ + ins T0, $at, 1, 6 + + /* Add offset to STATE */ + addu T1, STATE, $at + + /* Add lower half jump table addr */ + addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0) + + /* Read value from STATE */ + lw SAVED_CA, 0(T1) + + /* Store remaining bytecounter as negative value */ + subu BYTES, $at, BYTES + + jr T0 + + /* Jump table */ + FOR_EACH_WORD(JMPTBL_UNALIGNED) +.end chacha_crypt_arch +.set at + +/* Input arguments + * STATE $a0 + * OUT $a1 + * NROUND $a2 + */ + +#undef X12 +#undef X13 +#undef X14 +#undef X15 + +#define X12 $a3 +#define X13 $at +#define X14 $v0 +#define X15 STATE + +.set noat +.globl hchacha_block_arch +.ent hchacha_block_arch +hchacha_block_arch: + .frame $sp, STACK_SIZE, $ra + + addiu $sp, -STACK_SIZE + + /* Save X11(s6) */ + sw X11, 0($sp) + + lw X0, 0(STATE) + lw X1, 4(STATE) + lw X2, 8(STATE) + lw X3, 12(STATE) + lw X4, 16(STATE) + lw X5, 20(STATE) + lw X6, 24(STATE) + lw X7, 28(STATE) + lw X8, 32(STATE) + lw X9, 36(STATE) + lw X10, 40(STATE) + lw X11, 44(STATE) + lw X12, 48(STATE) + lw X13, 52(STATE) + lw X14, 56(STATE) + lw X15, 60(STATE) + +.Loop_hchacha_xor_rounds: + addiu $a2, -2 + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12); + AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8); + AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12); + AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8); + AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7); + bnez $a2, .Loop_hchacha_xor_rounds + + /* Restore used register */ + lw X11, 0($sp) + + sw X0, 0(OUT) + sw X1, 4(OUT) + sw X2, 8(OUT) + sw X3, 12(OUT) + sw X12, 16(OUT) + sw X13, 20(OUT) + sw X14, 24(OUT) + sw X15, 28(OUT) + + addiu $sp, STACK_SIZE + jr $ra +.end hchacha_block_arch +.set at diff --git a/lib/crypto/mips/chacha-glue.c b/lib/crypto/mips/chacha-glue.c new file mode 100644 index 000000000000..88c097594eb0 --- /dev/null +++ b/lib/crypto/mips/chacha-glue.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ChaCha and HChaCha functions (MIPS optimized) + * + * Copyright (C) 2019 Linaro, Ltd. 
+ */ + +#include +#include +#include + +asmlinkage void chacha_crypt_arch(struct chacha_state *state, + u8 *dst, const u8 *src, + unsigned int bytes, int nrounds); +EXPORT_SYMBOL(chacha_crypt_arch); + +asmlinkage void hchacha_block_arch(const struct chacha_state *state, + u32 out[HCHACHA_OUT_WORDS], int nrounds); +EXPORT_SYMBOL(hchacha_block_arch); + +bool chacha_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(chacha_is_arch_optimized); + +MODULE_DESCRIPTION("ChaCha and HChaCha functions (MIPS optimized)"); +MODULE_AUTHOR("Ard Biesheuvel "); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-glue.c b/lib/crypto/mips/poly1305-glue.c new file mode 100644 index 000000000000..764a38a65200 --- /dev/null +++ b/lib/crypto/mips/poly1305-glue.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS + * + * Copyright (C) 2019 Linaro Ltd. + */ + +#include +#include +#include +#include +#include + +asmlinkage void poly1305_block_init_arch( + struct poly1305_block_state *state, + const u8 raw_key[POLY1305_BLOCK_SIZE]); +EXPORT_SYMBOL_GPL(poly1305_block_init_arch); +asmlinkage void poly1305_blocks_arch(struct poly1305_block_state *state, + const u8 *src, u32 len, u32 hibit); +EXPORT_SYMBOL_GPL(poly1305_blocks_arch); +asmlinkage void poly1305_emit_arch(const struct poly1305_state *state, + u8 digest[POLY1305_DIGEST_SIZE], + const u32 nonce[4]); +EXPORT_SYMBOL_GPL(poly1305_emit_arch); + +bool poly1305_is_arch_optimized(void) +{ + return true; +} +EXPORT_SYMBOL(poly1305_is_arch_optimized); + +MODULE_DESCRIPTION("Poly1305 transform (MIPS accelerated"); +MODULE_LICENSE("GPL v2"); diff --git a/lib/crypto/mips/poly1305-mips.pl b/lib/crypto/mips/poly1305-mips.pl new file mode 100644 index 000000000000..399f10c3e385 --- /dev/null +++ b/lib/crypto/mips/poly1305-mips.pl @@ -0,0 +1,1273 @@ +#!/usr/bin/env perl +# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL +# project. +# ==================================================================== + +# Poly1305 hash for MIPS. +# +# May 2016 +# +# Numbers are cycles per processed byte with poly1305_blocks alone. +# +# IALU/gcc +# R1x000 ~5.5/+130% (big-endian) +# Octeon II 2.50/+70% (little-endian) +# +# March 2019 +# +# Add 32-bit code path. +# +# October 2019 +# +# Modulo-scheduling reduction allows to omit dependency chain at the +# end of inner loop and improve performance. Also optimize MIPS32R2 +# code path for MIPS 1004K core. Per René von Dorst's suggestions. +# +# IALU/gcc +# R1x000 ~9.8/? (big-endian) +# Octeon II 3.65/+140% (little-endian) +# MT7621/1004K 4.75/? (little-endian) +# +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. 
Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +# +# +###################################################################### + +$flavour = shift || "64"; # supported flavours are o32,n32,64,nubi32,nubi64 + +$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; + +if ($flavour =~ /64|n32/i) {{{ +###################################################################### +# 64-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\ + defined(_MIPS_ARCH_MIPS64R6)) \\ + && !defined(_MIPS_ARCH_MIPS64R2) +# define _MIPS_ARCH_MIPS64R2 +#endif + +#if defined(_MIPS_ARCH_MIPS64R6) +# define dmultu(rs,rt) +# define mflo(rd,rs,rt) dmulu rd,rs,rt +# define mfhi(rd,rs,rt) dmuhu rd,rs,rt +#else +# define dmultu(rs,rt) dmultu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arch +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 7 +#else +# define MSB 7 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sd $zero,0($ctx) + sd $zero,8($ctx) + sd $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $tmp0,$inp,7 # $inp % 8 + dsubu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + ld $in0,0($inp) + ld $in1,8($inp) + beqz $tmp0,.Laligned_key + ld $tmp2,16($inp) + + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + dsllv $in0,$in0,$tmp0 + dsrlv $tmp3,$in1,$tmp1 + dsllv $in1,$in1,$tmp0 + dsrlv $tmp2,$tmp2,$tmp1 +# else + dsrlv $in0,$in0,$tmp0 + dsllv $tmp3,$in1,$tmp1 + dsrlv $in1,$in1,$tmp0 + dsllv $tmp2,$tmp2,$tmp1 +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_key: +#else + ldl $in0,0+MSB($inp) + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + 
or $in1,$tmp3 +# endif +#endif + li $tmp0,1 + dsll $tmp0,32 # 0x0000000100000000 + daddiu $tmp0,-63 # 0x00000000ffffffc1 + dsll $tmp0,28 # 0x0ffffffc10000000 + daddiu $tmp0,-1 # 0x0ffffffc0fffffff + + and $in0,$tmp0 + daddiu $tmp0,-3 # 0x0ffffffc0ffffffc + and $in1,$tmp0 + + sd $in0,24($ctx) + dsrl $tmp0,$in1,2 + sd $in1,32($ctx) + daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) + sd $tmp0,40($ctx) + +.Lno_key: + li $v0,0 # return 0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; + +my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = + ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); +my ($shr,$shl) = ($s6,$s7); # used on R6 + +$code.=<<___; +.align 5 +.globl poly1305_blocks +.ent poly1305_blocks +poly1305_blocks: + .set noreorder + dsrl $len,4 # number of complete blocks + bnez $len,poly1305_blocks_internal + nop + jr $ra + nop +.end poly1305_blocks + +.align 5 +.ent poly1305_blocks_internal +poly1305_blocks_internal: + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + .frame $sp,8*8,$ra + .mask $SAVED_REGS_MASK|0x000c0000,-8 + dsubu $sp,8*8 + sd $s7,56($sp) + sd $s6,48($sp) +#else + .frame $sp,6*8,$ra + .mask $SAVED_REGS_MASK,-8 + dsubu $sp,6*8 +#endif + sd $s5,40($sp) + sd $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sd $s3,24($sp) + sd $s2,16($sp) + sd $s1,8($sp) + sd $s0,0($sp) +___ +$code.=<<___; + .set reorder + +#if defined(_MIPS_ARCH_MIPS64R6) + andi $shr,$inp,7 + dsubu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset + subu $shl,$zero,$shr +#endif + + ld $h0,0($ctx) # load hash value + ld $h1,8($ctx) + ld $h2,16($ctx) + + ld $r0,24($ctx) # load key + ld $r1,32($ctx) + ld $rs1,40($ctx) + + dsll $len,4 + daddu $len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS64R6) + ld $in0,0($inp) # load input + ld $in1,8($inp) + beqz $shr,.Laligned_inp + + ld $tmp2,16($inp) +# ifdef MIPSEB + dsllv $in0,$in0,$shr + dsrlv $tmp3,$in1,$shl + dsllv $in1,$in1,$shr + dsrlv $tmp2,$tmp2,$shl +# else + dsrlv $in0,$in0,$shr + dsllv $tmp3,$in1,$shl + dsrlv $in1,$in1,$shr + dsllv $tmp2,$tmp2,$shl +# endif + or $in0,$in0,$tmp3 + or $in1,$in1,$tmp2 +.Laligned_inp: +#else + ldl $in0,0+MSB($inp) # load input + ldl $in1,8+MSB($inp) + ldr $in0,0+LSB($inp) + ldr $in1,8+LSB($inp) +#endif + daddiu $inp,16 +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS64R2) + dsbh $in0,$in0 # byte swap + dsbh $in1,$in1 + dshd $in0,$in0 + dshd $in1,$in1 +# else + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + + and $tmp1,$in0,$tmp0 # byte swap + and $tmp3,$in1,$tmp0 + dsrl $tmp2,$in0,24 + dsrl $tmp4,$in1,24 + dsll $tmp1,24 + dsll $tmp3,24 + and $tmp2,$tmp0 + and $tmp4,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + and $tmp2,$in0,$tmp0 + and $tmp4,$in1,$tmp0 + dsrl $in0,8 + dsrl $in1,8 + dsll $tmp2,8 + dsll $tmp4,8 + and $in0,$tmp0 + and $in1,$tmp0 + or $tmp1,$tmp2 + or $tmp3,$tmp4 + or $in0,$tmp1 + or $in1,$tmp3 + dsrl $tmp1,$in0,32 + dsrl $tmp3,$in1,32 + dsll $in0,32 + dsll $in1,32 + or $in0,$tmp1 + or $in1,$tmp3 +# endif +#endif + dsrl $tmp1,$h2,2 # modulo-scheduled reduction + andi $h2,$h2,3 + dsll $tmp0,$tmp1,2 + + daddu $d0,$h0,$in0 # accumulate input + daddu $tmp1,$tmp0 + sltu $tmp0,$d0,$h0 + daddu $d0,$d0,$tmp1 # ... 
and residue + sltu $tmp1,$d0,$tmp1 + daddu $d1,$h1,$in1 + daddu $tmp0,$tmp1 + sltu $tmp1,$d1,$h1 + daddu $d1,$tmp0 + + dmultu ($r0,$d0) # h0*r0 + daddu $d2,$h2,$padbit + sltu $tmp0,$d1,$tmp0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + dmultu ($rs1,$d1) # h1*5*r1 + daddu $d2,$tmp1 + daddu $d2,$tmp0 + mflo ($tmp0,$rs1,$d1) + mfhi ($tmp1,$rs1,$d1) + + dmultu ($r1,$d0) # h0*r1 + mflo ($tmp2,$r1,$d0) + mfhi ($h2,$r1,$d0) + daddu $h0,$tmp0 + daddu $h1,$tmp1 + sltu $tmp0,$h0,$tmp0 + + dmultu ($r0,$d1) # h1*r0 + daddu $h1,$tmp0 + daddu $h1,$tmp2 + mflo ($tmp0,$r0,$d1) + mfhi ($tmp1,$r0,$d1) + + dmultu ($rs1,$d2) # h2*5*r1 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + mflo ($tmp2,$rs1,$d2) + + dmultu ($r0,$d2) # h2*r0 + daddu $h1,$tmp0 + daddu $h2,$tmp1 + mflo ($tmp3,$r0,$d2) + sltu $tmp0,$h1,$tmp0 + daddu $h2,$tmp0 + + daddu $h1,$tmp2 + sltu $tmp2,$h1,$tmp2 + daddu $h2,$tmp2 + daddu $h2,$tmp3 + + bne $inp,$len,.Loop + + sd $h0,0($ctx) # store hash value + sd $h1,8($ctx) + sd $h2,16($ctx) + + .set noreorder +#if defined(_MIPS_ARCH_MIPS64R6) + ld $s7,56($sp) + ld $s6,48($sp) +#endif + ld $s5,40($sp) # epilogue + ld $s4,32($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue + ld $s3,24($sp) + ld $s2,16($sp) + ld $s1,8($sp) + ld $s0,0($sp) +___ +$code.=<<___; + jr $ra +#if defined(_MIPS_ARCH_MIPS64R6) + daddu $sp,8*8 +#else + daddu $sp,6*8 +#endif +.end poly1305_blocks_internal +___ +} +{ +my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + ld $tmp2,16($ctx) + ld $tmp0,0($ctx) + ld $tmp1,8($ctx) + + li $in0,-4 # final reduction + dsrl $in1,$tmp2,2 + and $in0,$tmp2 + andi $tmp2,$tmp2,3 + daddu $in0,$in1 + + daddu $tmp0,$tmp0,$in0 + sltu $in1,$tmp0,$in0 + daddiu $in0,$tmp0,5 # compare to modulus + daddu $tmp1,$tmp1,$in1 + sltiu $tmp3,$in0,5 + sltu $tmp4,$tmp1,$in1 + daddu $in1,$tmp1,$tmp3 + daddu $tmp2,$tmp2,$tmp4 + sltu $tmp3,$in1,$tmp3 + daddu $tmp2,$tmp2,$tmp3 + + dsrl $tmp2,2 # see if it carried/borrowed + dsubu $tmp2,$zero,$tmp2 + + xor $in0,$tmp0 + xor $in1,$tmp1 + and $in0,$tmp2 + and $in1,$tmp2 + xor $in0,$tmp0 + xor $in1,$tmp1 + + lwu $tmp0,0($nonce) # load nonce + lwu $tmp1,4($nonce) + lwu $tmp2,8($nonce) + lwu $tmp3,12($nonce) + dsll $tmp1,32 + dsll $tmp3,32 + or $tmp0,$tmp1 + or $tmp2,$tmp3 + + daddu $in0,$tmp0 # accumulate nonce + daddu $in1,$tmp2 + sltu $tmp0,$in0,$tmp0 + daddu $in1,$tmp0 + + dsrl $tmp0,$in0,8 # write mac value + dsrl $tmp1,$in0,16 + dsrl $tmp2,$in0,24 + sb $in0,0($mac) + dsrl $tmp3,$in0,32 + sb $tmp0,1($mac) + dsrl $tmp0,$in0,40 + sb $tmp1,2($mac) + dsrl $tmp1,$in0,48 + sb $tmp2,3($mac) + dsrl $tmp2,$in0,56 + sb $tmp3,4($mac) + dsrl $tmp3,$in1,8 + sb $tmp0,5($mac) + dsrl $tmp0,$in1,16 + sb $tmp1,6($mac) + dsrl $tmp1,$in1,24 + sb $tmp2,7($mac) + + sb $in1,8($mac) + dsrl $tmp2,$in1,32 + sb $tmp3,9($mac) + dsrl $tmp3,$in1,40 + sb $tmp0,10($mac) + dsrl $tmp0,$in1,48 + sb $tmp1,11($mac) + dsrl $tmp1,$in1,56 + sb $tmp2,12($mac) + sb $tmp3,13($mac) + sb $tmp0,14($mac) + sb $tmp1,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} else {{{ +###################################################################### +# 32-bit code path +# + +my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); +my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) = + ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2); + +$code.=<<___; +#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\ + 
defined(_MIPS_ARCH_MIPS32R6)) \\ + && !defined(_MIPS_ARCH_MIPS32R2) +# define _MIPS_ARCH_MIPS32R2 +#endif + +#if defined(_MIPS_ARCH_MIPS32R6) +# define multu(rs,rt) +# define mflo(rd,rs,rt) mulu rd,rs,rt +# define mfhi(rd,rs,rt) muhu rd,rs,rt +#else +# define multu(rs,rt) multu rs,rt +# define mflo(rd,rs,rt) mflo rd +# define mfhi(rd,rs,rt) mfhi rd +#endif + +#ifdef __KERNEL__ +# define poly1305_init poly1305_block_init_arch +# define poly1305_blocks poly1305_blocks_arch +# define poly1305_emit poly1305_emit_arch +#endif + +#if defined(__MIPSEB__) && !defined(MIPSEB) +# define MIPSEB +#endif + +#ifdef MIPSEB +# define MSB 0 +# define LSB 3 +#else +# define MSB 3 +# define LSB 0 +#endif + +.text +.set noat +.set noreorder + +.align 5 +.globl poly1305_init +.ent poly1305_init +poly1305_init: + .frame $sp,0,$ra + .set reorder + + sw $zero,0($ctx) + sw $zero,4($ctx) + sw $zero,8($ctx) + sw $zero,12($ctx) + sw $zero,16($ctx) + + beqz $inp,.Lno_key + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $tmp0,$inp,3 # $inp % 4 + subu $inp,$inp,$tmp0 # align $inp + sll $tmp0,$tmp0,3 # byte to bit offset + lw $in0,0($inp) + lw $in1,4($inp) + lw $in2,8($inp) + lw $in3,12($inp) + beqz $tmp0,.Laligned_key + + lw $tmp2,16($inp) + subu $tmp1,$zero,$tmp0 +# ifdef MIPSEB + sllv $in0,$in0,$tmp0 + srlv $tmp3,$in1,$tmp1 + sllv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + srlv $tmp3,$in2,$tmp1 + sllv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + srlv $tmp3,$in3,$tmp1 + sllv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + srlv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# else + srlv $in0,$in0,$tmp0 + sllv $tmp3,$in1,$tmp1 + srlv $in1,$in1,$tmp0 + or $in0,$in0,$tmp3 + sllv $tmp3,$in2,$tmp1 + srlv $in2,$in2,$tmp0 + or $in1,$in1,$tmp3 + sllv $tmp3,$in3,$tmp1 + srlv $in3,$in3,$tmp0 + or $in2,$in2,$tmp3 + sllv $tmp2,$tmp2,$tmp1 + or $in3,$in3,$tmp2 +# endif +.Laligned_key: +#else + lwl $in0,0+MSB($inp) + lwl $in1,4+MSB($inp) + lwl $in2,8+MSB($inp) + lwl $in3,12+MSB($inp) + lwr $in0,0+LSB($inp) + lwr $in1,4+LSB($inp) + lwr $in2,8+LSB($inp) + lwr $in3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $in0,$in0 # byte swap + wsbh $in1,$in1 + wsbh $in2,$in2 + wsbh $in3,$in3 + rotr $in0,$in0,16 + rotr $in1,$in1,16 + rotr $in2,$in2,16 + rotr $in3,$in3,16 +# else + srl $tmp0,$in0,24 # byte swap + srl $tmp1,$in0,8 + andi $tmp2,$in0,0xFF00 + sll $in0,$in0,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in0,$tmp0 + srl $tmp0,$in1,24 + or $tmp1,$tmp2 + srl $tmp2,$in1,8 + or $in0,$tmp1 + andi $tmp1,$in1,0xFF00 + sll $in1,$in1,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in1,$tmp0 + srl $tmp0,$in2,24 + or $tmp2,$tmp1 + srl $tmp1,$in2,8 + or $in1,$tmp2 + andi $tmp2,$in2,0xFF00 + sll $in2,$in2,24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or $in2,$tmp0 + srl $tmp0,$in3,24 + or $tmp1,$tmp2 + srl $tmp2,$in3,8 + or $in2,$tmp1 + andi $tmp1,$in3,0xFF00 + sll $in3,$in3,24 + andi $tmp2,0xFF00 + sll $tmp1,$tmp1,8 + or $in3,$tmp0 + or $tmp2,$tmp1 + or $in3,$tmp2 +# endif +#endif + lui $tmp0,0x0fff + ori $tmp0,0xffff # 0x0fffffff + and $in0,$in0,$tmp0 + subu $tmp0,3 # 0x0ffffffc + and $in1,$in1,$tmp0 + and $in2,$in2,$tmp0 + and $in3,$in3,$tmp0 + + sw $in0,20($ctx) + sw $in1,24($ctx) + sw $in2,28($ctx) + sw $in3,32($ctx) + + srl $tmp1,$in1,2 + srl $tmp2,$in2,2 + srl $tmp3,$in3,2 + addu $in1,$in1,$tmp1 # s1 = r1 + (r1 >> 2) + addu $in2,$in2,$tmp2 + addu $in3,$in3,$tmp3 + sw $in1,36($ctx) + sw $in2,40($ctx) + sw $in3,44($ctx) +.Lno_key: + li $v0,0 + jr $ra +.end poly1305_init +___ +{ +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 
"0x00fff000" : "0x00ff0000"; + +my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) = + ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11); +my ($d0,$d1,$d2,$d3) = + ($a4,$a5,$a6,$a7); +my $shr = $t2; # used on R6 +my $one = $t2; # used on R2 + +$code.=<<___; +.globl poly1305_blocks +.align 5 +.ent poly1305_blocks +poly1305_blocks: + .frame $sp,16*4,$ra + .mask $SAVED_REGS_MASK,-4 + .set noreorder + subu $sp, $sp,4*12 + sw $s11,4*11($sp) + sw $s10,4*10($sp) + sw $s9, 4*9($sp) + sw $s8, 4*8($sp) + sw $s7, 4*7($sp) + sw $s6, 4*6($sp) + sw $s5, 4*5($sp) + sw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + sw $s3, 4*3($sp) + sw $s2, 4*2($sp) + sw $s1, 4*1($sp) + sw $s0, 4*0($sp) +___ +$code.=<<___; + .set reorder + + srl $len,4 # number of complete blocks + li $one,1 + beqz $len,.Labort + +#if defined(_MIPS_ARCH_MIPS32R6) + andi $shr,$inp,3 + subu $inp,$inp,$shr # align $inp + sll $shr,$shr,3 # byte to bit offset +#endif + + lw $h0,0($ctx) # load hash value + lw $h1,4($ctx) + lw $h2,8($ctx) + lw $h3,12($ctx) + lw $h4,16($ctx) + + lw $r0,20($ctx) # load key + lw $r1,24($ctx) + lw $r2,28($ctx) + lw $r3,32($ctx) + lw $rs1,36($ctx) + lw $rs2,40($ctx) + lw $rs3,44($ctx) + + sll $len,4 + addu $len,$len,$inp # end of buffer + b .Loop + +.align 4 +.Loop: +#if defined(_MIPS_ARCH_MIPS32R6) + lw $d0,0($inp) # load input + lw $d1,4($inp) + lw $d2,8($inp) + lw $d3,12($inp) + beqz $shr,.Laligned_inp + + lw $t0,16($inp) + subu $t1,$zero,$shr +# ifdef MIPSEB + sllv $d0,$d0,$shr + srlv $at,$d1,$t1 + sllv $d1,$d1,$shr + or $d0,$d0,$at + srlv $at,$d2,$t1 + sllv $d2,$d2,$shr + or $d1,$d1,$at + srlv $at,$d3,$t1 + sllv $d3,$d3,$shr + or $d2,$d2,$at + srlv $t0,$t0,$t1 + or $d3,$d3,$t0 +# else + srlv $d0,$d0,$shr + sllv $at,$d1,$t1 + srlv $d1,$d1,$shr + or $d0,$d0,$at + sllv $at,$d2,$t1 + srlv $d2,$d2,$shr + or $d1,$d1,$at + sllv $at,$d3,$t1 + srlv $d3,$d3,$shr + or $d2,$d2,$at + sllv $t0,$t0,$t1 + or $d3,$d3,$t0 +# endif +.Laligned_inp: +#else + lwl $d0,0+MSB($inp) # load input + lwl $d1,4+MSB($inp) + lwl $d2,8+MSB($inp) + lwl $d3,12+MSB($inp) + lwr $d0,0+LSB($inp) + lwr $d1,4+LSB($inp) + lwr $d2,8+LSB($inp) + lwr $d3,12+LSB($inp) +#endif +#ifdef MIPSEB +# if defined(_MIPS_ARCH_MIPS32R2) + wsbh $d0,$d0 # byte swap + wsbh $d1,$d1 + wsbh $d2,$d2 + wsbh $d3,$d3 + rotr $d0,$d0,16 + rotr $d1,$d1,16 + rotr $d2,$d2,16 + rotr $d3,$d3,16 +# else + srl $at,$d0,24 # byte swap + srl $t0,$d0,8 + andi $t1,$d0,0xFF00 + sll $d0,$d0,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d0,$at + srl $at,$d1,24 + or $t0,$t1 + srl $t1,$d1,8 + or $d0,$t0 + andi $t0,$d1,0xFF00 + sll $d1,$d1,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d1,$at + srl $at,$d2,24 + or $t1,$t0 + srl $t0,$d2,8 + or $d1,$t1 + andi $t1,$d2,0xFF00 + sll $d2,$d2,24 + andi $t0,0xFF00 + sll $t1,$t1,8 + or $d2,$at + srl $at,$d3,24 + or $t0,$t1 + srl $t1,$d3,8 + or $d2,$t0 + andi $t0,$d3,0xFF00 + sll $d3,$d3,24 + andi $t1,0xFF00 + sll $t0,$t0,8 + or $d3,$at + or $t1,$t0 + or $d3,$t1 +# endif +#endif + srl $t0,$h4,2 # modulo-scheduled reduction + andi $h4,$h4,3 + sll $at,$t0,2 + + addu $d0,$d0,$h0 # accumulate input + addu $t0,$t0,$at + sltu $h0,$d0,$h0 + addu $d0,$d0,$t0 # ... 
and residue + sltu $at,$d0,$t0 + + addu $d1,$d1,$h1 + addu $h0,$h0,$at # carry + sltu $h1,$d1,$h1 + addu $d1,$d1,$h0 + sltu $h0,$d1,$h0 + + addu $d2,$d2,$h2 + addu $h1,$h1,$h0 # carry + sltu $h2,$d2,$h2 + addu $d2,$d2,$h1 + sltu $h1,$d2,$h1 + + addu $d3,$d3,$h3 + addu $h2,$h2,$h1 # carry + sltu $h3,$d3,$h3 + addu $d3,$d3,$h2 + +#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6) + multu $r0,$d0 # d0*r0 + sltu $h2,$d3,$h2 + maddu $rs3,$d1 # d1*s3 + addu $h3,$h3,$h2 # carry + maddu $rs2,$d2 # d2*s2 + addu $h4,$h4,$padbit + maddu $rs1,$d3 # d3*s1 + addu $h4,$h4,$h3 + mfhi $at + mflo $h0 + + multu $r1,$d0 # d0*r1 + maddu $r0,$d1 # d1*r0 + maddu $rs3,$d2 # d2*s3 + maddu $rs2,$d3 # d3*s2 + maddu $rs1,$h4 # h4*s1 + maddu $at,$one # hi*1 + mfhi $at + mflo $h1 + + multu $r2,$d0 # d0*r2 + maddu $r1,$d1 # d1*r1 + maddu $r0,$d2 # d2*r0 + maddu $rs3,$d3 # d3*s3 + maddu $rs2,$h4 # h4*s2 + maddu $at,$one # hi*1 + mfhi $at + mflo $h2 + + mul $t0,$r0,$h4 # h4*r0 + + multu $r3,$d0 # d0*r3 + maddu $r2,$d1 # d1*r2 + maddu $r1,$d2 # d2*r1 + maddu $r0,$d3 # d3*r0 + maddu $rs3,$h4 # h4*s3 + maddu $at,$one # hi*1 + mfhi $at + mflo $h3 + + addiu $inp,$inp,16 + + addu $h4,$t0,$at +#else + multu ($r0,$d0) # d0*r0 + mflo ($h0,$r0,$d0) + mfhi ($h1,$r0,$d0) + + sltu $h2,$d3,$h2 + addu $h3,$h3,$h2 # carry + + multu ($rs3,$d1) # d1*s3 + mflo ($at,$rs3,$d1) + mfhi ($t0,$rs3,$d1) + + addu $h4,$h4,$padbit + addiu $inp,$inp,16 + addu $h4,$h4,$h3 + + multu ($rs2,$d2) # d2*s2 + mflo ($a3,$rs2,$d2) + mfhi ($t1,$rs2,$d2) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($rs1,$d3) # d3*s1 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$rs1,$d3) + mfhi ($t0,$rs1,$d3) + addu $h0,$h0,$a3 + addu $h1,$h1,$t1 + multu ($r1,$d0) # d0*r1 + sltu $a3,$h0,$a3 + addu $h1,$h1,$a3 + + + mflo ($a3,$r1,$d0) + mfhi ($h2,$r1,$d0) + addu $h0,$h0,$at + addu $h1,$h1,$t0 + multu ($r0,$d1) # d1*r0 + sltu $at,$h0,$at + addu $h1,$h1,$at + + mflo ($at,$r0,$d1) + mfhi ($t0,$r0,$d1) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($rs3,$d2) # d2*s3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs3,$d2) + mfhi ($t1,$rs3,$d2) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($rs2,$d3) # d3*s2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + mflo ($at,$rs2,$d3) + mfhi ($t0,$rs2,$d3) + addu $h1,$h1,$a3 + addu $h2,$h2,$t1 + multu ($rs1,$h4) # h4*s1 + sltu $a3,$h1,$a3 + addu $h2,$h2,$a3 + + mflo ($a3,$rs1,$h4) + addu $h1,$h1,$at + addu $h2,$h2,$t0 + multu ($r2,$d0) # d0*r2 + sltu $at,$h1,$at + addu $h2,$h2,$at + + + mflo ($at,$r2,$d0) + mfhi ($h3,$r2,$d0) + addu $h1,$h1,$a3 + sltu $a3,$h1,$a3 + multu ($r1,$d1) # d1*r1 + addu $h2,$h2,$a3 + + mflo ($a3,$r1,$d1) + mfhi ($t1,$r1,$d1) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r0,$d2) # d2*r0 + addu $h3,$h3,$at + + mflo ($at,$r0,$d2) + mfhi ($t0,$r0,$d2) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($rs3,$d3) # d3*s3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + mflo ($a3,$rs3,$d3) + mfhi ($t1,$rs3,$d3) + addu $h2,$h2,$at + addu $h3,$h3,$t0 + multu ($rs2,$h4) # h4*s2 + sltu $at,$h2,$at + addu $h3,$h3,$at + + mflo ($at,$rs2,$h4) + addu $h2,$h2,$a3 + addu $h3,$h3,$t1 + multu ($r3,$d0) # d0*r3 + sltu $a3,$h2,$a3 + addu $h3,$h3,$a3 + + + mflo ($a3,$r3,$d0) + mfhi ($t1,$r3,$d0) + addu $h2,$h2,$at + sltu $at,$h2,$at + multu ($r2,$d1) # d1*r2 + addu $h3,$h3,$at + + mflo ($at,$r2,$d1) + mfhi ($t0,$r2,$d1) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + multu ($r0,$d3) # d3*r0 + addu $t1,$t1,$a3 + + mflo ($a3,$r0,$d3) + mfhi ($d3,$r0,$d3) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r1,$d2) # d2*r1 + sltu $at,$h3,$at + addu $t1,$t1,$at + 
+ mflo ($at,$r1,$d2) + mfhi ($t0,$r1,$d2) + addu $h3,$h3,$a3 + addu $t1,$t1,$d3 + multu ($rs3,$h4) # h4*s3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + + mflo ($a3,$rs3,$h4) + addu $h3,$h3,$at + addu $t1,$t1,$t0 + multu ($r0,$h4) # h4*r0 + sltu $at,$h3,$at + addu $t1,$t1,$at + + + mflo ($h4,$r0,$h4) + addu $h3,$h3,$a3 + sltu $a3,$h3,$a3 + addu $t1,$t1,$a3 + addu $h4,$h4,$t1 + + li $padbit,1 # if we loop, padbit is 1 +#endif + bne $inp,$len,.Loop + + sw $h0,0($ctx) # store hash value + sw $h1,4($ctx) + sw $h2,8($ctx) + sw $h3,12($ctx) + sw $h4,16($ctx) + + .set noreorder +.Labort: + lw $s11,4*11($sp) + lw $s10,4*10($sp) + lw $s9, 4*9($sp) + lw $s8, 4*8($sp) + lw $s7, 4*7($sp) + lw $s6, 4*6($sp) + lw $s5, 4*5($sp) + lw $s4, 4*4($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + lw $s3, 4*3($sp) + lw $s2, 4*2($sp) + lw $s1, 4*1($sp) + lw $s0, 4*0($sp) +___ +$code.=<<___; + jr $ra + addu $sp,$sp,4*12 +.end poly1305_blocks +___ +} +{ +my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3); + +$code.=<<___; +.align 5 +.globl poly1305_emit +.ent poly1305_emit +poly1305_emit: + .frame $sp,0,$ra + .set reorder + + lw $tmp4,16($ctx) + lw $tmp0,0($ctx) + lw $tmp1,4($ctx) + lw $tmp2,8($ctx) + lw $tmp3,12($ctx) + + li $in0,-4 # final reduction + srl $ctx,$tmp4,2 + and $in0,$in0,$tmp4 + andi $tmp4,$tmp4,3 + addu $ctx,$ctx,$in0 + + addu $tmp0,$tmp0,$ctx + sltu $ctx,$tmp0,$ctx + addiu $in0,$tmp0,5 # compare to modulus + addu $tmp1,$tmp1,$ctx + sltiu $in1,$in0,5 + sltu $ctx,$tmp1,$ctx + addu $in1,$in1,$tmp1 + addu $tmp2,$tmp2,$ctx + sltu $in2,$in1,$tmp1 + sltu $ctx,$tmp2,$ctx + addu $in2,$in2,$tmp2 + addu $tmp3,$tmp3,$ctx + sltu $in3,$in2,$tmp2 + sltu $ctx,$tmp3,$ctx + addu $in3,$in3,$tmp3 + addu $tmp4,$tmp4,$ctx + sltu $ctx,$in3,$tmp3 + addu $ctx,$tmp4 + + srl $ctx,2 # see if it carried/borrowed + subu $ctx,$zero,$ctx + + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + and $in0,$ctx + and $in1,$ctx + and $in2,$ctx + and $in3,$ctx + xor $in0,$tmp0 + xor $in1,$tmp1 + xor $in2,$tmp2 + xor $in3,$tmp3 + + lw $tmp0,0($nonce) # load nonce + lw $tmp1,4($nonce) + lw $tmp2,8($nonce) + lw $tmp3,12($nonce) + + addu $in0,$tmp0 # accumulate nonce + sltu $ctx,$in0,$tmp0 + + addu $in1,$tmp1 + sltu $tmp1,$in1,$tmp1 + addu $in1,$ctx + sltu $ctx,$in1,$ctx + addu $ctx,$tmp1 + + addu $in2,$tmp2 + sltu $tmp2,$in2,$tmp2 + addu $in2,$ctx + sltu $ctx,$in2,$ctx + addu $ctx,$tmp2 + + addu $in3,$tmp3 + addu $in3,$ctx + + srl $tmp0,$in0,8 # write mac value + srl $tmp1,$in0,16 + srl $tmp2,$in0,24 + sb $in0, 0($mac) + sb $tmp0,1($mac) + srl $tmp0,$in1,8 + sb $tmp1,2($mac) + srl $tmp1,$in1,16 + sb $tmp2,3($mac) + srl $tmp2,$in1,24 + sb $in1, 4($mac) + sb $tmp0,5($mac) + srl $tmp0,$in2,8 + sb $tmp1,6($mac) + srl $tmp1,$in2,16 + sb $tmp2,7($mac) + srl $tmp2,$in2,24 + sb $in2, 8($mac) + sb $tmp0,9($mac) + srl $tmp0,$in3,8 + sb $tmp1,10($mac) + srl $tmp1,$in3,16 + sb $tmp2,11($mac) + srl $tmp2,$in3,24 + sb $in3, 12($mac) + sb $tmp0,13($mac) + sb $tmp1,14($mac) + sb $tmp2,15($mac) + + jr $ra +.end poly1305_emit +.rdata +.asciiz "Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm" +.align 2 +___ +} +}}} + +$output=pop and open STDOUT,">$output"; +print $code; +close STDOUT;
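
(Not part of the diff above; for context only.) A minimal sketch of how the entry points exported by the new chacha-glue.c are driven. It uses only the prototypes shown in the patch (chacha_is_arch_optimized(), chacha_crypt_arch(), hchacha_block_arch()); struct chacha_state, CHACHA_BLOCK_SIZE and HCHACHA_OUT_WORDS are assumed to come from <crypto/chacha.h>, and the hypothetical caller below is illustrative only, not something added by this series.

	/* Hypothetical kernel-side caller; assumes <crypto/chacha.h> definitions. */
	#include <crypto/chacha.h>
	#include <linux/string.h>

	static void chacha_mips_sketch(void)
	{
		struct chacha_state state;        /* constants/key/counter/nonce words */
		u32 subkey[HCHACHA_OUT_WORDS];    /* HChaCha output (XChaCha subkey) */
		u8 src[CHACHA_BLOCK_SIZE], dst[CHACHA_BLOCK_SIZE];

		/* Real callers fill the state from key and IV; zeroed here for brevity. */
		memset(&state, 0, sizeof(state));
		memset(src, 0, sizeof(src));

		if (!chacha_is_arch_optimized())
			return;

		/* One 64-byte block through the MIPS ChaCha implementation (20 rounds). */
		chacha_crypt_arch(&state, dst, src, sizeof(src), 20);

		/* HChaCha permutation on a (separately prepared) state. */
		hchacha_block_arch(&state, subkey, 20);
	}

In-tree users normally reach this code through the generic ChaCha library wrappers in <crypto/chacha.h> rather than calling the *_arch symbols directly; the direct calls above are only to make the calling convention implied by the glue prototypes visible.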