crypto: arm/chacha - import Eric Biggers's scalar accelerated ChaCha code
arch/arm/crypto/chacha-scalar-core.S
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used. So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions. This is faster than using explicit rotate
 * instructions. To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount. The rotation amount is then fixed up just in time
 * when the values are used. 'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
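
/*
 * For reference, the standard ChaCha quarter-round on (a, b, c, d) is:
 *
 *	a += b; d ^= a; d = rol(d, 16);
 *	c += d; b ^= c; b = rol(b, 12);
 *	a += b; d ^= a; d = rol(d,  8);
 *	c += d; b ^= c; b = rol(b,  7);
 *
 * In the code below the rol() steps are not executed eagerly: the value is
 * left un-rotated in its register, and whichever later instruction reads it
 * supplies 'ror #(32 - n)' as the shifted operand (for example,
 * 'add \a1, \a1, \b1, ror #brot' in _halfround), so fixing up the rotation
 * costs no extra instructions.
 */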

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

.Lexpand_32byte_k:
	// "expand 32-byte k"
	.word	0x61707865, 0x3320646e, 0x79622d32, 0x6b206574

#ifdef __thumb2__
# define adrl adr
#endif

.macro __rev out, in, t0, t1, t2
.if __LINUX_ARM_ARCH__ >= 6
	rev \out, \in
.else
	lsl \t0, \in, #24
	and \t1, \in, #0xff00
	and \t2, \in, #0xff0000
	orr \out, \t0, \in, lsr #24
	orr \out, \out, \t1, lsl #8
	orr \out, \out, \t2, lsr #8
.endif
.endm

.macro _le32_bswap x, t0, t1, t2
#ifdef __ARMEB__
	__rev \x, \x, \t0, \t1, \t2
#endif
.endm

.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
	_le32_bswap \a, \t0, \t1, \t2
	_le32_bswap \b, \t0, \t1, \t2
	_le32_bswap \c, \t0, \t1, \t2
	_le32_bswap \d, \t0, \t1, \t2
.endm

.macro __ldrd a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd \a, \b, [\src, #\offset]
#else
	ldr \a, [\src, #\offset]
	ldr \b, [\src, #\offset + 4]
#endif
.endm

.macro __strd a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd \a, \b, [\dst, #\offset]
#else
	str \a, [\dst, #\offset]
	str \b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add \a1, \a1, \b1, ror #brot
	add \a2, \a2, \b2, ror #brot
	eor \d1, \a1, \d1, ror #drot
	eor \d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add \c1, \c1, \d1, ror #16
	add \c2, \c2, \d2, ror #16
	eor \b1, \c1, \b1, ror #brot
	eor \b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add \a1, \a1, \b1, ror #20
	add \a2, \a2, \b2, ror #20
	eor \d1, \a1, \d1, ror #16
	eor \d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add \c1, \c1, \d1, ror #24
	add \c2, \c2, \d2, ror #24
	eor \b1, \c1, \b1, ror #20
	eor \b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd X8_X10, X9_X11, sp, 0
	__ldrd X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd X8_X10, X9_X11, sp, 8
	__ldrd X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
.endm

.macro _chacha_permute nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	_doubleround
	.endr
.endm

.macro _chacha nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute \nrounds

	add sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push {X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr r14, [sp, #96]
	ldr r12, [sp, #100]
	ldr r11, [sp, #104]

	orr r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp r11, #64
	blt .Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst r10, #3
	bne .Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd r8, r9, sp, 32
	__ldrd r10, r11, sp, 40
	add X0, X0, r8
	add X1, X1, r9
	add X2, X2, r10
	add X3, X3, r11
	_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
	ldmia r12!, {r8-r11}
	eor X0, X0, r8
	eor X1, X1, r9
	eor X2, X2, r10
	eor X3, X3, r11
	stmia r14!, {X0-X3}

	// x4-x7
	__ldrd r8, r9, sp, 48
	__ldrd r10, r11, sp, 56
	add X4, r8, X4, ror #brot
	add X5, r9, X5, ror #brot
	ldmia r12!, {X0-X3}
	add X6, r10, X6, ror #brot
	add X7, r11, X7, ror #brot
	_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
	eor X4, X4, X0
	eor X5, X5, X1
	eor X6, X6, X2
	eor X7, X7, X3
	stmia r14!, {X4-X7}

	// x8-x15
	pop {r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd r8, r9, sp, 32
	__ldrd r10, r11, sp, 40
	add r0, r0, r8		// x8
	add r1, r1, r9		// x9
	add r6, r6, r10		// x10
	add r7, r7, r11		// x11
	_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
	ldmia r12!, {r8-r11}
	eor r0, r0, r8		// x8
	eor r1, r1, r9		// x9
	eor r6, r6, r10		// x10
	eor r7, r7, r11		// x11
	stmia r14!, {r0,r1,r6,r7}
	ldmia r12!, {r0,r1,r6,r7}
	__ldrd r8, r9, sp, 48
	__ldrd r10, r11, sp, 56
	add r2, r8, r2, ror #drot	// x12
	add r3, r9, r3, ror #drot	// x13
	add r4, r10, r4, ror #drot	// x14
	add r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
	ldr r9, [sp, #72]		// load LEN
	eor r2, r2, r0		// x12
	eor r3, r3, r1		// x13
	eor r4, r4, r6		// x14
	eor r5, r5, r7		// x15
	subs r9, #64			// decrement and check LEN
	stmia r14!, {r2-r5}

	beq .Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add r8, #1

	// Store updated (OUT, IN, LEN)
	str r14, [sp, #64]
	str r12, [sp, #68]
	str r9, [sp, #72]

	mov r14, sp

	// Store updated block counter (x12)
	str r8, [sp, #48]

	sub sp, #16

	// Reload state and do next block
	ldmia r14!, {r0-r11}		// load x0-x11
	__strd r10, r11, sp, 8		// store x10-x11 before state
	ldm r14, {r10-r12,r14}		// load x12-x15
	b .Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub sp, #64
	mov r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd r8, r9, sp, 96
	__ldrd r10, r11, sp, 104
	add X0, X0, r8
	add X1, X1, r9
	add X2, X2, r10
	add X3, X3, r11
	_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
	stmia r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd r8, r9, sp, 112
	__ldrd r10, r11, sp, 120
	add X4, r8, X4, ror #brot
	add X5, r9, X5, ror #brot
	add X6, r10, X6, ror #brot
	add X7, r11, X7, ror #brot
	_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
	add r8, sp, #64
	stmia r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm r8, {r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd r8, r9, sp, 128
	__ldrd r10, r11, sp, 136
	add r0, r0, r8		// x8
	add r1, r1, r9		// x9
	add r6, r6, r10		// x10
	add r7, r7, r11		// x11
	_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
	stmia r14!, {r0,r1,r6,r7}
	__ldrd r8, r9, sp, 144
	__ldrd r10, r11, sp, 152
	add r2, r8, r2, ror #drot	// x12
	add r3, r9, r3, ror #drot	// x13
	add r4, r10, r4, ror #drot	// x14
	add r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
	stmia r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr r9, [sp, #168]		// LEN
	ldr r14, [sp, #160]		// OUT
	cmp r9, #64
	mov r0, sp
	movle r1, r9
	movgt r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr r2, r12, r14
	tst r2, #3			// IN or OUT misaligned?
	bne .Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs r1, #4
	blt .Lxor_words_done\@
	ldr r2, [r12], #4
	ldr r3, [r0], #4
	eor r2, r2, r3
	str r2, [r14], #4
.endr
	b .Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands r1, r1, #3
	beq .Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb r2, [r12], #1
	ldrb r3, [r0], #1
	eor r2, r2, r3
	strb r2, [r14], #1
	subs r1, #1
	bne .Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs r9, #64
	add sp, #96
	bgt .Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha

/*
 * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
 *		     const u32 iv[4]);
 */
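/*
 * Hypothetical caller sketch (for illustration only; no C caller is defined
 * in this file). 'key' is the 256-bit key as eight 32-bit words, and 'iv'
 * holds the four words loaded into state words x12-x15, of which iv[0] (x12)
 * is the block counter that this routine increments once per 64-byte block:
 *
 *	u32 key[8], iv[4];
 *	// ... fill key[] and iv[] (iv[0] = initial block counter) ...
 *	chacha20_arm(dst, src, len, key, iv);
 */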
ENTRY(chacha20_arm)
	cmp r2, #0			// len == 0?
	reteq lr

	push {r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	ldr r4, [sp, #48]		// iv
	mov r0, sp
	sub sp, #80

	// iv: x12-x15
	ldm r4, {X12,X13,X14,X15}
	stmdb r0!, {X12,X13,X14,X15}

	// key: x4-x11
	__ldrd X8_X10, X9_X11, r3, 24
	__strd X8_X10, X9_X11, sp, 8
	stmdb r0!, {X8_X10, X9_X11}
	ldm r3, {X4-X9_X11}
	stmdb r0!, {X4-X9_X11}

	// constants: x0-x3
	adrl X3, .Lexpand_32byte_k
	ldm X3, {X0-X3}
	__strd X0, X1, sp, 16
	__strd X2, X3, sp, 24

	_chacha 20

	add sp, #76
	pop {r4-r11, pc}
ENDPROC(chacha20_arm)

/*
 * void hchacha20_arm(const u32 state[16], u32 out[8]);
 */
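/*
 * Illustrative note: hchacha20_arm runs the 20-round permutation on the full
 * 16-word input 'state' and stores words {x0-x3, x12-x15} of the result to
 * 'out', without the final feed-forward addition. This matches the HChaCha20
 * construction typically used to derive an XChaCha20 subkey; a sketch with
 * the caller-provided layout assumed:
 *
 *	u32 state[16], subkey[8];
 *	// state[0..3]   = "expand 32-byte k" constants
 *	// state[4..11]  = 256-bit key
 *	// state[12..15] = first 16 bytes of the extended nonce
 *	hchacha20_arm(state, subkey);
 */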
ENTRY(hchacha20_arm)
	push {r1,r4-r11,lr}

	mov r14, r0
	ldmia r14!, {r0-r11}		// load x0-x11
	push {r10-r11}			// store x10-x11 to stack
	ldm r14, {r10-r12,r14}		// load x12-x15
	sub sp, #8

	_chacha_permute 20

	// Skip over (unused0-unused1, x10-x11)
	add sp, #16

	// Fix up rotations of x12-x15
	ror X12, X12, #drot
	ror X13, X13, #drot
	pop {r4}			// load 'out'
	ror X14, X14, #drot
	ror X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop {r4-r11,pc}
ENDPROC(hchacha20_arm)