crypto: arm/chacha - import Eric Biggers's scalar accelerated ChaCha code
arch/arm/crypto/chacha-scalar-core.S
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used. So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions. This is faster than using explicit rotate
 * instructions. To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount. The rotation amount is then fixed up just in time
 * when the values are used. 'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */
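
/*
 * For reference, the standard ChaCha quarter-round on (a, b, c, d) is:
 *
 *	a += b; d ^= a; d = rol(d, 16);
 *	c += d; b ^= c; b = rol(b, 12);
 *	a += b; d ^= a; d = rol(d,  8);
 *	c += d; b ^= c; b = rol(b,  7);
 *
 * In the code below the rol() steps are not executed eagerly: the value is
 * left un-rotated in its register, and whichever later instruction reads it
 * supplies 'ror #(32 - n)' as the shifted operand (for example,
 * 'add \a1, \a1, \b1, ror #brot' in _halfround), so fixing up the rotation
 * costs no extra instructions.
 */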

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

.Lexpand_32byte_k:
	// "expand 32-byte k"
	.word	0x61707865, 0x3320646e, 0x79622d32, 0x6b206574

#ifdef __thumb2__
# define adrl adr
#endif

.macro __rev out, in, t0, t1, t2
.if __LINUX_ARM_ARCH__ >= 6
	rev \out, \in
.else
	lsl \t0, \in, #24
	and \t1, \in, #0xff00
	and \t2, \in, #0xff0000
	orr \out, \t0, \in, lsr #24
	orr \out, \out, \t1, lsl #8
	orr \out, \out, \t2, lsr #8
.endif
.endm

.macro _le32_bswap x, t0, t1, t2
#ifdef __ARMEB__
	__rev \x, \x, \t0, \t1, \t2
#endif
.endm

.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
	_le32_bswap \a, \t0, \t1, \t2
	_le32_bswap \b, \t0, \t1, \t2
	_le32_bswap \c, \t0, \t1, \t2
	_le32_bswap \d, \t0, \t1, \t2
.endm

.macro __ldrd a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd \a, \b, [\src, #\offset]
#else
	ldr \a, [\src, #\offset]
	ldr \b, [\src, #\offset + 4]
#endif
.endm

.macro __strd a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd \a, \b, [\dst, #\offset]
#else
	str \a, [\dst, #\offset]
	str \b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add \a1, \a1, \b1, ror #brot
	add \a2, \a2, \b2, ror #brot
	eor \d1, \a1, \d1, ror #drot
	eor \d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add \c1, \c1, \d1, ror #16
	add \c2, \c2, \d2, ror #16
	eor \b1, \c1, \b1, ror #brot
	eor \b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add \a1, \a1, \b1, ror #20
	add \a2, \a2, \b2, ror #20
	eor \d1, \a1, \d1, ror #16
	eor \d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add \c1, \c1, \d1, ror #24
	add \c2, \c2, \d2, ror #24
	eor \b1, \c1, \b1, ror #20
	eor \b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd X8_X10, X9_X11, sp, 0
	__ldrd X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd X8_X10, X9_X11, sp, 8
	__ldrd X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
.endm

.macro _chacha_permute nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	_doubleround
	.endr
.endm

.macro _chacha nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute \nrounds

	add sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push {X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr r14, [sp, #96]
	ldr r12, [sp, #100]
	ldr r11, [sp, #104]

	orr r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp r11, #64
	blt .Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst r10, #3
	bne .Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd r8, r9, sp, 32
	__ldrd r10, r11, sp, 40
	add X0, X0, r8
	add X1, X1, r9
	add X2, X2, r10
	add X3, X3, r11
	_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
	ldmia r12!, {r8-r11}
	eor X0, X0, r8
	eor X1, X1, r9
	eor X2, X2, r10
	eor X3, X3, r11
	stmia r14!, {X0-X3}

	// x4-x7
	__ldrd r8, r9, sp, 48
	__ldrd r10, r11, sp, 56
	add X4, r8, X4, ror #brot
	add X5, r9, X5, ror #brot
	ldmia r12!, {X0-X3}
	add X6, r10, X6, ror #brot
	add X7, r11, X7, ror #brot
	_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
	eor X4, X4, X0
	eor X5, X5, X1
	eor X6, X6, X2
	eor X7, X7, X3
	stmia r14!, {X4-X7}

	// x8-x15
	pop {r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd r8, r9, sp, 32
	__ldrd r10, r11, sp, 40
	add r0, r0, r8		// x8
	add r1, r1, r9		// x9
	add r6, r6, r10		// x10
	add r7, r7, r11		// x11
	_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
	ldmia r12!, {r8-r11}
	eor r0, r0, r8		// x8
	eor r1, r1, r9		// x9
	eor r6, r6, r10		// x10
	eor r7, r7, r11		// x11
	stmia r14!, {r0,r1,r6,r7}
	ldmia r12!, {r0,r1,r6,r7}
	__ldrd r8, r9, sp, 48
	__ldrd r10, r11, sp, 56
	add r2, r8, r2, ror #drot	// x12
	add r3, r9, r3, ror #drot	// x13
	add r4, r10, r4, ror #drot	// x14
	add r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
	ldr r9, [sp, #72]		// load LEN
	eor r2, r2, r0		// x12
	eor r3, r3, r1		// x13
	eor r4, r4, r6		// x14
	eor r5, r5, r7		// x15
	subs r9, #64			// decrement and check LEN
	stmia r14!, {r2-r5}

	beq .Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add r8, #1

	// Store updated (OUT, IN, LEN)
	str r14, [sp, #64]
	str r12, [sp, #68]
	str r9, [sp, #72]

	mov r14, sp

	// Store updated block counter (x12)
	str r8, [sp, #48]

	sub sp, #16

	// Reload state and do next block
	ldmia r14!, {r0-r11}		// load x0-x11
	__strd r10, r11, sp, 8		// store x10-x11 before state
	ldm r14, {r10-r12,r14}		// load x12-x15
	b .Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub sp, #64
	mov r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd r8, r9, sp, 96
	__ldrd r10, r11, sp, 104
	add X0, X0, r8
	add X1, X1, r9
	add X2, X2, r10
	add X3, X3, r11
	_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
	stmia r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd r8, r9, sp, 112
	__ldrd r10, r11, sp, 120
	add X4, r8, X4, ror #brot
	add X5, r9, X5, ror #brot
	add X6, r10, X6, ror #brot
	add X7, r11, X7, ror #brot
	_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
	add r8, sp, #64
	stmia r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm r8, {r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd r8, r9, sp, 128
	__ldrd r10, r11, sp, 136
	add r0, r0, r8		// x8
	add r1, r1, r9		// x9
	add r6, r6, r10		// x10
	add r7, r7, r11		// x11
	_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
	stmia r14!, {r0,r1,r6,r7}
	__ldrd r8, r9, sp, 144
	__ldrd r10, r11, sp, 152
	add r2, r8, r2, ror #drot	// x12
	add r3, r9, r3, ror #drot	// x13
	add r4, r10, r4, ror #drot	// x14
	add r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
	stmia r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr r9, [sp, #168]		// LEN
	ldr r14, [sp, #160]		// OUT
	cmp r9, #64
	mov r0, sp
	movle r1, r9
	movgt r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr r2, r12, r14
	tst r2, #3			// IN or OUT misaligned?
	bne .Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs r1, #4
	blt .Lxor_words_done\@
	ldr r2, [r12], #4
	ldr r3, [r0], #4
	eor r2, r2, r3
	str r2, [r14], #4
.endr
	b .Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands r1, r1, #3
	beq .Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb r2, [r12], #1
	ldrb r3, [r0], #1
	eor r2, r2, r3
	strb r2, [r14], #1
	subs r1, #1
	bne .Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs r9, #64
	add sp, #96
	bgt .Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha

/*
 * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
 *		     const u32 iv[4]);
 */
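/*
 * Hypothetical caller sketch (for illustration only; no C caller is defined
 * in this file). 'key' is the 256-bit key as eight 32-bit words, and 'iv'
 * holds the four words loaded into state words x12-x15, of which iv[0] (x12)
 * is the block counter that this routine increments once per 64-byte block:
 *
 *	u32 key[8], iv[4];
 *	// ... fill key[] and iv[] (iv[0] = initial block counter) ...
 *	chacha20_arm(dst, src, len, key, iv);
 */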
ENTRY(chacha20_arm)
	cmp r2, #0			// len == 0?
	reteq lr

	push {r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	ldr r4, [sp, #48]		// iv
	mov r0, sp
	sub sp, #80

	// iv: x12-x15
	ldm r4, {X12,X13,X14,X15}
	stmdb r0!, {X12,X13,X14,X15}

	// key: x4-x11
	__ldrd X8_X10, X9_X11, r3, 24
	__strd X8_X10, X9_X11, sp, 8
	stmdb r0!, {X8_X10, X9_X11}
	ldm r3, {X4-X9_X11}
	stmdb r0!, {X4-X9_X11}

	// constants: x0-x3
	adrl X3, .Lexpand_32byte_k
	ldm X3, {X0-X3}
	__strd X0, X1, sp, 16
	__strd X2, X3, sp, 24

	_chacha 20

	add sp, #76
	pop {r4-r11, pc}
ENDPROC(chacha20_arm)

/*
 * void hchacha20_arm(const u32 state[16], u32 out[8]);
 */
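/*
 * Illustrative note: hchacha20_arm runs the 20-round permutation on the full
 * 16-word input 'state' and stores words {x0-x3, x12-x15} of the result to
 * 'out', without the final feed-forward addition. This matches the HChaCha20
 * construction typically used to derive an XChaCha20 subkey; a sketch with
 * the caller-provided layout assumed:
 *
 *	u32 state[16], subkey[8];
 *	// state[0..3]   = "expand 32-byte k" constants
 *	// state[4..11]  = 256-bit key
 *	// state[12..15] = first 16 bytes of the extended nonce
 *	hchacha20_arm(state, subkey);
 */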
ENTRY(hchacha20_arm)
	push {r1,r4-r11,lr}

	mov r14, r0
	ldmia r14!, {r0-r11}		// load x0-x11
	push {r10-r11}			// store x10-x11 to stack
	ldm r14, {r10-r12,r14}		// load x12-x15
	sub sp, #8

	_chacha_permute 20

	// Skip over (unused0-unused1, x10-x11)
	add sp, #16

	// Fix up rotations of x12-x15
	ror X12, X12, #drot
	ror X13, X13, #drot
	pop {r4}			// load 'out'
	ror X14, X14, #drot
	ror X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop {r4-r11,pc}
ENDPROC(hchacha20_arm)