/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * NEON doesn't have a rotate instruction. The alternatives are, more or less:
 *
 * (a) vshl.u32 + vsri.u32 (needs temporary register)
 * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
 * (c) vrev32.16 (16-bit rotations only)
 * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only,
 *     needs index vector)
 *
 * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
 * the only choices are (a) and (b). We use (a) since it takes two-thirds the
 * cycles of (b) on both Cortex-A7 and Cortex-A53.
 *
 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 * and doesn't need a temporary register.
 *
 * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
 * is twice as fast as (a), even when doing (a) on multiple registers
 * simultaneously to eliminate the stall between vshl and vsri. Also, it
 * parallelizes better when temporary registers are scarce.
 *
 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 * (a), so the need to load the rotation table actually makes the vtbl method
 * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
 * seems to be a good compromise to get a more significant speed boost on some
 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 */

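/*
 * As a scalar reference for the discussion above (a sketch, not part of the
 * original code): every method computes rotl32(x, n), i.e.
 * (x << n) | (x >> (32 - n)), independently on each 32-bit lane. Method (a)
 * materializes the left-shifted half with vshl, then vsri shifts the source
 * right by 32 - n and inserts it into the low bits, which is why each
 * rotation below appears as a vshl/vsri pair whose shift counts sum to 32.
 */
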
#include <linux/linkage.h>

	.text
	.fpu		neon
	.align		5

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
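/*
 * For reference, the operation being vectorized is the standard RFC 7539
 * quarter-round (scalar sketch; a, b, c, d are 32-bit state words):
 *
 *	a += b; d ^= a; d = rol32(d, 16);
 *	c += d; b ^= c; b = rol32(b, 12);
 *	a += b; d ^= a; d = rol32(d, 8);
 *	c += d; b ^= c; b = rol32(b, 7);
 *
 * Each instruction group below applies one of these steps to all four
 * columns (or, after the shuffle, all four diagonals) at once.
 */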
chacha_permute:

	adr		ip, .Lrol8_table
	vld1.8		{d10}, [ip, :64]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

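	// Rotate rows 1-3 left by one, two, and three words respectively, so
	// that the matrix diagonals line up in the register columns and the
	// next four steps form the diagonal round.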
	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8		q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8		q3, q3, q3, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

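	// Undo the rotation of rows 1-3, restoring column order for the next
	// double round.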
	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8		q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8		q3, q3, q3, #4

	subs		r3, r3, #2
	bne		.Ldoubleround

	bx		lr
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
	// r3: nrounds
	push		{lr}

	// x0..3 = s0..3
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

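	// Save a copy of the input state; it is added back to the permuted
	// state at the end (the ChaCha feed-forward).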
	vmov		q8, q0
	vmov		q9, q1
	vmov		q10, q2
	vmov		q11, q3

	bl		chacha_permute

	add		ip, r2, #0x20
	vld1.8		{q4-q5}, [r2]
	vld1.8		{q6-q7}, [ip]

	// o0 = i0 ^ (x0 + s0)
	vadd.i32	q0, q0, q8
	veor		q0, q0, q4

	// o1 = i1 ^ (x1 + s1)
	vadd.i32	q1, q1, q9
	veor		q1, q1, q5

	// o2 = i2 ^ (x2 + s2)
	vadd.i32	q2, q2, q10
	veor		q2, q2, q6

	// o3 = i3 ^ (x3 + s3)
	vadd.i32	q3, q3, q11
	veor		q3, q3, q7

	add		ip, r1, #0x20
	vst1.8		{q0-q1}, [r1]
	vst1.8		{q2-q3}, [ip]

	pop		{pc}
ENDPROC(chacha_block_xor_neon)

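// HChaCha (the XChaCha key-derivation step) runs the same permutation but,
// instead of the feed-forward addition, outputs only the first and last rows
// of the permuted state (x0..3 and x12..15).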
ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
	// r2: nrounds
	push		{lr}

	vld1.32		{q0-q1}, [r0]!
	vld1.32		{q2-q3}, [r0]

	mov		r3, r2
	bl		chacha_permute

	vst1.32		{q0}, [r1]!
	vst1.32		{q3}, [r1]

	pop		{pc}
ENDPROC(hchacha_block_neon)

	.align		4
.Lctrinc:	.word	0, 1, 2, 3
.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
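// .Lrol8_table is the vtbl index vector for rotl32(x, 8): result byte i of
// each 32-bit word is source byte (i - 1) mod 4 in little-endian order, so
// {3, 0, 1, 2} rotates the low word and {7, 4, 5, 6} the high word of each
// 64-bit half.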

	.align		5
ENTRY(chacha_4block_xor_neon)
	push		{r4-r5}
	mov		r4, sp			// preserve the stack pointer
	sub		ip, sp, #0x20		// allocate a 32 byte buffer
	bic		ip, ip, #0x1f		// aligned to 32 bytes
	mov		sp, ip
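	// The 32-byte alignment lets the spills below use the :256
	// alignment-qualified forms of vld1/vst1 on this buffer.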

	// r0: Input state matrix, s
	// r1: 4 data blocks output, o
	// r2: 4 data blocks input, i
	// r3: nrounds

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, hence
	// requires no word shuffling. The words are re-interleaved before the
	// final addition of the original state and the XORing step.
	//

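	// All sixteen q registers hold live state (qN = state word n of all
	// four blocks), so q8 and q9 are spilled to the stack buffer whenever
	// temporary registers are needed.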
	// x0..15[0-3] = s0..15[0-3]
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	adr		r5, .Lctrinc
	vdup.32		q15, d7[1]
	vdup.32		q14, d7[0]
	vld1.32		{q4}, [r5, :128]
	vdup.32		q13, d6[1]
	vdup.32		q12, d6[0]
	vdup.32		q11, d5[1]
	vdup.32		q10, d5[0]
	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
	vdup.32		q9, d4[1]
	vdup.32		q8, d4[0]
	vdup.32		q7, d3[1]
	vdup.32		q6, d3[0]
	vdup.32		q5, d2[1]
	vdup.32		q4, d2[0]
	vdup.32		q3, d1[1]
	vdup.32		q2, d1[0]
	vdup.32		q1, d0[1]
	vdup.32		q0, d0[0]

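	// After the vdup fan-out, qN holds state word n replicated across all
	// four blocks, with the per-block counters already applied to the
	// four lanes of q12.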
	adr		ip, .Lrol8_table
	b		1f

.Ldoubleround4:
	vld1.32		{q8-q9}, [sp, :256]
1:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14
	vrev32.16	q15, q15

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #12
	vshl.u32	q5, q9, #12
	vsri.u32	q4, q8, #20
	vsri.u32	q5, q9, #20

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #12
	vshl.u32	q7, q9, #12
	vsri.u32	q6, q8, #20
	vsri.u32	q7, q9, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16
	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #7
	vshl.u32	q5, q9, #7
	vsri.u32	q4, q8, #25
	vsri.u32	q5, q9, #25

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #7
	vshl.u32	q7, q9, #7
	vsri.u32	q6, q8, #25
	vsri.u32	q7, q9, #25

	vld1.32		{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vrev32.16	q15, q15
	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #12
	vshl.u32	q4, q9, #12
	vsri.u32	q7, q8, #20
	vsri.u32	q4, q9, #20

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #12
	vshl.u32	q6, q9, #12
	vsri.u32	q5, q8, #20
	vsri.u32	q6, q9, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16
	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #7
	vshl.u32	q4, q9, #7
	vsri.u32	q7, q8, #25
	vsri.u32	q4, q9, #25

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #7
	vshl.u32	q6, q9, #7
	vsri.u32	q5, q8, #25
	vsri.u32	q6, q9, #25

	subs		r3, r3, #2
	bne		.Ldoubleround4

	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
	// x8..9[0-3] are on the stack.

	// Re-interleave the words in the first two rows of each block (x0..7).
	// Also add the counter values 0-3 to x12[0-3].
	vld1.32		{q8}, [r5, :128]	// load counter values 0-3
	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
	vadd.u32	q12, q8			// x12 += counter values 0-3
	vswp		d1, d4
	vswp		d3, d6
	vld1.32		{q8-q9}, [r0]!		// load s0..7
	vswp		d9, d12
	vswp		d11, d14
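	// Taken together, the vzip and vswp steps above transpose each 4x4
	// group of words: registers that held one state word across four
	// blocks now hold one row of a single block.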

	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
	// after XORing the first 32 bytes.
	vswp		q1, q4

	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
	vadd.u32	q0, q0, q8
	vadd.u32	q2, q2, q8
	vadd.u32	q4, q4, q8
	vadd.u32	q3, q3, q8

	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
	vadd.u32	q1, q1, q9
	vadd.u32	q6, q6, q9
	vadd.u32	q5, q5, q9
	vadd.u32	q7, q7, q9

	// XOR first 32 bytes using keystream from first two rows of first block
	vld1.8		{q8-q9}, [r2]!
	veor		q8, q8, q0
	veor		q9, q9, q1
	vst1.8		{q8-q9}, [r1]!

	// Re-interleave the words in the last two rows of each block (x8..15).
	vld1.32		{q8-q9}, [sp, :256]
	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
	vld1.32		{q0-q1}, [r0]	// load s8..15
	vswp		d25, d28
	vswp		d27, d30
	vswp		d17, d20
	vswp		d19, d22

	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
	vadd.u32	q8, q8, q0
	vadd.u32	q10, q10, q0
	vadd.u32	q9, q9, q0
	vadd.u32	q11, q11, q0

	// x12..15[0-3] += s12..15[0-3]	(add orig state to 4th row of each block)
	vadd.u32	q12, q12, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q15, q15, q1

	// XOR the rest of the data with the keystream

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q8
	veor		q1, q1, q12
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q2
	veor		q1, q1, q6
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q10
	veor		q1, q1, q14
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q4
	veor		q1, q1, q5
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q9
	veor		q1, q1, q13
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	veor		q0, q0, q3
	veor		q1, q1, q7
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]
	mov		sp, r4		// restore original stack pointer
	veor		q0, q0, q11
	veor		q1, q1, q15
	vst1.8		{q0-q1}, [r1]

	pop		{r4-r5}
	bx		lr
ENDPROC(chacha_4block_xor_neon)