/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

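# The CTR*BL constants below are the per-block counter increments: CTR2BL
# and CTR4BL are added to the counter/nonce row of a two-block register
# pair so the blocks use counters +0/+1 and +2/+3, while CTR8BL holds the
# dword increments 0..7 added to the broadcast counter word in the
# eight-block function.
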
.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.text

SYM_FUNC_START(chacha_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.

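	# Layout, for reference: each ymm register holds one 16-byte row of
	# the 4x4 state for both blocks, block 0 in the low 128-bit lane and
	# block 1 in the high lane; CTR2BL below makes the two block counters
	# differ, and ymm8..ymm11 keep a copy of the initial state for the
	# final feed-forward addition.
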
	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa		%ymm0,%ymm8
	vmovdqa		%ymm1,%ymm9
	vmovdqa		%ymm2,%ymm10
	vmovdqa		%ymm3,%ymm11

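	# Each pass of the loop below is one ChaCha double round: a column
	# round on the rows as loaded, then a diagonal round formed by
	# rotating rows 1-3 with vpshufd before the second set of quarter
	# rounds and rotating them back afterwards.  The loop runs nrounds/2
	# times.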
.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3

	sub		$2,%r8d
	jnz		.Ldoubleround

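	# Output stage: add the saved initial state back in (the ChaCha
	# feed-forward) to form the keystream, then XOR it with the input in
	# 16-byte chunks.  %rcx holds the total length, so each cmp checks
	# whether the next full 16-byte chunk is available; if not, .Lxorpart2
	# handles the final partial chunk.  The low lane of each sum belongs
	# to block 0; vextracti128 saves the high lane (block 1) so it can be
	# written after block 0.
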
	# o0 = i0 ^ (x0 + s0)
	vpaddd		%ymm8,%ymm0,%ymm7
	cmp		$0x10,%rcx
	jl		.Lxorpart2
	vpxord		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd		%ymm9,%ymm1,%ymm7
	cmp		$0x20,%rcx
	jl		.Lxorpart2
	vpxord		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd		%ymm10,%ymm2,%ymm7
	cmp		$0x30,%rcx
	jl		.Lxorpart2
	vpxord		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd		%ymm11,%ymm3,%ymm7
	cmp		$0x40,%rcx
	jl		.Lxorpart2
	vpxord		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm7
	cmp		$0x50,%rcx
	jl		.Lxorpart2
	vpxord		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vmovdqa		%xmm1,%xmm7
	cmp		$0x60,%rcx
	jl		.Lxorpart2
	vpxord		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vmovdqa		%xmm2,%xmm7
	cmp		$0x70,%rcx
	jl		.Lxorpart2
	vpxord		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vmovdqa		%xmm3,%xmm7
	cmp		$0x80,%rcx
	jl		.Lxorpart2
	vpxord		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	ret

.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov		%rcx,%rax
	and		$0xf,%rcx
	jz		.Ldone2
	mov		%rax,%r9
	and		$~0xf,%r9

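	# Build a byte mask for the tail: %rcx is now len & 0xf (the number of
	# trailing bytes) and %r9 the offset of the partial 16-byte chunk.
	# "mov $1; shld %cl; sub $1" computes (1 << cl) - 1, i.e. cl one bits,
	# which becomes the %k1 opmask for the masked load, XOR and store
	# below (e.g. 5 trailing bytes -> mask 0b11111).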
	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp		.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx512vl)

SYM_FUNC_START(chacha_4block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, and then
	# sequentially on the four words of the other two matrices. The
	# required word shuffling has a rather high latency, so interleaving
	# the arithmetic of two independent matrix pairs hides most of it
	# without much slowdown.
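
	# Layout, for reference: ymm0..ymm3 hold the state rows for blocks 0/1
	# (low/high lane) and ymm4..ymm7 those for blocks 2/3; CTR2BL/CTR4BL
	# give the four blocks counters +0..+3.  Rows 0-2 are identical for
	# all blocks, so ymm11..ymm13 save them once, while row 3 is saved
	# separately for each pair in ymm14 and ymm15 for the feed-forward.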

	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa		%ymm0,%ymm4
	vmovdqa		%ymm1,%ymm5
	vmovdqa		%ymm2,%ymm6
	vmovdqa		%ymm3,%ymm7

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
	vpaddd		CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa		%ymm0,%ymm11
	vmovdqa		%ymm1,%ymm12
	vmovdqa		%ymm2,%ymm13
	vmovdqa		%ymm3,%ymm14
	vmovdqa		%ymm7,%ymm15

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	vpshufd		$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
	vpshufd		$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	vpshufd		$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
	vpshufd		$0x39,%ymm7,%ymm7

	sub		$2,%r8d
	jnz		.Ldoubleround4

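	# Feed-forward and output, as in the 2-block function: blocks 0 and 1
	# come from the low and high lanes of ymm0..ymm3, blocks 2 and 3 from
	# ymm4..ymm7; the shared saved rows ymm11..ymm13 are reused for both
	# pairs, with ymm14/ymm15 providing the per-pair counter row.
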
	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm11,%ymm0,%ymm10
	cmp		$0x10,%rcx
	jl		.Lxorpart4
	vpxord		0x00(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm12,%ymm1,%ymm10
	cmp		$0x20,%rcx
	jl		.Lxorpart4
	vpxord		0x10(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm13,%ymm2,%ymm10
	cmp		$0x30,%rcx
	jl		.Lxorpart4
	vpxord		0x20(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm14,%ymm3,%ymm10
	cmp		$0x40,%rcx
	jl		.Lxorpart4
	vpxord		0x30(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm10
	cmp		$0x50,%rcx
	jl		.Lxorpart4
	vpxord		0x40(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x40(%rsi)

	vmovdqa		%xmm1,%xmm10
	cmp		$0x60,%rcx
	jl		.Lxorpart4
	vpxord		0x50(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x50(%rsi)

	vmovdqa		%xmm2,%xmm10
	cmp		$0x70,%rcx
	jl		.Lxorpart4
	vpxord		0x60(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x60(%rsi)

	vmovdqa		%xmm3,%xmm10
	cmp		$0x80,%rcx
	jl		.Lxorpart4
	vpxord		0x70(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd		%ymm11,%ymm4,%ymm10
	cmp		$0x90,%rcx
	jl		.Lxorpart4
	vpxord		0x80(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd		%ymm12,%ymm5,%ymm10
	cmp		$0xa0,%rcx
	jl		.Lxorpart4
	vpxord		0x90(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd		%ymm13,%ymm6,%ymm10
	cmp		$0xb0,%rcx
	jl		.Lxorpart4
	vpxord		0xa0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd		%ymm15,%ymm7,%ymm10
	cmp		$0xc0,%rcx
	jl		.Lxorpart4
	vpxord		0xb0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa		%xmm4,%xmm10
	cmp		$0xd0,%rcx
	jl		.Lxorpart4
	vpxord		0xc0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xc0(%rsi)

	vmovdqa		%xmm5,%xmm10
	cmp		$0xe0,%rcx
	jl		.Lxorpart4
	vpxord		0xd0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xd0(%rsi)

	vmovdqa		%xmm6,%xmm10
	cmp		$0xf0,%rcx
	jl		.Lxorpart4
	vpxord		0xe0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xe0(%rsi)

	vmovdqa		%xmm7,%xmm10
	cmp		$0x100,%rcx
	jl		.Lxorpart4
	vpxord		0xf0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	ret

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov		%rcx,%rax
	and		$0xf,%rcx
	jz		.Ldone4
	mov		%rax,%r9
	and		$~0xf,%r9

	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm10,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx512vl)

SYM_FUNC_START(chacha_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix into AVX registers eight times. Compared to AVX2,
	# this mostly benefits from the new rotate instructions in AVX-512VL
	# and the additional registers.

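	# Layout, for reference: each of ymm0..ymm15 holds one of the 16 state
	# words replicated for all eight blocks (one 32-bit lane per block),
	# so the rounds need no word shuffling.  CTR8BL makes the eight block
	# counters distinct, and ymm16..ymm31 keep the initial words for the
	# feed-forward addition.
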
	vzeroupper

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	# x12 += counter values 0-7
	vpaddd		CTR8BL(%rip),%ymm12,%ymm12

	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd		%ymm0,%ymm4,%ymm0
	vpxord		%ymm0,%ymm12,%ymm12
	vprold		$16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd		%ymm1,%ymm5,%ymm1
	vpxord		%ymm1,%ymm13,%ymm13
	vprold		$16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd		%ymm2,%ymm6,%ymm2
	vpxord		%ymm2,%ymm14,%ymm14
	vprold		$16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd		%ymm3,%ymm7,%ymm3
	vpxord		%ymm3,%ymm15,%ymm15
	vprold		$16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxord		%ymm8,%ymm4,%ymm4
	vprold		$12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxord		%ymm9,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxord		%ymm10,%ymm6,%ymm6
	vprold		$12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxord		%ymm11,%ymm7,%ymm7
	vprold		$12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd		%ymm0,%ymm4,%ymm0
	vpxord		%ymm0,%ymm12,%ymm12
	vprold		$8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd		%ymm1,%ymm5,%ymm1
	vpxord		%ymm1,%ymm13,%ymm13
	vprold		$8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd		%ymm2,%ymm6,%ymm2
	vpxord		%ymm2,%ymm14,%ymm14
	vprold		$8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd		%ymm3,%ymm7,%ymm3
	vpxord		%ymm3,%ymm15,%ymm15
	vprold		$8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxord		%ymm8,%ymm4,%ymm4
	vprold		$7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxord		%ymm9,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxord		%ymm10,%ymm6,%ymm6
	vprold		$7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxord		%ymm11,%ymm7,%ymm7
	vprold		$7,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd		%ymm0,%ymm5,%ymm0
	vpxord		%ymm0,%ymm15,%ymm15
	vprold		$16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd		%ymm1,%ymm6,%ymm1
	vpxord		%ymm1,%ymm12,%ymm12
	vprold		$16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd		%ymm2,%ymm7,%ymm2
	vpxord		%ymm2,%ymm13,%ymm13
	vprold		$16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd		%ymm3,%ymm4,%ymm3
	vpxord		%ymm3,%ymm14,%ymm14
	vprold		$16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxord		%ymm10,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxord		%ymm11,%ymm6,%ymm6
	vprold		$12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxord		%ymm8,%ymm7,%ymm7
	vprold		$12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxord		%ymm9,%ymm4,%ymm4
	vprold		$12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd		%ymm0,%ymm5,%ymm0
	vpxord		%ymm0,%ymm15,%ymm15
	vprold		$8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd		%ymm1,%ymm6,%ymm1
	vpxord		%ymm1,%ymm12,%ymm12
	vprold		$8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd		%ymm2,%ymm7,%ymm2
	vpxord		%ymm2,%ymm13,%ymm13
	vprold		$8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd		%ymm3,%ymm4,%ymm3
	vpxord		%ymm3,%ymm14,%ymm14
	vprold		$8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxord		%ymm10,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxord		%ymm11,%ymm6,%ymm6
	vprold		$7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxord		%ymm8,%ymm7,%ymm7
	vprold		$7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxord		%ymm9,%ymm4,%ymm4
	vprold		$7,%ymm4,%ymm4

	sub		$2,%r8d
	jnz		.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpaddd		%ymm16,%ymm0,%ymm0
	vpaddd		%ymm17,%ymm1,%ymm1
	vpaddd		%ymm18,%ymm2,%ymm2
	vpaddd		%ymm19,%ymm3,%ymm3
	vpaddd		%ymm20,%ymm4,%ymm4
	vpaddd		%ymm21,%ymm5,%ymm5
	vpaddd		%ymm22,%ymm6,%ymm6
	vpaddd		%ymm23,%ymm7,%ymm7
	vpaddd		%ymm24,%ymm8,%ymm8
	vpaddd		%ymm25,%ymm9,%ymm9
	vpaddd		%ymm26,%ymm10,%ymm10
	vpaddd		%ymm27,%ymm11,%ymm11
	vpaddd		%ymm28,%ymm12,%ymm12
	vpaddd		%ymm29,%ymm13,%ymm13
	vpaddd		%ymm30,%ymm14,%ymm14
	vpaddd		%ymm31,%ymm15,%ymm15

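	# At this point the data is word-sliced: ymm register n holds word n
	# of all eight blocks.  The interleave stages below (32-bit, 64-bit,
	# then 128-bit granularity) transpose it back so each ymm register
	# holds 32 consecutive keystream bytes, which are XORed with the
	# input and stored, again with a length check before every chunk.
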
	# interleave 32-bit words in state n, n+1
	vpunpckldq	%ymm1,%ymm0,%ymm16
	vpunpckhdq	%ymm1,%ymm0,%ymm17
	vpunpckldq	%ymm3,%ymm2,%ymm18
	vpunpckhdq	%ymm3,%ymm2,%ymm19
	vpunpckldq	%ymm5,%ymm4,%ymm20
	vpunpckhdq	%ymm5,%ymm4,%ymm21
	vpunpckldq	%ymm7,%ymm6,%ymm22
	vpunpckhdq	%ymm7,%ymm6,%ymm23
	vpunpckldq	%ymm9,%ymm8,%ymm24
	vpunpckhdq	%ymm9,%ymm8,%ymm25
	vpunpckldq	%ymm11,%ymm10,%ymm26
	vpunpckhdq	%ymm11,%ymm10,%ymm27
	vpunpckldq	%ymm13,%ymm12,%ymm28
	vpunpckhdq	%ymm13,%ymm12,%ymm29
	vpunpckldq	%ymm15,%ymm14,%ymm30
	vpunpckhdq	%ymm15,%ymm14,%ymm31

	# interleave 64-bit words in state n, n+2
	vpunpcklqdq	%ymm18,%ymm16,%ymm0
	vpunpcklqdq	%ymm19,%ymm17,%ymm1
	vpunpckhqdq	%ymm18,%ymm16,%ymm2
	vpunpckhqdq	%ymm19,%ymm17,%ymm3
	vpunpcklqdq	%ymm22,%ymm20,%ymm4
	vpunpcklqdq	%ymm23,%ymm21,%ymm5
	vpunpckhqdq	%ymm22,%ymm20,%ymm6
	vpunpckhqdq	%ymm23,%ymm21,%ymm7
	vpunpcklqdq	%ymm26,%ymm24,%ymm8
	vpunpcklqdq	%ymm27,%ymm25,%ymm9
	vpunpckhqdq	%ymm26,%ymm24,%ymm10
	vpunpckhqdq	%ymm27,%ymm25,%ymm11
	vpunpcklqdq	%ymm30,%ymm28,%ymm12
	vpunpcklqdq	%ymm31,%ymm29,%ymm13
	vpunpckhqdq	%ymm30,%ymm28,%ymm14
	vpunpckhqdq	%ymm31,%ymm29,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa64	%ymm0,%ymm16
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
	cmp		$0x0020,%rcx
	jl		.Lxorpart8
	vpxord		0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0000(%rsi)
	vmovdqa64	%ymm16,%ymm0
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp		$0x0040,%rcx
	jl		.Lxorpart8
	vpxord		0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
	cmp		$0x0060,%rcx
	jl		.Lxorpart8
	vpxord		0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp		$0x0080,%rcx
	jl		.Lxorpart8
	vpxord		0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp		$0x00a0,%rcx
	jl		.Lxorpart8
	vpxord		0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp		$0x00c0,%rcx
	jl		.Lxorpart8
	vpxord		0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
	cmp		$0x00e0,%rcx
	jl		.Lxorpart8
	vpxord		0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp		$0x0100,%rcx
	jl		.Lxorpart8
	vpxord		0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa64	%ymm4,%ymm0
	cmp		$0x0120,%rcx
	jl		.Lxorpart8
	vpxord		0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0100(%rsi)

	vmovdqa64	%ymm12,%ymm0
	cmp		$0x0140,%rcx
	jl		.Lxorpart8
	vpxord		0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0120(%rsi)

	vmovdqa64	%ymm6,%ymm0
	cmp		$0x0160,%rcx
	jl		.Lxorpart8
	vpxord		0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0140(%rsi)

	vmovdqa64	%ymm14,%ymm0
	cmp		$0x0180,%rcx
	jl		.Lxorpart8
	vpxord		0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0160(%rsi)

	vmovdqa64	%ymm5,%ymm0
	cmp		$0x01a0,%rcx
	jl		.Lxorpart8
	vpxord		0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0180(%rsi)

	vmovdqa64	%ymm13,%ymm0
	cmp		$0x01c0,%rcx
	jl		.Lxorpart8
	vpxord		0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01a0(%rsi)

	vmovdqa64	%ymm7,%ymm0
	cmp		$0x01e0,%rcx
	jl		.Lxorpart8
	vpxord		0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01c0(%rsi)

	vmovdqa64	%ymm15,%ymm0
	cmp		$0x0200,%rcx
	jl		.Lxorpart8
	vpxord		0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	ret

.Lxorpart8:
	# xor remaining bytes from partial register into output
	mov		%rcx,%rax
	and		$0x1f,%rcx
	jz		.Ldone8
	mov		%rax,%r9
	and		$~0x1f,%r9

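	# Same masked-tail technique as in .Lxorpart2/.Lxorpart4, but on a
	# 32-byte chunk: build a mask of (len & 0x1f) one bits in %k1, then
	# do a masked load, XOR and store of the final partial block.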
	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
	vpxord		%ymm0,%ymm1,%ymm1
	vmovdqu8	%ymm1,(%rsi,%r9){%k1}

	jmp		.Ldone8

SYM_FUNC_END(chacha_8block_xor_avx512vl)