/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
/*
 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 */
/*
 * This is an optimized implementation of AES128/192/256 in CTR mode. It
 * requires support for the Intel(R) AES-NI and AVX instruction set
 * extensions.
 *
 * This work was inspired by the AES CTR mode optimization published in the
 * Intel Optimized IPSEC Cryptographic library. Additional information on it
 * can be found at:
 * https://github.com/intel/intel-ipsec-mb
 */
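/*
 * For reference, a minimal C sketch (not code from this file) of what the
 * by8 routines compute: the keystream is AES applied to successive counter
 * blocks, XORed into the data. The assembly below produces the same result
 * but generates and encrypts up to eight counter blocks in parallel.
 * aes_encrypt_block() is a hypothetical single-block AES primitive used only
 * for illustration.
 *
 *      #include <stdint.h>
 *      #include <stddef.h>
 *
 *      void aes_encrypt_block(const void *round_keys, const uint8_t in[16],
 *                             uint8_t out[16]);
 *
 *      // big-endian increment of a 128-bit counter block
 *      static void ctr128_inc(uint8_t ctr[16])
 *      {
 *              for (int i = 15; i >= 0; i--)
 *                      if (++ctr[i])
 *                              break;
 *      }
 *
 *      static void ctr_crypt_ref(const void *round_keys, uint8_t ctr[16],
 *                                const uint8_t *in, uint8_t *out,
 *                                size_t nbytes)
 *      {
 *              uint8_t ks[16];
 *
 *              // full 16-byte blocks only, as in the assembly below
 *              for (size_t off = 0; off + 16 <= nbytes; off += 16) {
 *                      aes_encrypt_block(round_keys, ctr, ks);
 *                      ctr128_inc(ctr);
 *                      for (int i = 0; i < 16; i++)
 *                              out[off + i] = in[off + i] ^ ks[i];
 *              }
 *      }
 */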

#include <linux/linkage.h>

#define VMOVDQ vmovdqu

/*
 * Note: the "x" prefix in these aliases means "this is an xmm register". The
 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
 * counter".
 */
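/*
 * For reference only, a sketch (not code from this file) of how the two modes
 * form the block that gets encrypted: CTR byte-swaps a big-endian counter and
 * increments it, while XCTR XORs a little-endian block number into the IV.
 * That difference is why %xmm8/%xmm9 can serve different purposes in the two
 * modes. The helper name below is ours, for illustration only.
 *
 *      #include <stdint.h>
 *      #include <string.h>
 *
 *      // Build the XCTR input for 1-based block number 'blk': LE(blk) ^ IV.
 *      static void xctr_block_input(uint8_t out[16], const uint8_t iv[16],
 *                                   uint64_t blk)
 *      {
 *              memcpy(out, iv, 16);
 *              for (int i = 0; i < 8; i++)
 *                      out[i] ^= (uint8_t)(blk >> (8 * i));
 *      }
 */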
#define xdata0          %xmm0
#define xdata1          %xmm1
#define xdata2          %xmm2
#define xdata3          %xmm3
#define xdata4          %xmm4
#define xdata5          %xmm5
#define xdata6          %xmm6
#define xdata7          %xmm7
#define xcounter        %xmm8   // CTR mode only
#define xiv             %xmm8   // XCTR mode only
#define xbyteswap       %xmm9   // CTR mode only
#define xtmp            %xmm9   // XCTR mode only
#define xkey0           %xmm10
#define xkey4           %xmm11
#define xkey8           %xmm12
#define xkey12          %xmm13
#define xkeyA           %xmm14
#define xkeyB           %xmm15

#define p_in            %rdi
#define p_iv            %rsi
#define p_keys          %rdx
#define p_out           %rcx
#define num_bytes       %r8
#define counter         %r9     // XCTR mode only
#define tmp             %r10
#define DDQ_DATA        0
#define XDATA           1
#define KEY_128         1
#define KEY_192         2
#define KEY_256         3

.section .rodata
.align 16

byteswap_const:
        .octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
        .octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
        .octa 0x00000000000000010000000000000000
ddq_add_1:
        .octa 0x00000000000000000000000000000001
ddq_add_2:
        .octa 0x00000000000000000000000000000002
ddq_add_3:
        .octa 0x00000000000000000000000000000003
ddq_add_4:
        .octa 0x00000000000000000000000000000004
ddq_add_5:
        .octa 0x00000000000000000000000000000005
ddq_add_6:
        .octa 0x00000000000000000000000000000006
ddq_add_7:
        .octa 0x00000000000000000000000000000007
ddq_add_8:
        .octa 0x00000000000000000000000000000008

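/*
 * The vpaddq instructions below add only to the low 64 bits of the counter
 * (kept little-endian in an xmm register before the byte swap), so
 * ddq_low_msk and ddq_high_add_1 are used to detect when the low qword wraps
 * to zero and to propagate the carry into the high qword. A rough C
 * equivalent of that scheme (illustrative only, names are ours):
 *
 *      #include <stdint.h>
 *
 *      // 128-bit counter as the xmm register holds it: low/high 64-bit halves
 *      struct ctr128 { uint64_t lo, hi; };
 *
 *      static void ctr_by8_counters(struct ctr128 *ctr, struct ctr128 blk[],
 *                                   int n)
 *      {
 *              blk[0] = *ctr;                  // block 0 is the counter itself
 *              for (int i = 1; i < n; i++) {
 *                      blk[i].lo = ctr->lo + i; // vpaddq ddq_add_<i> (low qword)
 *                      blk[i].hi = ctr->hi;
 *                      if (blk[i].lo == 0) {   // vptest ddq_low_msk; jnz 1f
 *                              blk[i].hi++;    // vpaddq ddq_high_add_1
 *                              ctr->hi++;      // ...and into the saved counter
 *                      }
 *              }
 *              ctr->lo += n;                   // advance past this group
 *              if (ctr->lo == 0)
 *                      ctr->hi++;
 *      }
 */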
.text

/* generate a unique variable for an xmm register */
.macro setxdata n
        var_xdata = %xmm\n
.endm

/*
 * "club" the numeric 'id' onto the symbol 'name': for name == XDATA this
 * points var_xdata at the xmm register numbered 'id'.
 */
.macro club name, id
.altmacro
        .if \name == XDATA
                setxdata %\id
        .endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len xctr
 *
 * Encrypt 'num_in_par' counter blocks in parallel: 'load_keys' selects
 * whether the round keys are loaded from p_keys, 'key_len' is one of
 * KEY_128/KEY_192/KEY_256, and 'xctr' selects XCTR instead of CTR mode.
 * This increments p_in, but not p_out.
 */
.macro do_aes b, k, key_len, xctr
        .set by, \b
        .set load_keys, \k
        .set klen, \key_len

        .if (load_keys)
                vmovdqa 0*16(p_keys), xkey0
        .endif

        /*
         * Build the 'by' keystream input blocks: for XCTR this is the
         * little-endian block counter XORed with the IV; for CTR it is the
         * byte-swapped counter, with the carry from the low 64 bits
         * propagated into the high 64 bits when the low qword wraps.
         */
        .if \xctr
                movq counter, xtmp
                .set i, 0
                .rept (by)
                        club XDATA, i
                        vpaddq (ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
                        .set i, (i +1)
                .endr
                .set i, 0
                .rept (by)
                        club XDATA, i
                        vpxor xiv, var_xdata, var_xdata
                        .set i, (i +1)
                .endr
        .else
                vpshufb xbyteswap, xcounter, xdata0
                .set i, 1
                .rept (by - 1)
                        club XDATA, i
                        vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
                        vptest ddq_low_msk(%rip), var_xdata
                        jnz 1f
                        vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
                        vpaddq ddq_high_add_1(%rip), xcounter, xcounter
                        1:
                        vpshufb xbyteswap, var_xdata, var_xdata
                        .set i, (i +1)
                .endr
        .endif

        vmovdqa 1*16(p_keys), xkeyA

        vpxor xkey0, xdata0, xdata0

        .if \xctr
                add $by, counter
        .else
                vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
                vptest ddq_low_msk(%rip), xcounter
                jnz 1f
                vpaddq ddq_high_add_1(%rip), xcounter, xcounter
                1:
        .endif

        .set i, 1
        .rept (by - 1)
                club XDATA, i
                vpxor xkey0, var_xdata, var_xdata
                .set i, (i +1)
        .endr

        vmovdqa 2*16(p_keys), xkeyB

        .set i, 0
        .rept by
                club XDATA, i
                vaesenc xkeyA, var_xdata, var_xdata             /* key 1 */
                .set i, (i +1)
        .endr

        .if (klen == KEY_128)
                .if (load_keys)
                        vmovdqa 3*16(p_keys), xkey4
                .endif
        .else
                vmovdqa 3*16(p_keys), xkeyA
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                vaesenc xkeyB, var_xdata, var_xdata             /* key 2 */
                .set i, (i +1)
        .endr

        add $(16*by), p_in

        .if (klen == KEY_128)
                vmovdqa 4*16(p_keys), xkeyB
        .else
                .if (load_keys)
                        vmovdqa 4*16(p_keys), xkey4
                .endif
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                /* key 3 */
                .if (klen == KEY_128)
                        vaesenc xkey4, var_xdata, var_xdata
                .else
                        vaesenc xkeyA, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        vmovdqa 5*16(p_keys), xkeyA

        .set i, 0
        .rept by
                club XDATA, i
                /* key 4 */
                .if (klen == KEY_128)
                        vaesenc xkeyB, var_xdata, var_xdata
                .else
                        vaesenc xkey4, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        .if (klen == KEY_128)
                .if (load_keys)
                        vmovdqa 6*16(p_keys), xkey8
                .endif
        .else
                vmovdqa 6*16(p_keys), xkeyB
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                vaesenc xkeyA, var_xdata, var_xdata             /* key 5 */
                .set i, (i +1)
        .endr

        vmovdqa 7*16(p_keys), xkeyA

        .set i, 0
        .rept by
                club XDATA, i
                /* key 6 */
                .if (klen == KEY_128)
                        vaesenc xkey8, var_xdata, var_xdata
                .else
                        vaesenc xkeyB, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        .if (klen == KEY_128)
                vmovdqa 8*16(p_keys), xkeyB
        .else
                .if (load_keys)
                        vmovdqa 8*16(p_keys), xkey8
                .endif
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                vaesenc xkeyA, var_xdata, var_xdata             /* key 7 */
                .set i, (i +1)
        .endr

        .if (klen == KEY_128)
                .if (load_keys)
                        vmovdqa 9*16(p_keys), xkey12
                .endif
        .else
                vmovdqa 9*16(p_keys), xkeyA
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                /* key 8 */
                .if (klen == KEY_128)
                        vaesenc xkeyB, var_xdata, var_xdata
                .else
                        vaesenc xkey8, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        vmovdqa 10*16(p_keys), xkeyB

        .set i, 0
        .rept by
                club XDATA, i
                /* key 9 */
                .if (klen == KEY_128)
                        vaesenc xkey12, var_xdata, var_xdata
                .else
                        vaesenc xkeyA, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        .if (klen != KEY_128)
                vmovdqa 11*16(p_keys), xkeyA
        .endif

        .set i, 0
        .rept by
                club XDATA, i
                /* key 10 */
                .if (klen == KEY_128)
                        vaesenclast xkeyB, var_xdata, var_xdata
                .else
                        vaesenc xkeyB, var_xdata, var_xdata
                .endif
                .set i, (i +1)
        .endr

        .if (klen != KEY_128)
                .if (load_keys)
                        vmovdqa 12*16(p_keys), xkey12
                .endif

                .set i, 0
                .rept by
                        club XDATA, i
                        vaesenc xkeyA, var_xdata, var_xdata     /* key 11 */
                        .set i, (i +1)
                .endr

                .if (klen == KEY_256)
                        vmovdqa 13*16(p_keys), xkeyA
                .endif

                .set i, 0
                .rept by
                        club XDATA, i
                        .if (klen == KEY_256)
                                /* key 12 */
                                vaesenc xkey12, var_xdata, var_xdata
                        .else
                                vaesenclast xkey12, var_xdata, var_xdata
                        .endif
                        .set i, (i +1)
                .endr

                .if (klen == KEY_256)
                        vmovdqa 14*16(p_keys), xkeyB

                        .set i, 0
                        .rept by
                                club XDATA, i
                                /* key 13 */
                                vaesenc xkeyA, var_xdata, var_xdata
                                .set i, (i +1)
                        .endr

                        .set i, 0
                        .rept by
                                club XDATA, i
                                /* key 14 */
                                vaesenclast xkeyB, var_xdata, var_xdata
                                .set i, (i +1)
                        .endr
                .endif
        .endif

        /*
         * XOR the keystream blocks with the input, two blocks per iteration
         * (p_in was already advanced past this group, hence the negative
         * offsets).
         */
        .set i, 0
        .rept (by / 2)
                .set j, (i+1)
                VMOVDQ (i*16 - 16*by)(p_in), xkeyA
                VMOVDQ (j*16 - 16*by)(p_in), xkeyB
                club XDATA, i
                vpxor xkeyA, var_xdata, var_xdata
                club XDATA, j
                vpxor xkeyB, var_xdata, var_xdata
                .set i, (i+2)
        .endr

        /* odd block count: XOR the remaining input block */
        .if (i < by)
                VMOVDQ (i*16 - 16*by)(p_in), xkeyA
                club XDATA, i
                vpxor xkeyA, var_xdata, var_xdata
        .endif

        /* store the output blocks */
        .set i, 0
        .rept by
                club XDATA, i
                VMOVDQ var_xdata, i*16(p_out)
                .set i, (i+1)
        .endr
.endm

.macro do_aes_load val, key_len, xctr
        do_aes \val, 1, \key_len, \xctr
.endm

.macro do_aes_noload val, key_len, xctr
        do_aes \val, 0, \key_len, \xctr
.endm

/* main body of the AES CTR/XCTR routines */

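/*
 * For orientation, a rough C outline (illustrative only, not kernel code) of
 * the control flow the macro below generates; num_bytes is assumed to be a
 * whole number of 16-byte blocks, and encrypt_blocks() is a hypothetical
 * stand-in for the do_aes_load/do_aes_noload expansions.
 *
 *      void encrypt_blocks(unsigned int nblocks);
 *
 *      static void ctrmain_outline(unsigned long num_bytes)
 *      {
 *              if (num_bytes < 16)
 *                      return;                         // jb .Ldo_return2
 *
 *              unsigned long rem = num_bytes & (7 * 16); // leftover blocks
 *
 *              if (rem) {
 *                      encrypt_blocks(rem / 16);       // one of .Leq1 .. .Leq7
 *                      num_bytes &= ~(8UL * 16 - 1);   // multiple of 8 blocks
 *                      if (!num_bytes)
 *                              return;
 *              }
 *              do {                                    // .Lmain_loop2
 *                      encrypt_blocks(8);
 *                      num_bytes -= 8 * 16;
 *              } while (num_bytes);
 *      }
 */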
.macro do_aes_ctrmain key_len, xctr
        cmp $16, num_bytes
        jb .Ldo_return2\xctr\key_len

        .if \xctr
                shr $4, counter
                vmovdqu (p_iv), xiv
        .else
                vmovdqa byteswap_const(%rip), xbyteswap
                vmovdqu (p_iv), xcounter
                vpshufb xbyteswap, xcounter, xcounter
        .endif

        mov num_bytes, tmp
        and $(7*16), tmp
        jz .Lmult_of_8_blks\xctr\key_len

        /* 1 <= tmp <= 7 */
        cmp $(4*16), tmp
        jg .Lgt4\xctr\key_len
        je .Leq4\xctr\key_len

.Llt4\xctr\key_len:
        cmp $(2*16), tmp
        jg .Leq3\xctr\key_len
        je .Leq2\xctr\key_len

.Leq1\xctr\key_len:
        do_aes_load 1, \key_len, \xctr
        add $(1*16), p_out
        and $(~7*16), num_bytes
        jz .Ldo_return2\xctr\key_len
        jmp .Lmain_loop2\xctr\key_len

.Leq2\xctr\key_len:
        do_aes_load 2, \key_len, \xctr
        add $(2*16), p_out
        and $(~7*16), num_bytes
        jz .Ldo_return2\xctr\key_len
        jmp .Lmain_loop2\xctr\key_len

.Leq3\xctr\key_len:
        do_aes_load 3, \key_len, \xctr
        add $(3*16), p_out
        and $(~7*16), num_bytes
        jz .Ldo_return2\xctr\key_len
        jmp .Lmain_loop2\xctr\key_len

.Leq4\xctr\key_len:
        do_aes_load 4, \key_len, \xctr
        add $(4*16), p_out
        and $(~7*16), num_bytes
        jz .Ldo_return2\xctr\key_len
        jmp .Lmain_loop2\xctr\key_len

.Lgt4\xctr\key_len:
        cmp $(6*16), tmp
        jg .Leq7\xctr\key_len
        je .Leq6\xctr\key_len

.Leq5\xctr\key_len:
        do_aes_load 5, \key_len, \xctr
        add $(5*16), p_out
        and $(~7*16), num_bytes
        jz .Ldo_return2\xctr\key_len
        jmp .Lmain_loop2\xctr\key_len

.Leq6\xctr\key_len:
        do_aes_load 6, \key_len, \xctr
        add $(6*16), p_out
        and $(~7*16), num_bytes
        jz .Ldo_return2\xctr\key_len
        jmp .Lmain_loop2\xctr\key_len

.Leq7\xctr\key_len:
        do_aes_load 7, \key_len, \xctr
        add $(7*16), p_out
        and $(~7*16), num_bytes
        jz .Ldo_return2\xctr\key_len
        jmp .Lmain_loop2\xctr\key_len

.Lmult_of_8_blks\xctr\key_len:
        .if (\key_len != KEY_128)
                vmovdqa 0*16(p_keys), xkey0
                vmovdqa 4*16(p_keys), xkey4
                vmovdqa 8*16(p_keys), xkey8
                vmovdqa 12*16(p_keys), xkey12
        .else
                vmovdqa 0*16(p_keys), xkey0
                vmovdqa 3*16(p_keys), xkey4
                vmovdqa 6*16(p_keys), xkey8
                vmovdqa 9*16(p_keys), xkey12
        .endif
.align 16
.Lmain_loop2\xctr\key_len:
        /* num_bytes is a multiple of 8 blocks (128 bytes) and > 0 */
        do_aes_noload 8, \key_len, \xctr
        add $(8*16), p_out
        sub $(8*16), num_bytes
        jne .Lmain_loop2\xctr\key_len

.Ldo_return2\xctr\key_len:
        .if !\xctr
                /* return updated IV */
                vpshufb xbyteswap, xcounter, xcounter
                vmovdqu xcounter, (p_iv)
        .endif
        RET
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *                      unsigned int num_bytes)
 */
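/*
 * A sketch of how C glue code might declare and call this routine; the
 * prototype mirrors the comment above, while kernel_fpu_begin()/end() and the
 * helper name are caller-side assumptions for illustration, not part of this
 * file.
 *
 *      #include <linux/types.h>
 *      #include <linux/linkage.h>
 *      #include <asm/fpu/api.h>
 *
 *      asmlinkage void aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys,
 *                                              void *out,
 *                                              unsigned int num_bytes);
 *
 *      static void ctr_crypt_by8_example(void *round_keys, u8 *iv,
 *                                        const u8 *src, u8 *dst,
 *                                        unsigned int nbytes)
 *      {
 *              kernel_fpu_begin();     // the asm clobbers xmm0-xmm15
 *              aes_ctr_enc_128_avx_by8((void *)src, iv, round_keys, dst,
 *                                      nbytes & ~15u); // full blocks only
 *              kernel_fpu_end();
 *      }
 */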
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
        /* call the aes main loop */
        do_aes_ctrmain KEY_128 0

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *                      unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
        /* call the aes main loop */
        do_aes_ctrmain KEY_192 0

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *                      unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
        /* call the aes main loop */
        do_aes_ctrmain KEY_256 0

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)

/*
 * routine to do AES128 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *                      u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 */
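/*
 * A sketch of a caller of the XCTR entry points (illustrative assumptions:
 * kernel_fpu_begin()/end() and the helper name are ours, not from this file).
 * byte_ctr is the number of bytes already processed by earlier calls, a
 * multiple of 16; the routine shifts it right by 4 to obtain the starting
 * block index it XORs into the IV.
 *
 *      #include <linux/types.h>
 *      #include <linux/linkage.h>
 *      #include <asm/fpu/api.h>
 *
 *      asmlinkage void aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv,
 *                                               const void *keys, u8 *out,
 *                                               unsigned int num_bytes,
 *                                               unsigned int byte_ctr);
 *
 *      static void xctr_crypt_by8_example(const void *round_keys,
 *                                         const u8 *iv, const u8 *src,
 *                                         u8 *dst, unsigned int nbytes,
 *                                         unsigned int byte_ctr)
 *      {
 *              kernel_fpu_begin();     // the asm clobbers xmm0-xmm15
 *              aes_xctr_enc_128_avx_by8(src, iv, round_keys, dst,
 *                                       nbytes & ~15u, byte_ctr);
 *              kernel_fpu_end();
 *      }
 */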
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
        /* call the aes main loop */
        do_aes_ctrmain KEY_128 1

SYM_FUNC_END(aes_xctr_enc_128_avx_by8)

/*
 * routine to do AES192 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *                      u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
        /* call the aes main loop */
        do_aes_ctrmain KEY_192 1

SYM_FUNC_END(aes_xctr_enc_192_avx_by8)

/*
 * routine to do AES256 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *                      u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 */
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
        /* call the aes main loop */
        do_aes_ctrmain KEY_256 1

SYM_FUNC_END(aes_xctr_enc_256_avx_by8)