Merge tag 'erofs-for-6.8-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-2.6-block.git] / arch / arm64 / crypto / aes-modes.S
CommitLineData
d2912cb1 1/* SPDX-License-Identifier: GPL-2.0-only */
49788fe2
AB
2/*
3 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
4 *
4860620d 5 * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
49788fe2
AB
6 */
7
8/* included by aes-ce.S and aes-neon.S */
9
10	.text
11	.align	4
12

e2174139
AB
/*
 * MAX_STRIDE is the parallel interleave factor: how many AES blocks the
 * bulk loops process per iteration (4 or 5).  The including file
 * (aes-ce.S or aes-neon.S) may predefine it; default is 4.
 */
13#ifndef MAX_STRIDE
14#define MAX_STRIDE	4
15#endif
16

7367bfeb
AB
/*
 * ST4()/ST5() emit their argument only when MAX_STRIDE is 4 or 5
 * respectively, so stride-specific instructions can be written inline
 * in the shared code paths below.
 */
17#if MAX_STRIDE == 4
18#define ST4(x...) x
19#define ST5(x...)
20#else
21#define ST4(x...)
22#define ST5(x...) x
23#endif
24
/*
 * Local subroutines that en/decrypt 4 (or, when MAX_STRIDE == 5, 5)
 * blocks in parallel in v0-v3 (+v4).  w3 = #rounds, x2 = round keys;
 * x8/w7 are scratch for the block macros, which are provided by the
 * including file (aes-ce.S / aes-neon.S).
 */
0e89640b 25SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
6e7de6af 26 encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
49788fe2 27 ret
0e89640b 28SYM_FUNC_END(aes_encrypt_block4x)
49788fe2 29
0e89640b 30SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
6e7de6af 31 decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
49788fe2 32 ret
0e89640b 33SYM_FUNC_END(aes_decrypt_block4x)
49788fe2 34
e2174139 35#if MAX_STRIDE == 5
0e89640b 36SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
e2174139
AB
37 encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
38 ret
0e89640b 39SYM_FUNC_END(aes_encrypt_block5x)
e2174139 40
0e89640b 41SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
e2174139
AB
42 decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
43 ret
0e89640b 44SYM_FUNC_END(aes_decrypt_block5x)
e2174139
AB
45#endif
46
49788fe2
AB
47 /*
48 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
68338174 49 * int blocks)
49788fe2 50 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
68338174 51 * int blocks)
49788fe2
AB
52 */
53
/*
 * ECB encryption.  Per the prototype above: x0 = out, x1 = in,
 * x2 = round keys, w3 = #rounds, w4 = #blocks.  Processes MAX_STRIDE
 * blocks per loop iteration, then single blocks for the remainder.
 */
b8e50548 54AES_FUNC_START(aes_ecb_encrypt)
7d709af1 55 frame_push 0
49788fe2 56
6e7de6af 57 enc_prepare w3, x2, x5
49788fe2
AB
58
59.LecbencloopNx:
7367bfeb 60 subs w4, w4, #MAX_STRIDE
49788fe2 61 bmi .Lecbenc1x
6e7de6af 62 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
7367bfeb
AB
63ST4( bl aes_encrypt_block4x )
64ST5( ld1 {v4.16b}, [x1], #16 )
65ST5( bl aes_encrypt_block5x )
6e7de6af 66 st1 {v0.16b-v3.16b}, [x0], #64
7367bfeb 67ST5( st1 {v4.16b}, [x0], #16 )
49788fe2
AB
68 b .LecbencloopNx
69.Lecbenc1x:
/* Undo the last subtraction; w4 now holds the 0..MAX_STRIDE-1 leftover blocks. */
7367bfeb 70 adds w4, w4, #MAX_STRIDE
49788fe2 71 beq .Lecbencout
49788fe2 72.Lecbencloop:
6e7de6af
AB
73 ld1 {v0.16b}, [x1], #16 /* get next pt block */
74 encrypt_block v0, w3, x2, x5, w6
75 st1 {v0.16b}, [x0], #16
76 subs w4, w4, #1
49788fe2
AB
77 bne .Lecbencloop
78.Lecbencout:
7d709af1 79 frame_pop
49788fe2 80 ret
b8e50548 81AES_FUNC_END(aes_ecb_encrypt)
49788fe2
AB
82
83
/*
 * ECB decryption; mirror image of aes_ecb_encrypt above.
 * x0 = out, x1 = in, x2 = round keys, w3 = #rounds, w4 = #blocks.
 */
b8e50548 84AES_FUNC_START(aes_ecb_decrypt)
7d709af1 85 frame_push 0
0c8f838a 86
6e7de6af 87 dec_prepare w3, x2, x5
49788fe2
AB
88
89.LecbdecloopNx:
7367bfeb 90 subs w4, w4, #MAX_STRIDE
49788fe2 91 bmi .Lecbdec1x
6e7de6af 92 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
7367bfeb
AB
93ST4( bl aes_decrypt_block4x )
94ST5( ld1 {v4.16b}, [x1], #16 )
95ST5( bl aes_decrypt_block5x )
6e7de6af 96 st1 {v0.16b-v3.16b}, [x0], #64
7367bfeb 97ST5( st1 {v4.16b}, [x0], #16 )
49788fe2
AB
98 b .LecbdecloopNx
99.Lecbdec1x:
/* Leftover 0..MAX_STRIDE-1 blocks are handled one at a time. */
7367bfeb 100 adds w4, w4, #MAX_STRIDE
49788fe2 101 beq .Lecbdecout
49788fe2 102.Lecbdecloop:
6e7de6af
AB
103 ld1 {v0.16b}, [x1], #16 /* get next ct block */
104 decrypt_block v0, w3, x2, x5, w6
105 st1 {v0.16b}, [x0], #16
106 subs w4, w4, #1
49788fe2
AB
107 bne .Lecbdecloop
108.Lecbdecout:
7d709af1 109 frame_pop
49788fe2 110 ret
b8e50548 111AES_FUNC_END(aes_ecb_decrypt)
49788fe2
AB
112
113
114 /*
115 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
68338174 116 * int blocks, u8 iv[])
49788fe2 117 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
68338174 118 * int blocks, u8 iv[])
735177ca
AB
119 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
120 * int rounds, int blocks, u8 iv[],
121 * u32 const rk2[]);
122 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
123 * int rounds, int blocks, u8 iv[],
124 * u32 const rk2[]);
49788fe2
AB
125 */
126
/*
 * CBC encryption.  aes_essiv_cbc_encrypt first derives the IV by
 * encrypting the sector IV with the second (AES-256) key rk2, then
 * falls through into the common CBC loop shared with aes_cbc_encrypt.
 * Common registers: x0 = out, x1 = in, x2 = rk, w3 = #rounds,
 * w4 = #blocks, x5 = iv; v4 carries the chaining value throughout.
 * CBC encryption is inherently serial, so blocks are chained one
 * encrypt_block at a time even in the "4x" loop (only loads/stores
 * are batched).
 */
b8e50548 127AES_FUNC_START(aes_essiv_cbc_encrypt)
735177ca
AB
128 ld1 {v4.16b}, [x5] /* get iv */
129
130 mov w8, #14 /* AES-256: 14 rounds */
131 enc_prepare w8, x6, x7
132 encrypt_block v4, w8, x6, x7, w9
133 enc_switch_key w3, x2, x6
134 b .Lcbcencloop4x
135
b8e50548 136AES_FUNC_START(aes_cbc_encrypt)
6e7de6af
AB
137 ld1 {v4.16b}, [x5] /* get iv */
138 enc_prepare w3, x2, x6
49788fe2 139

a8f8a69e 140.Lcbcencloop4x:
6e7de6af 141 subs w4, w4, #4
a8f8a69e 142 bmi .Lcbcenc1x
6e7de6af 143 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
a8f8a69e 144 eor v0.16b, v0.16b, v4.16b /* ..and xor with iv */
6e7de6af 145 encrypt_block v0, w3, x2, x6, w7
a8f8a69e 146 eor v1.16b, v1.16b, v0.16b
6e7de6af 147 encrypt_block v1, w3, x2, x6, w7
a8f8a69e 148 eor v2.16b, v2.16b, v1.16b
6e7de6af 149 encrypt_block v2, w3, x2, x6, w7
a8f8a69e 150 eor v3.16b, v3.16b, v2.16b
6e7de6af
AB
151 encrypt_block v3, w3, x2, x6, w7
152 st1 {v0.16b-v3.16b}, [x0], #64
a8f8a69e
AB
/* Last ciphertext block becomes the chaining value for the next group. */
153 mov v4.16b, v3.16b
154 b .Lcbcencloop4x
155.Lcbcenc1x:
6e7de6af 156 adds w4, w4, #4
a8f8a69e
AB
157 beq .Lcbcencout
158.Lcbcencloop:
6e7de6af 159 ld1 {v0.16b}, [x1], #16 /* get next pt block */
a8f8a69e 160 eor v4.16b, v4.16b, v0.16b /* ..and xor with iv */
6e7de6af
AB
161 encrypt_block v4, w3, x2, x6, w7
162 st1 {v4.16b}, [x0], #16
163 subs w4, w4, #1
49788fe2 164 bne .Lcbcencloop
a8f8a69e 165.Lcbcencout:
6e7de6af 166 st1 {v4.16b}, [x5] /* return iv */
49788fe2 167 ret
b8e50548
MB
168AES_FUNC_END(aes_cbc_encrypt)
169AES_FUNC_END(aes_essiv_cbc_encrypt)
735177ca 170
/*
 * CBC decryption.  The ESSIV entry derives the IV (encrypt sector IV
 * with rk2, AES-256) and then joins the common path.  Unlike CBC
 * encryption, decryption parallelizes: MAX_STRIDE ciphertext blocks
 * are decrypted at once and then XORed with the previous ciphertext
 * blocks.  cbciv (register alias presumably set up by the including
 * file — confirm there) holds the running IV / previous ct block.
 */
b8e50548 171AES_FUNC_START(aes_essiv_cbc_decrypt)
735177ca 172 ld1 {cbciv.16b}, [x5] /* get iv */
49788fe2 173
735177ca
AB
174 mov w8, #14 /* AES-256: 14 rounds */
175 enc_prepare w8, x6, x7
176 encrypt_block cbciv, w8, x6, x7, w9
177 b .Lessivcbcdecstart
49788fe2 178
b8e50548 179AES_FUNC_START(aes_cbc_decrypt)
7367bfeb 180 ld1 {cbciv.16b}, [x5] /* get iv */
735177ca 181.Lessivcbcdecstart:
7d709af1 182 frame_push 0
6e7de6af 183 dec_prepare w3, x2, x6
49788fe2
AB
184
185.LcbcdecloopNx:
7367bfeb 186 subs w4, w4, #MAX_STRIDE
49788fe2 187 bmi .Lcbcdec1x
6e7de6af 188 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
7367bfeb
AB
/*
 * Save copies of the ciphertext blocks before decrypting in place;
 * each plaintext block is then ct[i] decrypted XOR ct[i-1].
 */
189#if MAX_STRIDE == 5
190 ld1 {v4.16b}, [x1], #16 /* get 1 ct block */
191 mov v5.16b, v0.16b
192 mov v6.16b, v1.16b
193 mov v7.16b, v2.16b
194 bl aes_decrypt_block5x
195 sub x1, x1, #32
196 eor v0.16b, v0.16b, cbciv.16b
197 eor v1.16b, v1.16b, v5.16b
198 ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */
199 ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
200 eor v2.16b, v2.16b, v6.16b
201 eor v3.16b, v3.16b, v7.16b
202 eor v4.16b, v4.16b, v5.16b
203#else
49788fe2
AB
204 mov v4.16b, v0.16b
205 mov v5.16b, v1.16b
206 mov v6.16b, v2.16b
55868b45 207 bl aes_decrypt_block4x
6e7de6af 208 sub x1, x1, #16
7367bfeb 209 eor v0.16b, v0.16b, cbciv.16b
49788fe2 210 eor v1.16b, v1.16b, v4.16b
7367bfeb 211 ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
49788fe2
AB
212 eor v2.16b, v2.16b, v5.16b
213 eor v3.16b, v3.16b, v6.16b
7367bfeb 214#endif
6e7de6af 215 st1 {v0.16b-v3.16b}, [x0], #64
7367bfeb 216ST5( st1 {v4.16b}, [x0], #16 )
49788fe2
AB
217 b .LcbcdecloopNx
218.Lcbcdec1x:
7367bfeb 219 adds w4, w4, #MAX_STRIDE
49788fe2 220 beq .Lcbcdecout
49788fe2 221.Lcbcdecloop:
6e7de6af 222 ld1 {v1.16b}, [x1], #16 /* get next ct block */
49788fe2 223 mov v0.16b, v1.16b /* ...and copy to v0 */
6e7de6af 224 decrypt_block v0, w3, x2, x6, w7
7367bfeb
AB
225 eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */
226 mov cbciv.16b, v1.16b /* ct is next iv */
6e7de6af
AB
227 st1 {v0.16b}, [x0], #16
228 subs w4, w4, #1
49788fe2
AB
229 bne .Lcbcdecloop
230.Lcbcdecout:
7367bfeb 231 st1 {cbciv.16b}, [x5] /* return iv */
7d709af1 232 frame_pop
49788fe2 233 ret
b8e50548
MB
234AES_FUNC_END(aes_cbc_decrypt)
235AES_FUNC_END(aes_essiv_cbc_decrypt)
49788fe2
AB
236
237
dd597fb3
AB
238 /*
239 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
240 * int rounds, int bytes, u8 const iv[])
241 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
242 * int rounds, int bytes, u8 const iv[])
243 */
244
/*
 * CBC ciphertext-stealing encryption for the final, partial pair of
 * blocks.  w4 = total bytes (16 < bytes <= 32).  .Lcts_permute_table
 * supplies tbl/tbx masks that shift the partial block into place;
 * overlapping loads/stores avoid touching memory beyond the buffers.
 * Note the CTS block swap: the full ciphertext block is stored last
 * (at x0) and the truncated one at the end.
 */
b8e50548 245AES_FUNC_START(aes_cbc_cts_encrypt)
dd597fb3
AB
246 adr_l x8, .Lcts_permute_table
247 sub x4, x4, #16
248 add x9, x8, #32
249 add x8, x8, x4
250 sub x9, x9, x4
251 ld1 {v3.16b}, [x8]
252 ld1 {v4.16b}, [x9]
253
254 ld1 {v0.16b}, [x1], x4 /* overlapping loads */
255 ld1 {v1.16b}, [x1]
256
257 ld1 {v5.16b}, [x5] /* get iv */
258 enc_prepare w3, x2, x6
259
260 eor v0.16b, v0.16b, v5.16b /* xor with iv */
261 tbl v1.16b, {v1.16b}, v4.16b
262 encrypt_block v0, w3, x2, x6, w7
263
264 eor v1.16b, v1.16b, v0.16b
265 tbl v0.16b, {v0.16b}, v3.16b
266 encrypt_block v1, w3, x2, x6, w7
267
268 add x4, x0, x4
269 st1 {v0.16b}, [x4] /* overlapping stores */
270 st1 {v1.16b}, [x0]
271 ret
b8e50548 272AES_FUNC_END(aes_cbc_cts_encrypt)
dd597fb3 273
/*
 * CBC ciphertext-stealing decryption; inverse of the routine above.
 * Decrypts the swapped final block pair using the same permute-table
 * masks and overlapping load/store trick.
 */
b8e50548 274AES_FUNC_START(aes_cbc_cts_decrypt)
dd597fb3
AB
275 adr_l x8, .Lcts_permute_table
276 sub x4, x4, #16
277 add x9, x8, #32
278 add x8, x8, x4
279 sub x9, x9, x4
280 ld1 {v3.16b}, [x8]
281 ld1 {v4.16b}, [x9]
282
283 ld1 {v0.16b}, [x1], x4 /* overlapping loads */
284 ld1 {v1.16b}, [x1]
285
286 ld1 {v5.16b}, [x5] /* get iv */
287 dec_prepare w3, x2, x6
288
dd597fb3 289 decrypt_block v0, w3, x2, x6, w7
0cfd507c
AB
290 tbl v2.16b, {v0.16b}, v3.16b
291 eor v2.16b, v2.16b, v1.16b
dd597fb3
AB
292
/* Merge the stolen ciphertext tail into v0 before the second decrypt. */
293 tbx v0.16b, {v1.16b}, v4.16b
dd597fb3
AB
294 decrypt_block v0, w3, x2, x6, w7
295 eor v0.16b, v0.16b, v5.16b /* xor with iv */
296
297 add x4, x0, x4
298 st1 {v2.16b}, [x4] /* overlapping stores */
299 st1 {v0.16b}, [x0]
300 ret
b8e50548 301AES_FUNC_END(aes_cbc_cts_decrypt)
dd597fb3
AB
302
/*
 * Permute table for tbl/tbx in the CTS paths: indexing at an offset
 * selects between 0xff entries (tbl -> zero / tbx -> keep) and the
 * identity bytes 0x0..0xf, which shifts a partial block into position.
 */
303 .section ".rodata", "a"
304 .align 6
305.Lcts_permute_table:
306 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
307 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
308 .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
309 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
310 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
311 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
312 .previous
313
49788fe2 314 /*
23a251cc 315 * This macro generates the code for CTR and XCTR mode.
49788fe2 316 */
/*
 * Shared body for CTR (\xctr = 0) and XCTR (\xctr = 1) encryption.
 * Register roles are named with .req aliases below and released with
 * .unreq at the end so the macro can be expanded twice.  CTR keeps a
 * big-endian 64-bit counter in the low half of IV_PART and handles
 * carry into the high IV qword; XCTR XORs a little-endian block
 * counter (derived from BYTE_CTR_W) into the IV instead.
 */
23a251cc 317.macro ctr_encrypt xctr
c0eb7591
NH
318 // Arguments
319 OUT .req x0
320 IN .req x1
321 KEY .req x2
322 ROUNDS_W .req w3
323 BYTES_W .req w4
324 IV .req x5
325 BYTE_CTR_W .req w6 // XCTR only
326 // Intermediate values
327 CTR_W .req w11 // XCTR only
328 CTR .req x11 // XCTR only
329 IV_PART .req x12
330 BLOCKS .req x13
331 BLOCKS_W .req w13
332
7d709af1 333 frame_push 0
68338174 334
c0eb7591
NH
335 enc_prepare ROUNDS_W, KEY, IV_PART
336 ld1 {vctr.16b}, [IV]
11e3b725 337
c0eb7591
NH
338 /*
339 * Keep 64 bits of the IV in a register. For CTR mode this lets us
340 * easily increment the IV. For XCTR mode this lets us efficiently XOR
341 * the 64-bit counter with the IV.
342 */
23a251cc 343 .if \xctr
c0eb7591
NH
344 umov IV_PART, vctr.d[0]
345 lsr CTR_W, BYTE_CTR_W, #4
23a251cc 346 .else
c0eb7591
NH
347 umov IV_PART, vctr.d[1]
348 rev IV_PART, IV_PART
23a251cc 349 .endif
5318d3db 350

23a251cc 351.LctrloopNx\xctr:
c0eb7591
NH
/* BLOCKS_W = min(ceil(bytes / 16), MAX_STRIDE) for this iteration. */
352 add BLOCKS_W, BYTES_W, #15
353 sub BYTES_W, BYTES_W, #MAX_STRIDE << 4
354 lsr BLOCKS_W, BLOCKS_W, #4
5318d3db 355 mov w8, #MAX_STRIDE
c0eb7591
NH
356 cmp BLOCKS_W, w8
357 csel BLOCKS_W, BLOCKS_W, w8, lt
5318d3db 358

c0eb7591
NH
359 /*
360 * Set up the counter values in v0-v{MAX_STRIDE-1}.
361 *
362 * If we are encrypting less than MAX_STRIDE blocks, the tail block
363 * handling code expects the last keystream block to be in
364 * v{MAX_STRIDE-1}. For example: if encrypting two blocks with
365 * MAX_STRIDE=5, then v3 and v4 should have the next two counter blocks.
366 */
23a251cc 367 .if \xctr
c0eb7591 368 add CTR, CTR, BLOCKS
23a251cc 369 .else
c0eb7591 370 adds IV_PART, IV_PART, BLOCKS
23a251cc 371 .endif
7367bfeb 372 mov v0.16b, vctr.16b
7367bfeb 373 mov v1.16b, vctr.16b
7367bfeb 374 mov v2.16b, vctr.16b
7367bfeb 375 mov v3.16b, vctr.16b
7367bfeb 376ST5( mov v4.16b, vctr.16b )
23a251cc 377 .if \xctr
c0eb7591
NH
378 sub x6, CTR, #MAX_STRIDE - 1
379 sub x7, CTR, #MAX_STRIDE - 2
380 sub x8, CTR, #MAX_STRIDE - 3
381 sub x9, CTR, #MAX_STRIDE - 4
382ST5( sub x10, CTR, #MAX_STRIDE - 5 )
383 eor x6, x6, IV_PART
384 eor x7, x7, IV_PART
385 eor x8, x8, IV_PART
386 eor x9, x9, IV_PART
387ST5( eor x10, x10, IV_PART )
23a251cc
NH
388 mov v0.d[0], x6
389 mov v1.d[0], x7
390 mov v2.d[0], x8
391 mov v3.d[0], x9
392ST5( mov v4.d[0], x10 )
393 .else
394 bcs 0f
395 .subsection 1
c0eb7591
NH
396 /*
397 * This subsection handles carries.
398 *
399 * Conditional branching here is allowed with respect to time
400 * invariance since the branches are dependent on the IV instead
401 * of the plaintext or key. This code is rarely executed in
402 * practice anyway.
403 */
404
405 /* Apply carry to outgoing counter. */
23a251cc
NH
4060: umov x8, vctr.d[0]
407 rev x8, x8
408 add x8, x8, #1
409 rev x8, x8
410 ins vctr.d[0], x8
411
c0eb7591
NH
412 /*
413 * Apply carry to counter blocks if needed.
414 *
415 * Since the carry flag was set, we know 0 <= IV_PART <
416 * MAX_STRIDE. Using the value of IV_PART we can determine how
417 * many counter blocks need to be updated.
418 */
419 cbz IV_PART, 2f
/*
 * Computed branch into the mov sequence below: each entry is 8 bytes
 * (bti c + mov), hence the "lsl #3" scaling of IV_PART.
 */
23a251cc 420 adr x16, 1f
c0eb7591 421 sub x16, x16, IV_PART, lsl #3
23a251cc
NH
422 br x16
423 bti c
424 mov v0.d[0], vctr.d[0]
425 bti c
426 mov v1.d[0], vctr.d[0]
427 bti c
428 mov v2.d[0], vctr.d[0]
429 bti c
430 mov v3.d[0], vctr.d[0]
431ST5( bti c )
432ST5( mov v4.d[0], vctr.d[0] )
4331: b 2f
434 .previous
435
c0eb7591 4362: rev x7, IV_PART
23a251cc 437 ins vctr.d[1], x7
c0eb7591
NH
/* Back-fill the per-block counters (big-endian) into v1-v{MAX_STRIDE-1}. */
438 sub x7, IV_PART, #MAX_STRIDE - 1
439 sub x8, IV_PART, #MAX_STRIDE - 2
440 sub x9, IV_PART, #MAX_STRIDE - 3
23a251cc
NH
441 rev x7, x7
442 rev x8, x8
443 mov v1.d[1], x7
444 rev x9, x9
c0eb7591 445ST5( sub x10, IV_PART, #MAX_STRIDE - 4 )
23a251cc
NH
446 mov v2.d[1], x8
447ST5( rev x10, x10 )
448 mov v3.d[1], x9
449ST5( mov v4.d[1], x10 )
450 .endif
c0eb7591
NH
451
452 /*
453 * If there are at least MAX_STRIDE blocks left, XOR the data with
454 * keystream and store. Otherwise jump to tail handling.
455 */
456 tbnz BYTES_W, #31, .Lctrtail\xctr
457 ld1 {v5.16b-v7.16b}, [IN], #48
7367bfeb
AB
458ST4( bl aes_encrypt_block4x )
459ST5( bl aes_encrypt_block5x )
49788fe2 460 eor v0.16b, v5.16b, v0.16b
c0eb7591 461ST4( ld1 {v5.16b}, [IN], #16 )
49788fe2 462 eor v1.16b, v6.16b, v1.16b
c0eb7591 463ST5( ld1 {v5.16b-v6.16b}, [IN], #32 )
49788fe2
AB
464 eor v2.16b, v7.16b, v2.16b
465 eor v3.16b, v5.16b, v3.16b
7367bfeb 466ST5( eor v4.16b, v6.16b, v4.16b )
c0eb7591
NH
467 st1 {v0.16b-v3.16b}, [OUT], #64
468ST5( st1 {v4.16b}, [OUT], #16 )
469 cbz BYTES_W, .Lctrout\xctr
23a251cc 470 b .LctrloopNx\xctr
11e3b725 471

23a251cc
NH
472.Lctrout\xctr:
473 .if !\xctr
474 st1 {vctr.16b}, [IV] /* return next CTR value */
23a251cc 475 .endif
7d709af1 476 frame_pop
11e3b725
AB
477 ret
478

23a251cc 479.Lctrtail\xctr:
c0eb7591
NH
480 /*
481 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
482 *
483 * This code expects the last keystream block to be in v{MAX_STRIDE-1}.
484 * For example: if encrypting two blocks with MAX_STRIDE=5, then v3 and
485 * v4 should have the next two counter blocks.
486 *
487 * This allows us to store the ciphertext by writing to overlapping
488 * regions of memory. Any invalid ciphertext blocks get overwritten by
489 * correctly computed blocks. This approach greatly simplifies the
490 * logic for storing the ciphertext.
491 */
5318d3db 492 mov x16, #16
c0eb7591
NH
493 ands w7, BYTES_W, #0xf
494 csel x13, x7, x16, ne
5318d3db 495

/* x14/x15/x16 become per-block load/store increments: 16 or 0. */
c0eb7591 496ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4))
5318d3db 497ST5( csel x14, x16, xzr, gt )
c0eb7591 498 cmp BYTES_W, #48 - (MAX_STRIDE << 4)
5318d3db 499 csel x15, x16, xzr, gt
c0eb7591 500 cmp BYTES_W, #32 - (MAX_STRIDE << 4)
5318d3db 501 csel x16, x16, xzr, gt
c0eb7591 502 cmp BYTES_W, #16 - (MAX_STRIDE << 4)
5318d3db 503

c0eb7591
NH
504 adr_l x9, .Lcts_permute_table
505 add x9, x9, x13
23a251cc 506 ble .Lctrtail1x\xctr
5318d3db 507

c0eb7591
NH
508ST5( ld1 {v5.16b}, [IN], x14 )
509 ld1 {v6.16b}, [IN], x15
510 ld1 {v7.16b}, [IN], x16
5318d3db
AB
511
512ST4( bl aes_encrypt_block4x )
513ST5( bl aes_encrypt_block5x )
514
c0eb7591
NH
515 ld1 {v8.16b}, [IN], x13
516 ld1 {v9.16b}, [IN]
517 ld1 {v10.16b}, [x9]
5318d3db
AB
518
519ST4( eor v6.16b, v6.16b, v0.16b )
520ST4( eor v7.16b, v7.16b, v1.16b )
521ST4( tbl v3.16b, {v3.16b}, v10.16b )
522ST4( eor v8.16b, v8.16b, v2.16b )
523ST4( eor v9.16b, v9.16b, v3.16b )
524
525ST5( eor v5.16b, v5.16b, v0.16b )
526ST5( eor v6.16b, v6.16b, v1.16b )
527ST5( tbl v4.16b, {v4.16b}, v10.16b )
528ST5( eor v7.16b, v7.16b, v2.16b )
529ST5( eor v8.16b, v8.16b, v3.16b )
530ST5( eor v9.16b, v9.16b, v4.16b )
531
c0eb7591
NH
532ST5( st1 {v5.16b}, [OUT], x14 )
533 st1 {v6.16b}, [OUT], x15
534 st1 {v7.16b}, [OUT], x16
535 add x13, x13, OUT
5318d3db 536 st1 {v9.16b}, [x13] // overlapping stores
c0eb7591 537 st1 {v8.16b}, [OUT]
23a251cc 538 b .Lctrout\xctr
11e3b725 539

23a251cc 540.Lctrtail1x\xctr:
c0eb7591
NH
541 /*
542 * Handle <= 16 bytes of plaintext
543 *
544 * This code always reads and writes 16 bytes. To avoid out of bounds
545 * accesses, XCTR and CTR modes must use a temporary buffer when
546 * encrypting/decrypting less than 16 bytes.
547 *
548 * This code is unusual in that it loads the input and stores the output
549 * relative to the end of the buffers rather than relative to the start.
550 * This causes unusual behaviour when encrypting/decrypting less than 16
551 * bytes; the end of the data is expected to be at the end of the
552 * temporary buffer rather than the start of the data being at the start
553 * of the temporary buffer.
554 */
555 sub x8, x7, #16
556 csel x7, x7, x8, eq
557 add IN, IN, x7
558 add OUT, OUT, x7
559 ld1 {v5.16b}, [IN]
560 ld1 {v6.16b}, [OUT]
5318d3db 561ST5( mov v3.16b, v4.16b )
c0eb7591
NH
562 encrypt_block v3, ROUNDS_W, KEY, x8, w7
563 ld1 {v10.16b-v11.16b}, [x9]
8daa399e
AB
/* v11 (sign-extended mask) selects which output bytes keep old data. */
564 tbl v3.16b, {v3.16b}, v10.16b
565 sshr v11.16b, v11.16b, #7
5318d3db 566 eor v5.16b, v5.16b, v3.16b
8daa399e 567 bif v5.16b, v6.16b, v11.16b
c0eb7591 568 st1 {v5.16b}, [OUT]
23a251cc 569 b .Lctrout\xctr
c0eb7591
NH
570
571 // Arguments
572 .unreq OUT
573 .unreq IN
574 .unreq KEY
575 .unreq ROUNDS_W
576 .unreq BYTES_W
577 .unreq IV
578 .unreq BYTE_CTR_W // XCTR only
579 // Intermediate values
580 .unreq CTR_W // XCTR only
581 .unreq CTR // XCTR only
582 .unreq IV_PART
583 .unreq BLOCKS
584 .unreq BLOCKS_W
23a251cc
NH
585.endm
586
587 /*
588 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
589 * int bytes, u8 ctr[])
c0eb7591
NH
590 *
591 * The input and output buffers must always be at least 16 bytes even if
592 * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
593 * accesses will occur. The data to be encrypted/decrypted is expected
594 * to be at the end of this 16-byte temporary buffer rather than the
595 * start.
23a251cc
NH
596 */
597
/* CTR mode entry point: expand the shared macro with \xctr = 0. */
598AES_FUNC_START(aes_ctr_encrypt)
599 ctr_encrypt 0
b8e50548 600AES_FUNC_END(aes_ctr_encrypt)
49788fe2 601
23a251cc
NH
602 /*
603 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
604 * int bytes, u8 const iv[], int byte_ctr)
c0eb7591
NH
605 *
606 * The input and output buffers must always be at least 16 bytes even if
607 * encrypting/decrypting less than 16 bytes. Otherwise out of bounds
608 * accesses will occur. The data to be encrypted/decrypted is expected
609 * to be at the end of this 16-byte temporary buffer rather than the
610 * start.
23a251cc
NH
611 */
612
/* XCTR mode entry point: expand the shared macro with \xctr = 1. */
613AES_FUNC_START(aes_xctr_encrypt)
614 ctr_encrypt 1
615AES_FUNC_END(aes_xctr_encrypt)
616
49788fe2
AB
617
618 /*
7cceca8b
AB
619 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
620 * int bytes, u8 const rk2[], u8 iv[], int first)
49788fe2 621 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
7cceca8b 622 * int bytes, u8 const rk2[], u8 iv[], int first)
49788fe2
AB
623 */
624
/*
 * GF(2^128) doubling of the XTS tweak: \out = 2 * \in.  The sshr
 * broadcasts the sign bits, which are masked with xtsmask and rotated
 * (ext #8) so the reduction constant is folded in only on overflow.
 * \tmp is clobbered; \out may alias \in (used as "next_tweak v4,v4,v8").
 */
2e5d2f33 625 .macro next_tweak, out, in, tmp
49788fe2 626 sshr \tmp\().2d, \in\().2d, #63
2e5d2f33 627 and \tmp\().16b, \tmp\().16b, xtsmask.16b
49788fe2
AB
628 add \out\().2d, \in\().2d, \in\().2d
629 ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
630 eor \out\().16b, \out\().16b, \tmp\().16b
631 .endm
632
2e5d2f33
AB
633 .macro xts_load_mask, tmp
634 movi xtsmask.2s, #0x1
635 movi \tmp\().2s, #0x87
636 uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s
637 .endm
49788fe2 638
/*
 * XTS encryption.  x0 = out, x1 = in, x2 = rk1, w3 = #rounds,
 * w4 = bytes, x5 = rk2, x6 = iv, w7 = first (per prototype above).
 * On the first call the tweak is derived by encrypting the IV with
 * rk2; v4 carries the tweak, advanced via next_tweak.  The tail path
 * (.Lxtsenccts) implements ciphertext stealing for a final partial
 * block using .Lcts_permute_table and overlapping stores.
 */
b8e50548 639AES_FUNC_START(aes_xts_encrypt)
7d709af1 640 frame_push 0
55868b45 641
6e7de6af 642 ld1 {v4.16b}, [x6]
cc3cc489 643 xts_load_mask v8
68338174
AB
644 cbz w7, .Lxtsencnotfirst
645
646 enc_prepare w3, x5, x8
67cfa5d3 647 xts_cts_skip_tw w7, .LxtsencNx
68338174
AB
648 encrypt_block v4, w3, x5, x8, w7 /* first tweak */
649 enc_switch_key w3, x2, x8
49788fe2
AB
650 b .LxtsencNx
651
68338174 652.Lxtsencnotfirst:
6e7de6af 653 enc_prepare w3, x2, x8
49788fe2 654.LxtsencloopNx:
2e5d2f33 655 next_tweak v4, v4, v8
49788fe2 656.LxtsencNx:
7cceca8b 657 subs w4, w4, #64
49788fe2 658 bmi .Lxtsenc1x
6e7de6af 659 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
2e5d2f33 660 next_tweak v5, v4, v8
49788fe2 661 eor v0.16b, v0.16b, v4.16b
2e5d2f33 662 next_tweak v6, v5, v8
49788fe2
AB
663 eor v1.16b, v1.16b, v5.16b
664 eor v2.16b, v2.16b, v6.16b
2e5d2f33 665 next_tweak v7, v6, v8
49788fe2 666 eor v3.16b, v3.16b, v7.16b
55868b45 667 bl aes_encrypt_block4x
49788fe2
AB
/* Second tweak XOR completes the xor-encrypt-xor construction. */
668 eor v3.16b, v3.16b, v7.16b
669 eor v0.16b, v0.16b, v4.16b
670 eor v1.16b, v1.16b, v5.16b
671 eor v2.16b, v2.16b, v6.16b
6e7de6af 672 st1 {v0.16b-v3.16b}, [x0], #64
49788fe2 673 mov v4.16b, v7.16b
7cceca8b 674 cbz w4, .Lxtsencret
cc3cc489 675 xts_reload_mask v8
49788fe2 676 b .LxtsencloopNx
49788fe2 677.Lxtsenc1x:
7cceca8b 678 adds w4, w4, #64
49788fe2 679 beq .Lxtsencout
7cceca8b
AB
680 subs w4, w4, #16
681 bmi .LxtsencctsNx
49788fe2 682.Lxtsencloop:
7cceca8b
AB
683 ld1 {v0.16b}, [x1], #16
684.Lxtsencctsout:
685 eor v0.16b, v0.16b, v4.16b
6e7de6af 686 encrypt_block v0, w3, x2, x8, w7
49788fe2 687 eor v0.16b, v0.16b, v4.16b
7cceca8b
AB
688 cbz w4, .Lxtsencout
689 subs w4, w4, #16
2e5d2f33 690 next_tweak v4, v4, v8
7cceca8b
AB
691 bmi .Lxtsenccts
692 st1 {v0.16b}, [x0], #16
49788fe2
AB
693 b .Lxtsencloop
694.Lxtsencout:
7cceca8b
AB
695 st1 {v0.16b}, [x0]
696.Lxtsencret:
6e7de6af 697 st1 {v4.16b}, [x6]
7d709af1 698 frame_pop
49788fe2 699 ret
49788fe2 700
7cceca8b
AB
701.LxtsencctsNx:
702 mov v0.16b, v3.16b
703 sub x0, x0, #16
704.Lxtsenccts:
705 adr_l x8, .Lcts_permute_table
706
707 add x1, x1, w4, sxtw /* rewind input pointer */
708 add w4, w4, #16 /* # bytes in final block */
709 add x9, x8, #32
710 add x8, x8, x4
711 sub x9, x9, x4
712 add x4, x0, x4 /* output address of final block */
713
714 ld1 {v1.16b}, [x1] /* load final block */
715 ld1 {v2.16b}, [x8]
716 ld1 {v3.16b}, [x9]
717
718 tbl v2.16b, {v0.16b}, v2.16b
719 tbx v0.16b, {v1.16b}, v3.16b
720 st1 {v2.16b}, [x4] /* overlapping stores */
721 mov w4, wzr
722 b .Lxtsencctsout
b8e50548 723AES_FUNC_END(aes_xts_encrypt)
49788fe2 724
/*
 * XTS decryption; mirror of aes_xts_encrypt.  For ciphertext stealing
 * the final full block must be decrypted with the LAST tweak (v5,
 * computed ahead at .Lxtsdeccts), so the byte count is pre-adjusted
 * here (sub 16 when not a multiple of 16) to detect the CTS case.
 */
b8e50548 725AES_FUNC_START(aes_xts_decrypt)
7d709af1 726 frame_push 0
55868b45 727
7cceca8b
AB
728 /* subtract 16 bytes if we are doing CTS */
729 sub w8, w4, #0x10
730 tst w4, #0xf
731 csel w4, w4, w8, eq
732
6e7de6af 733 ld1 {v4.16b}, [x6]
cc3cc489 734 xts_load_mask v8
67cfa5d3 735 xts_cts_skip_tw w7, .Lxtsdecskiptw
68338174
AB
736 cbz w7, .Lxtsdecnotfirst
737
738 enc_prepare w3, x5, x8
739 encrypt_block v4, w3, x5, x8, w7 /* first tweak */
67cfa5d3 740.Lxtsdecskiptw:
68338174 741 dec_prepare w3, x2, x8
49788fe2
AB
742 b .LxtsdecNx
743
68338174 744.Lxtsdecnotfirst:
6e7de6af 745 dec_prepare w3, x2, x8
49788fe2 746.LxtsdecloopNx:
2e5d2f33 747 next_tweak v4, v4, v8
49788fe2 748.LxtsdecNx:
7cceca8b 749 subs w4, w4, #64
49788fe2 750 bmi .Lxtsdec1x
6e7de6af 751 ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
2e5d2f33 752 next_tweak v5, v4, v8
49788fe2 753 eor v0.16b, v0.16b, v4.16b
2e5d2f33 754 next_tweak v6, v5, v8
49788fe2
AB
755 eor v1.16b, v1.16b, v5.16b
756 eor v2.16b, v2.16b, v6.16b
2e5d2f33 757 next_tweak v7, v6, v8
49788fe2 758 eor v3.16b, v3.16b, v7.16b
55868b45 759 bl aes_decrypt_block4x
49788fe2
AB
760 eor v3.16b, v3.16b, v7.16b
761 eor v0.16b, v0.16b, v4.16b
762 eor v1.16b, v1.16b, v5.16b
763 eor v2.16b, v2.16b, v6.16b
6e7de6af 764 st1 {v0.16b-v3.16b}, [x0], #64
49788fe2 765 mov v4.16b, v7.16b
6e7de6af 766 cbz w4, .Lxtsdecout
cc3cc489 767 xts_reload_mask v8
49788fe2 768 b .LxtsdecloopNx
49788fe2 769.Lxtsdec1x:
7cceca8b 770 adds w4, w4, #64
49788fe2 771 beq .Lxtsdecout
7cceca8b 772 subs w4, w4, #16
49788fe2 773.Lxtsdecloop:
7cceca8b
AB
774 ld1 {v0.16b}, [x1], #16
775 bmi .Lxtsdeccts
776.Lxtsdecctsout:
777 eor v0.16b, v0.16b, v4.16b
6e7de6af 778 decrypt_block v0, w3, x2, x8, w7
49788fe2 779 eor v0.16b, v0.16b, v4.16b
6e7de6af 780 st1 {v0.16b}, [x0], #16
7cceca8b
AB
781 cbz w4, .Lxtsdecout
782 subs w4, w4, #16
2e5d2f33 783 next_tweak v4, v4, v8
49788fe2
AB
784 b .Lxtsdecloop
785.Lxtsdecout:
6e7de6af 786 st1 {v4.16b}, [x6]
7d709af1 787 frame_pop
49788fe2 788 ret
7cceca8b
AB
789
790.Lxtsdeccts:
791 adr_l x8, .Lcts_permute_table
792
793 add x1, x1, w4, sxtw /* rewind input pointer */
794 add w4, w4, #16 /* # bytes in final block */
795 add x9, x8, #32
796 add x8, x8, x4
797 sub x9, x9, x4
798 add x4, x0, x4 /* output address of final block */
799
/* Penultimate block uses tweak n+1; the stolen tail then uses tweak n. */
800 next_tweak v5, v4, v8
801
802 ld1 {v1.16b}, [x1] /* load final block */
803 ld1 {v2.16b}, [x8]
804 ld1 {v3.16b}, [x9]
805
806 eor v0.16b, v0.16b, v5.16b
807 decrypt_block v0, w3, x2, x8, w7
808 eor v0.16b, v0.16b, v5.16b
809
810 tbl v2.16b, {v0.16b}, v2.16b
811 tbx v0.16b, {v1.16b}, v3.16b
812
813 st1 {v2.16b}, [x4] /* overlapping stores */
814 mov w4, wzr
815 b .Lxtsdecctsout
b8e50548 816AES_FUNC_END(aes_xts_decrypt)
4860620d
AB
817
818 /*
819 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
820 * int blocks, u8 dg[], int enc_before, int enc_after)
821 */
/*
 * CBC-MAC / CMAC-style digest update.  Per the prototype above:
 * x0 = in, x1 = rk, w2 = #rounds, w3 = #blocks, x4 = dg (digest,
 * kept in v0), w5 = enc_before, w6 = enc_after.  May stop early via
 * cond_yield (kernel preemption); the number of unprocessed blocks
 * is returned in w0 so the caller can resume.
 */
b8e50548 822AES_FUNC_START(aes_mac_update)
f0070f4a 823 ld1 {v0.16b}, [x4] /* get dg */
4860620d 824 enc_prepare w2, x1, x7
870c163a 825 cbz w5, .Lmacloop4x
4860620d 826
870c163a
AB
827 encrypt_block v0, w2, x1, x7, w8
828
829.Lmacloop4x:
f0070f4a 830 subs w3, w3, #4
870c163a 831 bmi .Lmac1x
f0070f4a 832 ld1 {v1.16b-v4.16b}, [x0], #64 /* get next pt block */
870c163a 833 eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
f0070f4a 834 encrypt_block v0, w2, x1, x7, w8
870c163a 835 eor v0.16b, v0.16b, v2.16b
f0070f4a 836 encrypt_block v0, w2, x1, x7, w8
870c163a 837 eor v0.16b, v0.16b, v3.16b
f0070f4a 838 encrypt_block v0, w2, x1, x7, w8
870c163a 839 eor v0.16b, v0.16b, v4.16b
f0070f4a
AB
/* Last group and enc_after set -> encrypt once more before storing. */
840 cmp w3, wzr
841 csinv x5, x6, xzr, eq
870c163a 842 cbz w5, .Lmacout
f0070f4a
AB
843 encrypt_block v0, w2, x1, x7, w8
844 st1 {v0.16b}, [x4] /* return dg */
13150149 845 cond_yield .Lmacout, x7, x8
870c163a
AB
846 b .Lmacloop4x
847.Lmac1x:
f0070f4a 848 add w3, w3, #4
4860620d 849.Lmacloop:
f0070f4a
AB
850 cbz w3, .Lmacout
851 ld1 {v1.16b}, [x0], #16 /* get next pt block */
4860620d
AB
852 eor v0.16b, v0.16b, v1.16b /* ..and xor with dg */
853
f0070f4a
AB
854 subs w3, w3, #1
855 csinv x5, x6, xzr, eq
4860620d
AB
856 cbz w5, .Lmacout
857
0c8f838a 858.Lmacenc:
f0070f4a 859 encrypt_block v0, w2, x1, x7, w8
4860620d
AB
860 b .Lmacloop
861
862.Lmacout:
f0070f4a
AB
863 st1 {v0.16b}, [x4] /* return dg */
864 mov w0, w3
4860620d 865 ret
b8e50548 866AES_FUNC_END(aes_mac_update)