Commit | Line | Data |
---|---|---|
d2912cb1 | 1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
49788fe2 AB |
2 | /* |
3 | * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES | |
4 | * | |
4860620d | 5 | * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org> |
49788fe2 AB |
6 | */ |
7 | ||
8 | /* included by aes-ce.S and aes-neon.S */ | |
9 | ||
	.text
	.align		4

/*
 * MAX_STRIDE is the number of AES blocks processed per iteration of the
 * bulk loops: 4 by default, 5 when the including file (aes-ce.S or
 * aes-neon.S) defines it so before including this file.
 */
#ifndef MAX_STRIDE
#define MAX_STRIDE	4
#endif

/*
 * ST4()/ST5() emit their argument only when the corresponding stride is
 * selected, so interleave-width-specific instructions can sit inline.
 */
#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
24 | ||
/*
 * Encrypt v0-v3 in place (4 blocks interleaved).
 * w3 = # of rounds, x2 = round key array; x8/w7 are scratch.
 * encrypt_block4x is provided by the includer (aes-ce.S / aes-neon.S).
 */
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block4x)
49788fe2 | 29 | |
/*
 * Decrypt v0-v3 in place (4 blocks interleaved).
 * w3 = # of rounds, x2 = round key array; x8/w7 are scratch.
 */
SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block4x)
49788fe2 | 34 | |
#if MAX_STRIDE == 5
/*
 * 5-way interleaved variants, only built when the includer selects
 * MAX_STRIDE == 5. Encrypt/decrypt v0-v4 in place; w3 = # of rounds,
 * x2 = round key array; x8/w7 are scratch.
 */
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
	encrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_encrypt_block5x)

SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
	decrypt_block5x	v0, v1, v2, v3, v4, w3, x2, x8, w7
	ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif
46 | ||
	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */

AES_FUNC_START(aes_ecb_encrypt)
	frame_push	0

	enc_prepare	w3, x2, x5

.LecbencloopNx:
	/* bulk loop: process MAX_STRIDE blocks per iteration */
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
ST4(	bl		aes_encrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_encrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbencloopNx
.Lecbenc1x:
	/* tail: handle the remaining 0 .. MAX_STRIDE-1 blocks one by one */
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbencout
.Lecbencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_encrypt)
49788fe2 AB |
82 | |
83 | ||
AES_FUNC_START(aes_ecb_decrypt)
	frame_push	0

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
	/* bulk loop: process MAX_STRIDE blocks per iteration */
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lecbdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
ST4(	bl		aes_decrypt_block4x		)
ST5(	ld1		{v4.16b}, [x1], #16		)
ST5(	bl		aes_decrypt_block5x		)
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LecbdecloopNx
.Lecbdec1x:
	/* tail: handle the remaining 0 .. MAX_STRIDE-1 blocks one by one */
	adds		w4, w4, #MAX_STRIDE
	beq		.Lecbdecout
.Lecbdecloop:
	ld1		{v0.16b}, [x1], #16		/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	frame_pop
	ret
AES_FUNC_END(aes_ecb_decrypt)
49788fe2 AB |
112 | |
113 | ||
	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 * aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
	 *			 int rounds, int blocks, u8 iv[],
	 *			 u32 const rk2[]);
	 */

/*
 * ESSIV entry point: derives the real IV by encrypting the sector IV with
 * the second (AES-256) key in x6, then falls into the common CBC loop.
 */
AES_FUNC_START(aes_essiv_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	v4, w8, x6, x7, w9		/* v4 = E(k2, iv) */
	enc_switch_key	w3, x2, x6
	b		.Lcbcencloop4x

AES_FUNC_START(aes_cbc_encrypt)
	ld1		{v4.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

.Lcbcencloop4x:
	/*
	 * CBC encryption is inherently serial (each block depends on the
	 * previous ciphertext), so the 4x loop only amortizes load/store
	 * overhead; the encrypt_block calls still run back to back.
	 */
	subs		w4, w4, #4
	bmi		.Lcbcenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	eor		v0.16b, v0.16b, v4.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x6, w7
	eor		v1.16b, v1.16b, v0.16b
	encrypt_block	v1, w3, x2, x6, w7
	eor		v2.16b, v2.16b, v1.16b
	encrypt_block	v2, w3, x2, x6, w7
	eor		v3.16b, v3.16b, v2.16b
	encrypt_block	v3, w3, x2, x6, w7
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v3.16b			/* last ct is next iv */
	b		.Lcbcencloop4x
.Lcbcenc1x:
	adds		w4, w4, #4
	beq		.Lcbcencout
.Lcbcencloop:
	ld1		{v0.16b}, [x1], #16		/* get next pt block */
	eor		v4.16b, v4.16b, v0.16b		/* ..and xor with iv */
	encrypt_block	v4, w3, x2, x6, w7
	st1		{v4.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcencloop
.Lcbcencout:
	st1		{v4.16b}, [x5]			/* return iv */
	ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)
735177ca | 170 | |
/*
 * ESSIV entry point: derives the real IV by encrypting the sector IV with
 * the second (AES-256) key in x6, then joins the common CBC decrypt path.
 */
AES_FUNC_START(aes_essiv_cbc_decrypt)
	ld1		{cbciv.16b}, [x5]		/* get iv */

	mov		w8, #14				/* AES-256: 14 rounds */
	enc_prepare	w8, x6, x7
	encrypt_block	cbciv, w8, x6, x7, w9		/* cbciv = E(k2, iv) */
	b		.Lessivcbcdecstart

AES_FUNC_START(aes_cbc_decrypt)
	ld1		{cbciv.16b}, [x5]		/* get iv */
.Lessivcbcdecstart:
	frame_push	0
	dec_prepare	w3, x2, x6

.LcbcdecloopNx:
	/*
	 * CBC decryption parallelizes: all MAX_STRIDE blocks are decrypted
	 * at once, then XORed with the preceding ciphertext blocks (saved
	 * in v5-v7 / reloaded from the input) to recover the plaintext.
	 */
	subs		w4, w4, #MAX_STRIDE
	bmi		.Lcbcdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
#if MAX_STRIDE == 5
	ld1		{v4.16b}, [x1], #16		/* get 1 ct block */
	mov		v5.16b, v0.16b
	mov		v6.16b, v1.16b
	mov		v7.16b, v2.16b
	bl		aes_decrypt_block5x
	sub		x1, x1, #32
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [x1], #16		/* reload 1 ct block */
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v5.16b
#else
	mov		v4.16b, v0.16b
	mov		v5.16b, v1.16b
	mov		v6.16b, v2.16b
	bl		aes_decrypt_block4x
	sub		x1, x1, #16
	eor		v0.16b, v0.16b, cbciv.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{cbciv.16b}, [x1], #16		/* reload 1 ct block */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
#endif
	st1		{v0.16b-v3.16b}, [x0], #64
ST5(	st1		{v4.16b}, [x0], #16		)
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #MAX_STRIDE
	beq		.Lcbcdecout
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, cbciv.16b	/* xor with iv => pt */
	mov		cbciv.16b, v1.16b		/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	st1		{cbciv.16b}, [x5]		/* return iv */
	frame_pop
	ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)
49788fe2 AB |
236 | |
237 | ||
	/*
	 * aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 * aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *		       int rounds, int bytes, u8 const iv[])
	 *
	 * Handles only the final two blocks (16 < bytes <= 32) of a CBC-CTS
	 * message: ciphertext stealing via the .Lcts_permute_table and
	 * overlapping loads/stores.
	 */

AES_FUNC_START(aes_cbc_cts_encrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16		/* x4 = bytes in final, partial block */
	add		x9, x8, #32
	add		x8, x8, x4		/* permute vector for the stolen tail */
	sub		x9, x9, x4		/* permute vector to left-align the tail */
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4	/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x6

	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */
	tbl		v1.16b, {v1.16b}, v4.16b	/* pad tail with zero bytes */
	encrypt_block	v0, w3, x2, x6, w7

	eor		v1.16b, v1.16b, v0.16b
	tbl		v0.16b, {v0.16b}, v3.16b	/* keep only the stolen part */
	encrypt_block	v1, w3, x2, x6, w7

	add		x4, x0, x4
	st1		{v0.16b}, [x4]		/* overlapping stores */
	st1		{v1.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_encrypt)
dd597fb3 | 273 | |
AES_FUNC_START(aes_cbc_cts_decrypt)
	adr_l		x8, .Lcts_permute_table
	sub		x4, x4, #16		/* x4 = bytes in final, partial block */
	add		x9, x8, #32
	add		x8, x8, x4		/* permute vector for the stolen tail */
	sub		x9, x9, x4		/* permute vector to left-align the tail */
	ld1		{v3.16b}, [x8]
	ld1		{v4.16b}, [x9]

	ld1		{v0.16b}, [x1], x4	/* overlapping loads */
	ld1		{v1.16b}, [x1]

	ld1		{v5.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x6

	decrypt_block	v0, w3, x2, x6, w7
	tbl		v2.16b, {v0.16b}, v3.16b	/* stolen ct bytes -> v2 */
	eor		v2.16b, v2.16b, v1.16b

	tbx		v0.16b, {v1.16b}, v4.16b	/* merge tail ct into v0 */
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v5.16b		/* xor with iv */

	add		x4, x0, x4
	st1		{v2.16b}, [x4]		/* overlapping stores */
	st1		{v0.16b}, [x0]
	ret
AES_FUNC_END(aes_cbc_cts_decrypt)
dd597fb3 AB |
302 | |
	/*
	 * TBL/TBX permute table used by the CTS and CTR/XCTR tail code.
	 * Indexing into the middle of this table (offset by the tail length)
	 * yields a vector that selects the desired byte span; 0xff entries
	 * produce zero bytes under TBL and leave bytes untouched under TBX.
	 */
	.section	".rodata", "a"
	.align		6
.Lcts_permute_table:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.previous
313 | ||
	/*
	 * This macro generates the code for CTR and XCTR mode.
	 * \xctr selects the variant: 0 = CTR (big-endian counter kept in the
	 * IV itself), 1 = XCTR (little-endian block counter, derived from
	 * BYTE_CTR_W, XORed with the IV).
	 */
	.macro		ctr_encrypt xctr
	// Arguments
	OUT		.req x0
	IN		.req x1
	KEY		.req x2
	ROUNDS_W	.req w3
	BYTES_W		.req w4
	IV		.req x5
	BYTE_CTR_W	.req w6		// XCTR only
	// Intermediate values
	CTR_W		.req w11	// XCTR only
	CTR		.req x11	// XCTR only
	IV_PART		.req x12
	BLOCKS		.req x13
	BLOCKS_W	.req w13

	frame_push	0

	enc_prepare	ROUNDS_W, KEY, IV_PART
	ld1		{vctr.16b}, [IV]

	/*
	 * Keep 64 bits of the IV in a register.  For CTR mode this lets us
	 * easily increment the IV.  For XCTR mode this lets us efficiently XOR
	 * the 64-bit counter with the IV.
	 */
	.if \xctr
	umov		IV_PART, vctr.d[0]
	lsr		CTR_W, BYTE_CTR_W, #4	/* block counter = byte ctr / 16 */
	.else
	umov		IV_PART, vctr.d[1]
	rev		IV_PART, IV_PART	/* BE counter -> native for arithmetic */
	.endif

.LctrloopNx\xctr:
	add		BLOCKS_W, BYTES_W, #15
	sub		BYTES_W, BYTES_W, #MAX_STRIDE << 4
	lsr		BLOCKS_W, BLOCKS_W, #4	/* BLOCKS = min(ceil(bytes/16), MAX_STRIDE) */
	mov		w8, #MAX_STRIDE
	cmp		BLOCKS_W, w8
	csel		BLOCKS_W, BLOCKS_W, w8, lt

	/*
	 * Set up the counter values in v0-v{MAX_STRIDE-1}.
	 *
	 * If we are encrypting less than MAX_STRIDE blocks, the tail block
	 * handling code expects the last keystream block to be in
	 * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter
	 * blocks.
	 */
	.if \xctr
	add		CTR, CTR, BLOCKS
	.else
	adds		IV_PART, IV_PART, BLOCKS	/* carry flag feeds bcs below */
	.endif
	mov		v0.16b, vctr.16b
	mov		v1.16b, vctr.16b
	mov		v2.16b, vctr.16b
	mov		v3.16b, vctr.16b
ST5(	mov		v4.16b, vctr.16b		)
	.if \xctr
	sub		x6, CTR, #MAX_STRIDE - 1
	sub		x7, CTR, #MAX_STRIDE - 2
	sub		x8, CTR, #MAX_STRIDE - 3
	sub		x9, CTR, #MAX_STRIDE - 4
ST5(	sub		x10, CTR, #MAX_STRIDE - 5	)
	eor		x6, x6, IV_PART
	eor		x7, x7, IV_PART
	eor		x8, x8, IV_PART
	eor		x9, x9, IV_PART
ST5(	eor		x10, x10, IV_PART		)
	mov		v0.d[0], x6
	mov		v1.d[0], x7
	mov		v2.d[0], x8
	mov		v3.d[0], x9
ST5(	mov		v4.d[0], x10			)
	.else
	bcs		0f
	.subsection	1
	/*
	 * This subsection handles carries.
	 *
	 * Conditional branching here is allowed with respect to time
	 * invariance since the branches are dependent on the IV instead
	 * of the plaintext or key.  This code is rarely executed in
	 * practice anyway.
	 */

	/* Apply carry to outgoing counter. */
0:	umov		x8, vctr.d[0]
	rev		x8, x8
	add		x8, x8, #1
	rev		x8, x8
	ins		vctr.d[0], x8

	/*
	 * Apply carry to counter blocks if needed.
	 *
	 * Since the carry flag was set, we know 0 <= IV_PART <
	 * MAX_STRIDE.  Using the value of IV_PART we can determine how
	 * many counter blocks need to be updated.
	 */
	cbz		IV_PART, 2f
	adr		x16, 1f
	sub		x16, x16, IV_PART, lsl #3	/* computed goto: 8 bytes per bti+mov pair */
	br		x16
	bti		c
	mov		v0.d[0], vctr.d[0]
	bti		c
	mov		v1.d[0], vctr.d[0]
	bti		c
	mov		v2.d[0], vctr.d[0]
	bti		c
	mov		v3.d[0], vctr.d[0]
ST5(	bti		c				)
ST5(	mov		v4.d[0], vctr.d[0]		)
1:	b		2f
	.previous

2:	rev		x7, IV_PART
	ins		vctr.d[1], x7		/* write back incremented BE counter */
	sub		x7, IV_PART, #MAX_STRIDE - 1
	sub		x8, IV_PART, #MAX_STRIDE - 2
	sub		x9, IV_PART, #MAX_STRIDE - 3
	rev		x7, x7
	rev		x8, x8
	mov		v1.d[1], x7
	rev		x9, x9
ST5(	sub		x10, IV_PART, #MAX_STRIDE - 4	)
	mov		v2.d[1], x8
ST5(	rev		x10, x10			)
	mov		v3.d[1], x9
ST5(	mov		v4.d[1], x10			)
	.endif

	/*
	 * If there are at least MAX_STRIDE blocks left, XOR the data with
	 * keystream and store.  Otherwise jump to tail handling.
	 */
	tbnz		BYTES_W, #31, .Lctrtail\xctr
	ld1		{v5.16b-v7.16b}, [IN], #48
ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)
	eor		v0.16b, v5.16b, v0.16b
ST4(	ld1		{v5.16b}, [IN], #16		)
	eor		v1.16b, v6.16b, v1.16b
ST5(	ld1		{v5.16b-v6.16b}, [IN], #32	)
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
ST5(	eor		v4.16b, v6.16b, v4.16b		)
	st1		{v0.16b-v3.16b}, [OUT], #64
ST5(	st1		{v4.16b}, [OUT], #16		)
	cbz		BYTES_W, .Lctrout\xctr
	b		.LctrloopNx\xctr

.Lctrout\xctr:
	.if !\xctr
	st1		{vctr.16b}, [IV]	/* return next CTR value */
	.endif
	frame_pop
	ret

.Lctrtail\xctr:
	/*
	 * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
	 *
	 * This code expects the last keystream block to be in
	 * v{MAX_STRIDE-1}.  For example: if encrypting two blocks with
	 * MAX_STRIDE=5, then v3 and v4 should have the next two counter
	 * blocks.
	 *
	 * This allows us to store the ciphertext by writing to overlapping
	 * regions of memory.  Any invalid ciphertext blocks get overwritten
	 * by correctly computed blocks.  This approach greatly simplifies
	 * the logic for storing the ciphertext.
	 */
	mov		x16, #16
	ands		w7, BYTES_W, #0xf
	csel		x13, x7, x16, ne	/* x13 = bytes in the partial block (16 if none) */

ST5(	cmp		BYTES_W, #64 - (MAX_STRIDE << 4))
ST5(	csel		x14, x16, xzr, gt	)
	cmp		BYTES_W, #48 - (MAX_STRIDE << 4)
	csel		x15, x16, xzr, gt
	cmp		BYTES_W, #32 - (MAX_STRIDE << 4)
	csel		x16, x16, xzr, gt
	cmp		BYTES_W, #16 - (MAX_STRIDE << 4)

	adr_l		x9, .Lcts_permute_table
	add		x9, x9, x13
	ble		.Lctrtail1x\xctr

ST5(	ld1		{v5.16b}, [IN], x14		)
	ld1		{v6.16b}, [IN], x15
	ld1		{v7.16b}, [IN], x16

ST4(	bl		aes_encrypt_block4x		)
ST5(	bl		aes_encrypt_block5x		)

	ld1		{v8.16b}, [IN], x13
	ld1		{v9.16b}, [IN]
	ld1		{v10.16b}, [x9]

ST4(	eor		v6.16b, v6.16b, v0.16b		)
ST4(	eor		v7.16b, v7.16b, v1.16b		)
ST4(	tbl		v3.16b, {v3.16b}, v10.16b	)
ST4(	eor		v8.16b, v8.16b, v2.16b		)
ST4(	eor		v9.16b, v9.16b, v3.16b		)

ST5(	eor		v5.16b, v5.16b, v0.16b		)
ST5(	eor		v6.16b, v6.16b, v1.16b		)
ST5(	tbl		v4.16b, {v4.16b}, v10.16b	)
ST5(	eor		v7.16b, v7.16b, v2.16b		)
ST5(	eor		v8.16b, v8.16b, v3.16b		)
ST5(	eor		v9.16b, v9.16b, v4.16b		)

ST5(	st1		{v5.16b}, [OUT], x14		)
	st1		{v6.16b}, [OUT], x15
	st1		{v7.16b}, [OUT], x16
	add		x13, x13, OUT
	st1		{v9.16b}, [x13]		// overlapping stores
	st1		{v8.16b}, [OUT]
	b		.Lctrout\xctr

.Lctrtail1x\xctr:
	/*
	 * Handle <= 16 bytes of plaintext
	 *
	 * This code always reads and writes 16 bytes.  To avoid out of
	 * bounds accesses, XCTR and CTR modes must use a temporary buffer
	 * when encrypting/decrypting less than 16 bytes.
	 *
	 * This code is unusual in that it loads the input and stores the
	 * output relative to the end of the buffers rather than relative to
	 * the start.  This causes unusual behaviour when encrypting/
	 * decrypting less than 16 bytes; the end of the data is expected to
	 * be at the end of the temporary buffer rather than the start of
	 * the data being at the start of the temporary buffer.
	 */
	sub		x8, x7, #16
	csel		x7, x7, x8, eq
	add		IN, IN, x7
	add		OUT, OUT, x7
	ld1		{v5.16b}, [IN]
	ld1		{v6.16b}, [OUT]
ST5(	mov		v3.16b, v4.16b			)
	encrypt_block	v3, ROUNDS_W, KEY, x8, w7
	ld1		{v10.16b-v11.16b}, [x9]
	tbl		v3.16b, {v3.16b}, v10.16b	/* align keystream with data */
	sshr		v11.16b, v11.16b, #7		/* byte mask for valid lanes */
	eor		v5.16b, v5.16b, v3.16b
	bif		v5.16b, v6.16b, v11.16b		/* keep old bytes outside the mask */
	st1		{v5.16b}, [OUT]
	b		.Lctrout\xctr

	// Arguments
	.unreq OUT
	.unreq IN
	.unreq KEY
	.unreq ROUNDS_W
	.unreq BYTES_W
	.unreq IV
	.unreq BYTE_CTR_W	// XCTR only
	// Intermediate values
	.unreq CTR_W		// XCTR only
	.unreq CTR		// XCTR only
	.unreq IV_PART
	.unreq BLOCKS
	.unreq BLOCKS_W
	.endm
586 | ||
	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int bytes, u8 ctr[])
	 *
	 * The input and output buffers must always be at least 16 bytes even if
	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
	 * accesses will occur.  The data to be encrypted/decrypted is expected
	 * to be at the end of this 16-byte temporary buffer rather than the
	 * start.
	 */

AES_FUNC_START(aes_ctr_encrypt)
	ctr_encrypt	0	/* CTR variant of the shared macro */
AES_FUNC_END(aes_ctr_encrypt)
49788fe2 | 601 | |
	/*
	 * aes_xctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		    int bytes, u8 const iv[], int byte_ctr)
	 *
	 * The input and output buffers must always be at least 16 bytes even if
	 * encrypting/decrypting less than 16 bytes.  Otherwise out of bounds
	 * accesses will occur.  The data to be encrypted/decrypted is expected
	 * to be at the end of this 16-byte temporary buffer rather than the
	 * start.
	 */

AES_FUNC_START(aes_xctr_encrypt)
	ctr_encrypt	1	/* XCTR variant of the shared macro */
AES_FUNC_END(aes_xctr_encrypt)
616 | ||
49788fe2 AB |
617 | |
	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int bytes, u8 const rk2[], u8 iv[], int first)
	 */

	/*
	 * Compute the next XTS tweak: multiply \in by x in GF(2^128), i.e.
	 * shift left by one bit and conditionally fold in the reduction
	 * polynomial (0x87) held in xtsmask when the top bit was set.
	 */
	.macro		next_tweak, out, in, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	/* Materialize the GF(2^128) reduction mask {0x1, 0x87, 0x1, 0x87} in xtsmask. */
	.macro		xts_load_mask, tmp
	movi		xtsmask.2s, #0x1
	movi		\tmp\().2s, #0x87
	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
	.endm
49788fe2 | 638 | |
AES_FUNC_START(aes_xts_encrypt)
	frame_push	0

	ld1		{v4.16b}, [x6]		/* v4 = current tweak */
	xts_load_mask	v8
	cbz		w7, .Lxtsencnotfirst

	/* first call: encrypt the IV with the tweak key (x5) */
	enc_prepare	w3, x5, x8
	xts_cts_skip_tw	w7, .LxtsencNx
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
	enc_switch_key	w3, x2, x8
	b		.LxtsencNx

.Lxtsencnotfirst:
	enc_prepare	w3, x2, x8
.LxtsencloopNx:
	next_tweak	v4, v4, v8
.LxtsencNx:
	/* bulk loop: 4 blocks per iteration, tweaks in v4-v7 */
	subs		w4, w4, #64
	bmi		.Lxtsenc1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsencret
	xts_reload_mask	v8
	b		.LxtsencloopNx
.Lxtsenc1x:
	adds		w4, w4, #64
	beq		.Lxtsencout
	subs		w4, w4, #16
	bmi		.LxtsencctsNx		/* < 16 bytes left: steal from last full block */
.Lxtsencloop:
	ld1		{v0.16b}, [x1], #16
.Lxtsencctsout:
	eor		v0.16b, v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	cbz		w4, .Lxtsencout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	bmi		.Lxtsenccts
	st1		{v0.16b}, [x0], #16
	b		.Lxtsencloop
.Lxtsencout:
	st1		{v0.16b}, [x0]
.Lxtsencret:
	st1		{v4.16b}, [x6]		/* return tweak */
	frame_pop
	ret

.LxtsencctsNx:
	mov		v0.16b, v3.16b
	sub		x0, x0, #16
.Lxtsenccts:
	/* ciphertext stealing for the final partial block */
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	tbl		v2.16b, {v0.16b}, v2.16b	/* steal ct bytes for the tail */
	tbx		v0.16b, {v1.16b}, v3.16b	/* merge final pt into v0 */
	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)
49788fe2 | 724 | |
AES_FUNC_START(aes_xts_decrypt)
	frame_push	0

	/* subtract 16 bytes if we are doing CTS */
	sub		w8, w4, #0x10
	tst		w4, #0xf
	csel		w4, w4, w8, eq

	ld1		{v4.16b}, [x6]		/* v4 = current tweak */
	xts_load_mask	v8
	xts_cts_skip_tw	w7, .Lxtsdecskiptw
	cbz		w7, .Lxtsdecnotfirst

	/* first call: encrypt the IV with the tweak key (x5) */
	enc_prepare	w3, x5, x8
	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
.Lxtsdecskiptw:
	dec_prepare	w3, x2, x8
	b		.LxtsdecNx

.Lxtsdecnotfirst:
	dec_prepare	w3, x2, x8
.LxtsdecloopNx:
	next_tweak	v4, v4, v8
.LxtsdecNx:
	/* bulk loop: 4 blocks per iteration, tweaks in v4-v7 */
	subs		w4, w4, #64
	bmi		.Lxtsdec1x
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v8
	eor		v0.16b, v0.16b, v4.16b
	next_tweak	v6, v5, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v8
	eor		v3.16b, v3.16b, v7.16b
	bl		aes_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b
	cbz		w4, .Lxtsdecout
	xts_reload_mask	v8
	b		.LxtsdecloopNx
.Lxtsdec1x:
	adds		w4, w4, #64
	beq		.Lxtsdecout
	subs		w4, w4, #16
.Lxtsdecloop:
	ld1		{v0.16b}, [x1], #16
	bmi		.Lxtsdeccts
.Lxtsdecctsout:
	eor		v0.16b, v0.16b, v4.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	cbz		w4, .Lxtsdecout
	subs		w4, w4, #16
	next_tweak	v4, v4, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	st1		{v4.16b}, [x6]		/* return tweak */
	frame_pop
	ret

.Lxtsdeccts:
	/* ciphertext stealing for the final partial block */
	adr_l		x8, .Lcts_permute_table

	add		x1, x1, w4, sxtw	/* rewind input pointer */
	add		w4, w4, #16		/* # bytes in final block */
	add		x9, x8, #32
	add		x8, x8, x4
	sub		x9, x9, x4
	add		x4, x0, x4		/* output address of final block */

	next_tweak	v5, v4, v8		/* decrypt uses tweak n+1 first */

	ld1		{v1.16b}, [x1]		/* load final block */
	ld1		{v2.16b}, [x8]
	ld1		{v3.16b}, [x9]

	eor		v0.16b, v0.16b, v5.16b
	decrypt_block	v0, w3, x2, x8, w7
	eor		v0.16b, v0.16b, v5.16b

	tbl		v2.16b, {v0.16b}, v2.16b	/* steal pt bytes for the tail */
	tbx		v0.16b, {v1.16b}, v3.16b	/* merge final ct into v0 */

	st1		{v2.16b}, [x4]			/* overlapping stores */
	mov		w4, wzr
	b		.Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)
4860620d AB |
817 | |
	/*
	 * aes_mac_update(u8 const in[], u32 const rk[], int rounds,
	 *		  int blocks, u8 dg[], int enc_before, int enc_after)
	 *
	 * CBC-MAC style digest update: XOR each input block into the digest
	 * and encrypt.  enc_before forces an initial encryption of the
	 * digest; enc_after controls whether the final XORed block is
	 * encrypted before returning.  Returns (in w0) the number of blocks
	 * left unprocessed when the loop yields early (see cond_yield).
	 */
AES_FUNC_START(aes_mac_update)
	ld1		{v0.16b}, [x4]			/* get dg */
	enc_prepare	w2, x1, x7
	cbz		w5, .Lmacloop4x

	encrypt_block	v0, w2, x1, x7, w8

.Lmacloop4x:
	subs		w3, w3, #4
	bmi		.Lmac1x
	ld1		{v1.16b-v4.16b}, [x0], #64	/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v2.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v3.16b
	encrypt_block	v0, w2, x1, x7, w8
	eor		v0.16b, v0.16b, v4.16b
	cmp		w3, wzr
	csinv		x5, x6, xzr, eq		/* last block: respect enc_after */
	cbz		w5, .Lmacout
	encrypt_block	v0, w2, x1, x7, w8
	st1		{v0.16b}, [x4]			/* return dg */
	cond_yield	.Lmacout, x7, x8	/* may bail out with blocks left in w3 */
	b		.Lmacloop4x
.Lmac1x:
	add		w3, w3, #4
.Lmacloop:
	cbz		w3, .Lmacout
	ld1		{v1.16b}, [x0], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with dg */

	subs		w3, w3, #1
	csinv		x5, x6, xzr, eq		/* last block: respect enc_after */
	cbz		w5, .Lmacout

.Lmacenc:
	encrypt_block	v0, w2, x1, x7, w8
	b		.Lmacloop

.Lmacout:
	st1		{v0.16b}, [x4]			/* return dg */
	mov		w0, w3
	ret
AES_FUNC_END(aes_mac_update)