Commit | Line | Data |
---|---|---|
d2912cb1 | 1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
86464859 AB |
2 | /* |
3 | * aes-ce-core.S - AES in ECB/CBC/CTS/CTR/XTS mode using ARMv8 Crypto Extensions | |
4 | * | |
5 | * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> | |
86464859 AB |
6 | */ |
7 | ||
8 | #include <linux/linkage.h> | |
9 | #include <asm/assembler.h> | |
10 | ||
11 | .text | |
3aa6d4ab | 12 | .arch armv8-a |
86464859 AB |
13 | .fpu crypto-neon-fp-armv8 |
14 | .align 3 | |
15 | ||
/*
 * enc_round - one full AES encryption round on \state with round key \key.
 * aese performs AddRoundKey + SubBytes + ShiftRows, aesmc performs MixColumns.
 */
16 | .macro enc_round, state, key | |
17 | aese.8 \state, \key | |
18 | aesmc.8 \state, \state | |
19 | .endm | |
20 | ||
/*
 * dec_round - one full AES decryption round on \state with round key \key.
 * aesd performs AddRoundKey + InvSubBytes + InvShiftRows, aesimc performs
 * InvMixColumns.
 */
21 | .macro dec_round, state, key | |
22 | aesd.8 \state, \key | |
23 | aesimc.8 \state, \state | |
24 | .endm | |
25 | ||
/*
 * enc_dround - "double round": two consecutive encryption rounds on the
 * single block in q0, using \key1 and then \key2.
 */
26 | .macro enc_dround, key1, key2 | |
27 | enc_round q0, \key1 | |
28 | enc_round q0, \key2 | |
29 | .endm | |
30 | ||
/*
 * dec_dround - "double round": two consecutive decryption rounds on the
 * single block in q0, using \key1 and then \key2.
 */
31 | .macro dec_dround, key1, key2 | |
32 | dec_round q0, \key1 | |
33 | dec_round q0, \key2 | |
34 | .endm | |
35 | ||
/*
 * enc_fround - final rounds of an encryption of the block in q0:
 * one full round with \key1, one round without MixColumns with \key2,
 * then the closing AddRoundKey with \key3 done as a plain XOR.
 */
36 | .macro enc_fround, key1, key2, key3 | |
37 | enc_round q0, \key1 | |
38 | aese.8 q0, \key2 | |
39 | veor q0, q0, \key3 | |
40 | .endm | |
41 | ||
/*
 * dec_fround - final rounds of a decryption of the block in q0:
 * one full round with \key1, one round without InvMixColumns with \key2,
 * then the closing AddRoundKey with \key3 done as a plain XOR.
 */
42 | .macro dec_fround, key1, key2, key3 | |
43 | dec_round q0, \key1 | |
44 | aesd.8 q0, \key2 | |
45 | veor q0, q0, \key3 | |
46 | .endm | |
47 | ||
/*
 * enc_dround_4x - four-block variant of enc_dround: applies the \key1 round
 * to q0..q3, then the \key2 round to q0..q3.
 */
1dede02b | 48 | .macro enc_dround_4x, key1, key2 |
86464859 AB |
49 | enc_round q0, \key1 |
50 | enc_round q1, \key1 | |
51 | enc_round q2, \key1 | |
1dede02b | 52 | enc_round q3, \key1 |
86464859 AB |
53 | enc_round q0, \key2 |
54 | enc_round q1, \key2 | |
55 | enc_round q2, \key2 | |
1dede02b | 56 | enc_round q3, \key2 |
86464859 AB |
57 | .endm |
58 | ||
/*
 * dec_dround_4x - four-block variant of dec_dround: applies the \key1 round
 * to q0..q3, then the \key2 round to q0..q3.
 */
1dede02b | 59 | .macro dec_dround_4x, key1, key2 |
86464859 AB |
60 | dec_round q0, \key1 |
61 | dec_round q1, \key1 | |
62 | dec_round q2, \key1 | |
1dede02b | 63 | dec_round q3, \key1 |
86464859 AB |
64 | dec_round q0, \key2 |
65 | dec_round q1, \key2 | |
66 | dec_round q2, \key2 | |
1dede02b | 67 | dec_round q3, \key2 |
86464859 AB |
68 | .endm |
69 | ||
/*
 * enc_fround_4x - four-block variant of enc_fround. For each of q0..q3:
 * one full round with \key1, one round without MixColumns with \key2,
 * then XOR with the final round key \key3.
 */
1dede02b | 70 | .macro enc_fround_4x, key1, key2, key3 |
86464859 AB |
71 | enc_round q0, \key1 |
72 | enc_round q1, \key1 | |
73 | enc_round q2, \key1 | |
1dede02b | 74 | enc_round q3, \key1 |
86464859 AB |
75 | aese.8 q0, \key2 |
76 | aese.8 q1, \key2 | |
77 | aese.8 q2, \key2 | |
1dede02b | 78 | aese.8 q3, \key2 |
86464859 AB |
79 | veor q0, q0, \key3 |
80 | veor q1, q1, \key3 | |
81 | veor q2, q2, \key3 | |
1dede02b | 82 | veor q3, q3, \key3 |
86464859 AB |
83 | .endm |
84 | ||
/*
 * dec_fround_4x - four-block variant of dec_fround. For each of q0..q3:
 * one full round with \key1, one round without InvMixColumns with \key2,
 * then XOR with the final round key \key3.
 */
1dede02b | 85 | .macro dec_fround_4x, key1, key2, key3 |
86464859 AB |
86 | dec_round q0, \key1 |
87 | dec_round q1, \key1 | |
88 | dec_round q2, \key1 | |
1dede02b | 89 | dec_round q3, \key1 |
86464859 AB |
90 | aesd.8 q0, \key2 |
91 | aesd.8 q1, \key2 | |
92 | aesd.8 q2, \key2 | |
1dede02b | 93 | aesd.8 q3, \key2 |
86464859 AB |
94 | veor q0, q0, \key3 |
95 | veor q1, q1, \key3 | |
96 | veor q2, q2, \key3 | |
1dede02b | 97 | veor q3, q3, \key3 |
86464859 AB |
98 | .endm |
99 | ||
/*
 * do_block - body of the internal AES transform functions.
 *
 * \dround is the double-round macro and \fround the final-round macro to use
 * (single-block or _4x, encrypt or decrypt flavour).
 *
 * On entry:
 *   q8, q9 : first two round keys
 *   q14    : value XORed in by the final round (normally the last round key)
 *   ip     : address of the 3rd round key
 *   r3     : number of rounds, compared against 12 to pick the key size
 *
 * The remaining round keys are streamed from [ip] into q10-q13, two at a
 * time, alternating between the q10/q11 and q12/q13 pairs. ip is advanced by
 * the post-incrementing loads. The macro ends in 'bx lr' on both exit paths,
 * so it forms the complete body of the function that instantiates it.
 *
 * Clobbers: q10-q13, ip, the condition flags, plus the data registers that
 * \dround and \fround operate on.
 */
100 | .macro do_block, dround, fround | |
/* The flags set here stay live until the blo/beq below: nothing in between writes them. */
101 | cmp r3, #12 @ which key size? | |
fafb1dca | 102 | vld1.32 {q10-q11}, [ip]! |
86464859 | 103 | \dround q8, q9 |
fafb1dca | 104 | vld1.32 {q12-q13}, [ip]! |
86464859 | 105 | \dround q10, q11 |
fafb1dca | 106 | vld1.32 {q10-q11}, [ip]! |
86464859 | 107 | \dround q12, q13 |
fafb1dca | 108 | vld1.32 {q12-q13}, [ip]! |
86464859 AB |
109 | \dround q10, q11 |
110 | blo 0f @ AES-128: 10 rounds | |
fafb1dca | 111 | vld1.32 {q10-q11}, [ip]! |
86464859 | 112 | \dround q12, q13 |
6499e8cf | 113 | beq 1f @ AES-192: 12 rounds |
/* Neither branch taken: r3 > 12, so two more rounds are run before the final ones. */
fafb1dca | 114 | vld1.32 {q12-q13}, [ip] |
86464859 AB |
115 | \dround q10, q11 |
116 | 0: \fround q12, q13, q14 | |
117 | bx lr | |
118 | ||
6499e8cf | 119 | 1: \fround q10, q11, q14 |
86464859 AB |
120 | bx lr |
121 | .endm | |
122 | ||
123 | /* | |
124 | * Internal, non-AAPCS compliant functions that implement the core AES | |
125 | * transforms. These should preserve all registers except q0 - q3, q10 - q13, ip and the condition flags | |
126 | * Arguments: | |
127 | * q0 : first in/output block | |
1dede02b AB |
128 | * q1 : second in/output block (_4x version only) |
129 | * q2 : third in/output block (_4x version only) | |
130 | * q3 : fourth in/output block (_4x version only) | |
86464859 AB |
131 | * q8 : first round key |
132 | * q9 : second round key | |
86464859 | 133 | * q14 : final round key |
6499e8cf | 134 | * r2 : address of round key array |
86464859 AB |
135 | * r3 : number of rounds |
136 | */ | |
137 | .align 6 | |
/*
 * aes_encrypt - encrypt the single block in q0 in place.
 * Expects q8/q9/q14 to be set up (see prepare_key) and r2/r3 to hold the
 * round key array address and the number of rounds.
 */
138 | aes_encrypt: | |
139 | add ip, r2, #32 @ 3rd round key | |
/*
 * Alternate entry point that skips the ip setup above: used by
 * ce_aes_xts_init, which points ip into a different key schedule.
 */
140 | .Laes_encrypt_tweak: | |
141 | do_block enc_dround, enc_fround | |
142 | ENDPROC(aes_encrypt) | |
143 | ||
144 | .align 6 | |
/*
 * aes_decrypt - decrypt the single block in q0 in place.
 * Expects q8/q9/q14 to be set up (see prepare_key) and r2/r3 to hold the
 * round key array address and the number of rounds.
 */
145 | aes_decrypt: | |
146 | add ip, r2, #32 @ 3rd round key | |
147 | do_block dec_dround, dec_fround | |
148 | ENDPROC(aes_decrypt) | |
149 | ||
150 | .align 6 | |
/*
 * aes_encrypt_4x - encrypt the four blocks in q0-q3 in place with the same
 * key schedule. Same register contract as aes_encrypt.
 */
1dede02b | 151 | aes_encrypt_4x: |
86464859 | 152 | add ip, r2, #32 @ 3rd round key |
1dede02b AB |
153 | do_block enc_dround_4x, enc_fround_4x |
154 | ENDPROC(aes_encrypt_4x) | |
86464859 AB |
155 | |
156 | .align 6 | |
/*
 * aes_decrypt_4x - decrypt the four blocks in q0-q3 in place with the same
 * key schedule. Same register contract as aes_decrypt.
 */
1dede02b | 157 | aes_decrypt_4x: |
86464859 | 158 | add ip, r2, #32 @ 3rd round key |
1dede02b AB |
159 | do_block dec_dround_4x, dec_fround_4x |
160 | ENDPROC(aes_decrypt_4x) | |
86464859 AB |
161 | |
/*
 * prepare_key - load the round keys that stay resident across block calls.
 *   \rk     : register holding the address of the round key array
 *   \rounds : register holding the number of rounds
 * Loads the first two round keys into q8/q9 and the key at
 * \rk + 16 * \rounds (the last one) into q14. Clobbers ip.
 */
162 | .macro prepare_key, rk, rounds | |
163 | add ip, \rk, \rounds, lsl #4 | |
fafb1dca AB |
164 | vld1.32 {q8-q9}, [\rk] @ load first 2 round keys |
165 | vld1.32 {q14}, [ip] @ load last round key | |
86464859 AB |
166 | .endm |
167 | ||
168 | /* | |
fcb0e30d | 169 | * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, |
86464859 | 170 | * int blocks) |
fcb0e30d | 171 | * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, |
86464859 AB |
172 | * int blocks) |
173 | */ | |
/*
 * ECB encryption.
 * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
 * r4 = blocks remaining (5th argument, read from the stack).
 * Blocks are processed four at a time while at least four remain, then one
 * at a time.
 */
174 | ENTRY(ce_aes_ecb_encrypt) | |
175 | push {r4, lr} | |
/* Two registers were pushed, so the first stack argument is at sp + 8. */
176 | ldr r4, [sp, #8] | |
177 | prepare_key r2, r3 | |
1dede02b AB |
178 | .Lecbencloop4x: |
179 | subs r4, r4, #4 | |
/* Fewer than 4 blocks left: r4 went negative, fall back to the 1x loop. */
86464859 | 180 | bmi .Lecbenc1x |
1465fb13 | 181 | vld1.8 {q0-q1}, [r1]! |
1dede02b AB |
182 | vld1.8 {q2-q3}, [r1]! |
183 | bl aes_encrypt_4x | |
1465fb13 | 184 | vst1.8 {q0-q1}, [r0]! |
1dede02b AB |
185 | vst1.8 {q2-q3}, [r0]! |
186 | b .Lecbencloop4x | |
86464859 | 187 | .Lecbenc1x: |
/* Undo the speculative subtraction; r4 is now the 0..3 leftover blocks. */
1dede02b | 188 | adds r4, r4, #4 |
86464859 AB |
189 | beq .Lecbencout |
190 | .Lecbencloop: | |
1465fb13 | 191 | vld1.8 {q0}, [r1]! |
86464859 | 192 | bl aes_encrypt |
1465fb13 | 193 | vst1.8 {q0}, [r0]! |
86464859 AB |
194 | subs r4, r4, #1 |
195 | bne .Lecbencloop | |
196 | .Lecbencout: | |
197 | pop {r4, pc} | |
198 | ENDPROC(ce_aes_ecb_encrypt) | |
199 | ||
/*
 * ECB decryption.
 * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
 * r4 = blocks remaining (5th argument, read from the stack).
 * Blocks are processed four at a time while at least four remain, then one
 * at a time.
 */
200 | ENTRY(ce_aes_ecb_decrypt) | |
201 | push {r4, lr} | |
/* Two registers were pushed, so the first stack argument is at sp + 8. */
202 | ldr r4, [sp, #8] | |
203 | prepare_key r2, r3 | |
1dede02b AB |
204 | .Lecbdecloop4x: |
205 | subs r4, r4, #4 | |
/* Fewer than 4 blocks left: r4 went negative, fall back to the 1x loop. */
86464859 | 206 | bmi .Lecbdec1x |
1465fb13 | 207 | vld1.8 {q0-q1}, [r1]! |
1dede02b AB |
208 | vld1.8 {q2-q3}, [r1]! |
209 | bl aes_decrypt_4x | |
1465fb13 | 210 | vst1.8 {q0-q1}, [r0]! |
1dede02b AB |
211 | vst1.8 {q2-q3}, [r0]! |
212 | b .Lecbdecloop4x | |
86464859 | 213 | .Lecbdec1x: |
/* Undo the speculative subtraction; r4 is now the 0..3 leftover blocks. */
1dede02b | 214 | adds r4, r4, #4 |
86464859 AB |
215 | beq .Lecbdecout |
216 | .Lecbdecloop: | |
1465fb13 | 217 | vld1.8 {q0}, [r1]! |
86464859 | 218 | bl aes_decrypt |
1465fb13 | 219 | vst1.8 {q0}, [r0]! |
86464859 AB |
220 | subs r4, r4, #1 |
221 | bne .Lecbdecloop | |
222 | .Lecbdecout: | |
223 | pop {r4, pc} | |
224 | ENDPROC(ce_aes_ecb_decrypt) | |
225 | ||
226 | /* | |
fcb0e30d | 227 | * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, |
86464859 | 228 | * int blocks, u8 iv[]) |
fcb0e30d | 229 | * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, |
86464859 AB |
230 | * int blocks, u8 iv[]) |
231 | */ | |
/*
 * CBC encryption, one block at a time (each block depends on the previous
 * ciphertext, which is carried in q0).
 * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
 * r4 = blocks remaining, r5 = iv pointer (both read from the stack).
 * On exit the last ciphertext block is written back to iv[].
 *
 * NOTE(review): the loop tests the block count only after processing a
 * block, so a call with blocks == 0 is not handled here; presumably callers
 * never pass 0 - confirm.
 */
232 | ENTRY(ce_aes_cbc_encrypt) | |
233 | push {r4-r6, lr} | |
/* Four registers were pushed, so the stack arguments start at sp + 16. */
234 | ldrd r4, r5, [sp, #16] | |
235 | vld1.8 {q0}, [r5] | |
236 | prepare_key r2, r3 | |
237 | .Lcbcencloop: | |
1465fb13 | 238 | vld1.8 {q1}, [r1]! @ get next pt block |
86464859 AB |
239 | veor q0, q0, q1 @ ..and xor with iv |
240 | bl aes_encrypt | |
1465fb13 | 241 | vst1.8 {q0}, [r0]! |
86464859 AB |
242 | subs r4, r4, #1 |
243 | bne .Lcbcencloop | |
/* q0 still holds the last ciphertext block: hand it back as the next IV. */
244 | vst1.8 {q0}, [r5] | |
245 | pop {r4-r6, pc} | |
246 | ENDPROC(ce_aes_cbc_encrypt) | |
247 | ||
/*
 * CBC decryption.
 * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
 * r4 = blocks remaining, r5 = iv pointer (both read from the stack),
 * q15 = IV / previous ciphertext block.
 * Blocks are processed four at a time while at least four remain, then one
 * at a time. On exit the last ciphertext block is written back to iv[].
 */
248 | ENTRY(ce_aes_cbc_decrypt) | |
249 | push {r4-r6, lr} | |
/* Four registers were pushed, so the stack arguments start at sp + 16. */
250 | ldrd r4, r5, [sp, #16] | |
1dede02b | 251 | vld1.8 {q15}, [r5] @ keep iv in q15 |
86464859 | 252 | prepare_key r2, r3 |
1dede02b AB |
253 | .Lcbcdecloop4x: |
254 | subs r4, r4, #4 | |
86464859 | 255 | bmi .Lcbcdec1x |
1465fb13 | 256 | vld1.8 {q0-q1}, [r1]! |
1dede02b AB |
257 | vld1.8 {q2-q3}, [r1]! |
/* Keep copies of the ciphertext: each one is the chaining value of the next block. */
258 | vmov q4, q0 | |
259 | vmov q5, q1 | |
260 | vmov q6, q2 | |
261 | vmov q7, q3 | |
262 | bl aes_decrypt_4x | |
263 | veor q0, q0, q15 | |
264 | veor q1, q1, q4 | |
265 | veor q2, q2, q5 | |
266 | veor q3, q3, q6 | |
/* The 4th ciphertext block becomes the chaining value for the next iteration. */
267 | vmov q15, q7 | |
1465fb13 | 268 | vst1.8 {q0-q1}, [r0]! |
1dede02b AB |
269 | vst1.8 {q2-q3}, [r0]! |
270 | b .Lcbcdecloop4x | |
86464859 | 271 | .Lcbcdec1x: |
1dede02b | 272 | adds r4, r4, #4 |
86464859 | 273 | beq .Lcbcdecout |
1dede02b | 274 | vmov q6, q14 @ preserve last round key |
86464859 | 275 | .Lcbcdecloop: |
1465fb13 | 276 | vld1.8 {q0}, [r1]! @ get next ct block |
/*
 * aes_decrypt finishes by XORing q14 into q0, so folding the previous
 * ciphertext into q14 applies the last round key and the CBC chaining XOR
 * in that single step.
 */
86464859 | 277 | veor q14, q15, q6 @ combine prev ct with last key |
1dede02b | 278 | vmov q15, q0 |
86464859 | 279 | bl aes_decrypt |
1465fb13 | 280 | vst1.8 {q0}, [r0]! |
86464859 AB |
281 | subs r4, r4, #1 |
282 | bne .Lcbcdecloop | |
283 | .Lcbcdecout: | |
1dede02b | 284 | vst1.8 {q15}, [r5] @ return last ct block as next iv |
86464859 AB |
285 | pop {r4-r6, pc} |
286 | ENDPROC(ce_aes_cbc_decrypt) | |
287 | ||
143d2647 AB |
288 | |
289 | /* | |
290 | * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[], | |
291 | * int rounds, int bytes, u8 const iv[]) | |
292 | * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[], | |
293 | * int rounds, int bytes, u8 const iv[]) | |
294 | */ | |
295 | ||
/*
 * CBC encryption with ciphertext stealing of one full block followed by a
 * final (possibly partial) block.
 * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
 * r4 = bytes - 16 (length of the final block), r5 = iv pointer.
 * NOTE(review): the code loads and stores exactly two overlapping 16-byte
 * vectors, so it presumably expects 16 < bytes <= 32 - confirm with caller.
 */
296 | ENTRY(ce_aes_cbc_cts_encrypt) | |
297 | push {r4-r6, lr} | |
298 | ldrd r4, r5, [sp, #16] | |
299 | ||
300 | movw ip, :lower16:.Lcts_permute_table | |
301 | movt ip, :upper16:.Lcts_permute_table | |
302 | sub r4, r4, #16 | |
/* Pick two 16-byte windows of the permute table, at offsets r4 and 32 - r4. */
303 | add lr, ip, #32 | |
304 | add ip, ip, r4 | |
305 | sub lr, lr, r4 | |
306 | vld1.8 {q5}, [ip] | |
307 | vld1.8 {q6}, [lr] | |
308 | ||
/* q0 = first 16 input bytes, q3 = last 16 input bytes (the two may overlap). */
309 | add ip, r1, r4 | |
310 | vld1.8 {q0}, [r1] @ overlapping loads | |
311 | vld1.8 {q3}, [ip] | |
312 | ||
313 | vld1.8 {q1}, [r5] @ get iv | |
314 | prepare_key r2, r3 | |
315 | ||
316 | veor q0, q0, q1 @ xor with iv | |
317 | bl aes_encrypt | |
318 | ||
/*
 * q2 = first ciphertext block permuted by q5 (this becomes the final,
 * truncated output block); q1 = final plaintext bytes permuted by q6.
 * vtbl writes 0 for the 0xff table indices.
 */
319 | vtbl.8 d4, {d0-d1}, d10 | |
320 | vtbl.8 d5, {d0-d1}, d11 | |
321 | vtbl.8 d2, {d6-d7}, d12 | |
322 | vtbl.8 d3, {d6-d7}, d13 | |
323 | ||
324 | veor q0, q0, q1 | |
325 | bl aes_encrypt | |
326 | ||
/* Write the tail at out + r4 first; the full block at out then overwrites the overlap. */
327 | add r4, r0, r4 | |
328 | vst1.8 {q2}, [r4] @ overlapping stores | |
329 | vst1.8 {q0}, [r0] | |
330 | ||
331 | pop {r4-r6, pc} | |
332 | ENDPROC(ce_aes_cbc_cts_encrypt) | |
333 | ||
/*
 * CBC decryption with ciphertext stealing of one full block followed by a
 * final (possibly partial) block.
 * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
 * r4 = bytes - 16 (length of the final block), r5 = iv pointer.
 * NOTE(review): the code loads and stores exactly two overlapping 16-byte
 * vectors, so it presumably expects 16 < bytes <= 32 - confirm with caller.
 */
334 | ENTRY(ce_aes_cbc_cts_decrypt) | |
335 | push {r4-r6, lr} | |
336 | ldrd r4, r5, [sp, #16] | |
337 | ||
338 | movw ip, :lower16:.Lcts_permute_table | |
339 | movt ip, :upper16:.Lcts_permute_table | |
340 | sub r4, r4, #16 | |
/* Pick two 16-byte windows of the permute table, at offsets r4 and 32 - r4. */
341 | add lr, ip, #32 | |
342 | add ip, ip, r4 | |
343 | sub lr, lr, r4 | |
344 | vld1.8 {q5}, [ip] | |
345 | vld1.8 {q6}, [lr] | |
346 | ||
/* q0 = first 16 input bytes, q1 = last 16 input bytes (the two may overlap). */
347 | add ip, r1, r4 | |
348 | vld1.8 {q0}, [r1] @ overlapping loads | |
349 | vld1.8 {q1}, [ip] | |
350 | ||
351 | vld1.8 {q3}, [r5] @ get iv | |
352 | prepare_key r2, r3 | |
353 | ||
354 | bl aes_decrypt | |
355 | ||
/*
 * q2 = decrypted block permuted by q5 (vtbl zeroes the 0xff lanes);
 * q0 has the final ciphertext bytes merged in by q6 (vtbx leaves the
 * 0xff lanes of q0 untouched).
 */
356 | vtbl.8 d4, {d0-d1}, d10 | |
357 | vtbl.8 d5, {d0-d1}, d11 | |
358 | vtbx.8 d0, {d2-d3}, d12 | |
359 | vtbx.8 d1, {d2-d3}, d13 | |
360 | ||
361 | veor q1, q1, q2 | |
362 | bl aes_decrypt | |
363 | veor q0, q0, q3 @ xor with iv | |
364 | ||
/* Write the tail at out + r4 first; the full block at out then overwrites the overlap. */
365 | add r4, r0, r4 | |
366 | vst1.8 {q1}, [r4] @ overlapping stores | |
367 | vst1.8 {q0}, [r0] | |
368 | ||
369 | pop {r4-r6, pc} | |
370 | ENDPROC(ce_aes_cbc_cts_decrypt) | |
371 | ||
372 | ||
86464859 | 373 | /* |
fcb0e30d | 374 | * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds, |
86464859 AB |
375 | * int blocks, u8 ctr[]) |
376 | */ | |
/*
 * CTR mode encryption/decryption.
 * Register use: r0 = out, r1 = in, r2 = rk, r3 = rounds,
 * r4 = blocks remaining, r5 = ctr pointer (both read from the stack),
 * q7 = current counter block as stored in memory,
 * r6 = byte-reversed copy of the last 32-bit word of the counter.
 * A block count that goes negative in the 1x loop selects the "tail block"
 * path, which writes the raw key stream to out instead of XORing input.
 * On exit the next counter value is written back to ctr[].
 */
377 | ENTRY(ce_aes_ctr_encrypt) | |
378 | push {r4-r6, lr} | |
379 | ldrd r4, r5, [sp, #16] | |
1dede02b | 380 | vld1.8 {q7}, [r5] @ load ctr |
86464859 | 381 | prepare_key r2, r3 |
1dede02b | 382 | vmov r6, s31 @ keep swabbed ctr in r6 |
86464859 AB |
383 | rev r6, r6 |
/*
 * The 4x path below only ever bumps the low 32-bit word. If adding the
 * block count to it would carry, use the 1x loop, which propagates carries.
 */
384 | cmn r6, r4 @ 32 bit overflow? | |
385 | bcs .Lctrloop | |
1dede02b AB |
386 | .Lctrloop4x: |
387 | subs r4, r4, #4 | |
86464859 | 388 | bmi .Lctr1x |
f3456b9f AB |
389 | |
390 | /* | |
391 | * NOTE: the sequence below has been carefully tweaked to avoid | |
392 | * a silicon erratum that exists in Cortex-A57 (#1742098) and | |
393 | * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs | |
394 | * may produce an incorrect result if they take their input from a | |
395 | * register of which a single 32-bit lane has been updated the last | |
396 | * time it was modified. To work around this, the lanes of registers | |
397 | * q0-q3 below are not manipulated individually, and the different | |
398 | * counter values are prepared by successive manipulations of q7. | |
399 | */ | |
400 | add ip, r6, #1 | |
1dede02b | 401 | vmov q0, q7 |
f3456b9f AB |
402 | rev ip, ip |
403 | add lr, r6, #2 | |
404 | vmov s31, ip @ set lane 3 of q1 via q7 | |
405 | add ip, r6, #3 | |
406 | rev lr, lr | |
1dede02b | 407 | vmov q1, q7 |
f3456b9f AB |
408 | vmov s31, lr @ set lane 3 of q2 via q7 |
409 | rev ip, ip | |
1dede02b | 410 | vmov q2, q7 |
f3456b9f AB |
411 | vmov s31, ip @ set lane 3 of q3 via q7 |
412 | add r6, r6, #4 | |
1dede02b | 413 | vmov q3, q7 |
f3456b9f | 414 | |
/* q4, q5, q6 and q15 receive the four input blocks. */
1dede02b AB |
415 | vld1.8 {q4-q5}, [r1]! |
416 | vld1.8 {q6}, [r1]! | |
417 | vld1.8 {q15}, [r1]! | |
418 | bl aes_encrypt_4x | |
419 | veor q0, q0, q4 | |
420 | veor q1, q1, q5 | |
421 | veor q2, q2, q6 | |
422 | veor q3, q3, q15 | |
86464859 | 423 | rev ip, r6 |
1465fb13 | 424 | vst1.8 {q0-q1}, [r0]! |
1dede02b AB |
425 | vst1.8 {q2-q3}, [r0]! |
/* Leave q7 holding the counter for the next block (r6 was advanced by 4 above). */
426 | vmov s31, ip | |
427 | b .Lctrloop4x | |
86464859 | 428 | .Lctr1x: |
1dede02b | 429 | adds r4, r4, #4 |
86464859 AB |
430 | beq .Lctrout |
431 | .Lctrloop: | |
1dede02b | 432 | vmov q0, q7 |
86464859 | 433 | bl aes_encrypt |
86464859 AB |
434 | |
435 | adds r6, r6, #1 @ increment BE ctr | |
436 | rev ip, r6 | |
1dede02b | 437 | vmov s31, ip |
/* Carry out of the low word: ripple it into the higher counter words. */
86464859 | 438 | bcs .Lctrcarry |
511306b2 EB |
439 | |
440 | .Lctrcarrydone: | |
441 | subs r4, r4, #1 | |
442 | bmi .Lctrtailblock @ blocks < 0 means tail block | |
443 | vld1.8 {q3}, [r1]! | |
444 | veor q3, q0, q3 | |
445 | vst1.8 {q3}, [r0]! | |
/* The flags tested here are still those of the subs above. */
86464859 | 446 | bne .Lctrloop |
511306b2 | 447 | |
86464859 | 448 | .Lctrout: |
1dede02b | 449 | vst1.8 {q7}, [r5] @ return next CTR value |
86464859 AB |
450 | pop {r4-r6, pc} |
451 | ||
1465fb13 | 452 | .Lctrtailblock: |
/*
 * The :64 qualifier is an alignment hint: out must be 8-byte aligned on
 * this path. NOTE(review): presumably the caller passes a suitably
 * aligned scratch buffer here - confirm.
 */
511306b2 EB |
453 | vst1.8 {q0}, [r0, :64] @ return the key stream |
454 | b .Lctrout | |
86464859 AB |
455 | |
456 | .Lctrcarry: | |
/*
 * Increment the remaining three counter words in turn (s30, then s29, then
 * s28), stopping as soon as an increment does not carry.
 */
1dede02b | 457 | .irp sreg, s30, s29, s28 |
86464859 AB |
458 | vmov ip, \sreg @ load next word of ctr |
459 | rev ip, ip @ ... to handle the carry | |
460 | adds ip, ip, #1 | |
461 | rev ip, ip | |
462 | vmov \sreg, ip | |
511306b2 | 463 | bcc .Lctrcarrydone |
86464859 | 464 | .endr |
511306b2 | 465 | b .Lctrcarrydone |
86464859 AB |
466 | ENDPROC(ce_aes_ctr_encrypt) |
467 | ||
468 | /* | |
fcb0e30d | 469 | * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, |
c61b1607 | 470 | * int bytes, u8 iv[], u32 const rk2[], int first) |
fcb0e30d | 471 | * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds, |
c61b1607 | 472 | * int bytes, u8 iv[], u32 const rk2[], int first) |
86464859 AB |
473 | */ |
474 | ||
/*
 * next_tweak - derive the next XTS tweak: \out = \in multiplied by x in
 * GF(2^128), i.e. a 128-bit left shift by one with conditional reduction.
 *   \const : mask vector built by ce_aes_xts_init (q15: low half 1,
 *            high half 0x87)
 *   \tmp   : scratch q register, clobbered
 * \out may be the same register as \in.
 */
475 | .macro next_tweak, out, in, const, tmp | |
/* Replicate the top bit of each 64-bit half across that half. */
476 | vshr.s64 \tmp, \in, #63 | |
/* Keep 1 where the low half overflowed and 0x87 where the high half did. */
477 | vand \tmp, \tmp, \const | |
/* Shift both 64-bit halves left by one bit. */
478 | vadd.u64 \out, \in, \in | |
/* Swap halves: the low half's carry goes to the high half, the 0x87 reduction to the low half. */
479 | vext.8 \tmp, \tmp, \tmp, #8 | |
480 | veor \out, \out, \tmp | |
481 | .endm | |
482 | ||
/*
 * ce_aes_xts_init - shared prologue of the XTS entry points.
 * Must be called with 'bl' right after the caller's 'push {r4-r6, lr}',
 * because the stack arguments are read relative to that frame.
 *
 * On return:
 *   q15 : tweak mask for next_tweak (low half 1, high half 0x87)
 *   r4  : bytes, r5 : iv pointer
 *   q0  : the IV; encrypted with key 2 (rk2) if 'first' == 1
 *   r6  : 'first' as passed if it was not 1, otherwise the rk2 pointer.
 *         The callers only test r6 against zero.
 * When the IV is encrypted this also clobbers q8-q14, ip and the flags.
 */
86464859 | 483 | ce_aes_xts_init: |
e53b43d8 AB |
484 | vmov.i32 d30, #0x87 @ compose tweak mask vector |
485 | vmovl.u32 q15, d30 | |
486 | vshr.u64 d30, d31, #7 | |
86464859 AB |
487 | |
488 | ldrd r4, r5, [sp, #16] @ load args | |
489 | ldr r6, [sp, #28] | |
490 | vld1.8 {q0}, [r5] @ load iv | |
491 | teq r6, #1 @ start of a block? | |
492 | bxne lr | |
493 | ||
494 | @ Encrypt the IV in q0 with the second AES key. This should only | |
495 | @ be done at the start of a block. | |
496 | ldr r6, [sp, #24] @ load AES key 2 | |
497 | prepare_key r6, r3 | |
498 | add ip, r6, #32 @ 3rd round key of key 2 | |
/* aes_encrypt's body ends in 'bx lr', which returns straight to our caller. */
499 | b .Laes_encrypt_tweak @ tail call | |
500 | ENDPROC(ce_aes_xts_init) | |
501 | ||
/*
 * XTS encryption with ciphertext stealing for a trailing partial block.
 * Register use: r0 = out, r1 = in, r2 = rk1, r3 = rounds,
 * r4 = bytes remaining, r5 = iv pointer, r6 = non-zero at start of a block
 * (all three set up by ce_aes_xts_init),
 * q4 = current tweak, q5-q7 = tweaks of blocks 2-4 in the 4x loop,
 * q15 = tweak mask.
 * On exit the last tweak used is written back to iv[].
 */
502 | ENTRY(ce_aes_xts_encrypt) | |
503 | push {r4-r6, lr} | |
504 | ||
505 | bl ce_aes_xts_init @ run shared prologue | |
506 | prepare_key r2, r3 | |
1dede02b | 507 | vmov q4, q0 |
86464859 AB |
508 | |
/*
 * When continuing (r6 == 0) the stored IV is the last tweak already used,
 * so fall through and advance it first. At the start of a block q4 is used
 * as is.
 */
509 | teq r6, #0 @ start of a block? |
1dede02b | 510 | bne .Lxtsenc4x |
86464859 | 511 | |
1dede02b AB |
512 | .Lxtsencloop4x: |
513 | next_tweak q4, q4, q15, q10 | |
514 | .Lxtsenc4x: | |
c61b1607 | 515 | subs r4, r4, #64 |
86464859 | 516 | bmi .Lxtsenc1x |
1dede02b AB |
517 | vld1.8 {q0-q1}, [r1]! @ get 4 pt blocks |
518 | vld1.8 {q2-q3}, [r1]! | |
519 | next_tweak q5, q4, q15, q10 | |
520 | veor q0, q0, q4 | |
521 | next_tweak q6, q5, q15, q10 | |
522 | veor q1, q1, q5 | |
523 | next_tweak q7, q6, q15, q10 | |
524 | veor q2, q2, q6 | |
525 | veor q3, q3, q7 | |
526 | bl aes_encrypt_4x | |
527 | veor q0, q0, q4 | |
528 | veor q1, q1, q5 | |
529 | veor q2, q2, q6 | |
530 | veor q3, q3, q7 | |
531 | vst1.8 {q0-q1}, [r0]! @ write 4 ct blocks | |
532 | vst1.8 {q2-q3}, [r0]! | |
533 | vmov q4, q7 | |
86464859 | 534 | teq r4, #0 |
c61b1607 | 535 | beq .Lxtsencret |
1dede02b | 536 | b .Lxtsencloop4x |
86464859 | 537 | .Lxtsenc1x: |
c61b1607 | 538 | adds r4, r4, #64 |
/*
 * NOTE(review): r4 can only be zero here if bytes was 0 on entry, and
 * .Lxtsencout would then still store q0 to out - confirm callers never
 * pass bytes == 0.
 */
86464859 | 539 | beq .Lxtsencout |
c61b1607 AB |
540 | subs r4, r4, #16 |
/* Fewer than 16 bytes remain: steal from the last block of the 4x batch. */
541 | bmi .LxtsencctsNx | |
86464859 | 542 | .Lxtsencloop: |
1465fb13 | 543 | vld1.8 {q0}, [r1]! |
c61b1607 | 544 | .Lxtsencctsout: |
1dede02b | 545 | veor q0, q0, q4 |
86464859 | 546 | bl aes_encrypt |
1dede02b | 547 | veor q0, q0, q4 |
c61b1607 | 548 | teq r4, #0 |
86464859 | 549 | beq .Lxtsencout |
c61b1607 | 550 | subs r4, r4, #16 |
/* The flags from the subs above are still live at the bmi below. */
1dede02b | 551 | next_tweak q4, q4, q15, q6 |
c61b1607 AB |
552 | bmi .Lxtsenccts |
553 | vst1.8 {q0}, [r0]! | |
86464859 AB |
554 | b .Lxtsencloop |
555 | .Lxtsencout: | |
c61b1607 AB |
556 | vst1.8 {q0}, [r0] |
557 | .Lxtsencret: | |
1dede02b | 558 | vst1.8 {q4}, [r5] |
86464859 | 559 | pop {r4-r6, pc} |
c61b1607 AB |
560 | |
561 | .LxtsencctsNx: | |
/* Re-use the 4th ciphertext block of the last batch and step out back onto it. */
562 | vmov q0, q3 | |
563 | sub r0, r0, #16 | |
564 | .Lxtsenccts: | |
565 | movw ip, :lower16:.Lcts_permute_table | |
566 | movt ip, :upper16:.Lcts_permute_table | |
567 | ||
/* r4 is negative here (tail length - 16). */
568 | add r1, r1, r4 @ rewind input pointer | |
569 | add r4, r4, #16 @ # bytes in final block | |
570 | add lr, ip, #32 | |
571 | add ip, ip, r4 | |
572 | sub lr, lr, r4 | |
573 | add r4, r0, r4 @ output address of final block | |
574 | ||
575 | vld1.8 {q1}, [r1] @ load final partial block | |
576 | vld1.8 {q2}, [ip] | |
577 | vld1.8 {q3}, [lr] | |
578 | ||
/*
 * q2 = previous ciphertext block permuted for the truncated final output
 * (vtbl zeroes the 0xff lanes); q0 = that block with the final plaintext
 * bytes merged in (vtbx leaves the 0xff lanes untouched).
 */
579 | vtbl.8 d4, {d0-d1}, d4 | |
580 | vtbl.8 d5, {d0-d1}, d5 | |
581 | vtbx.8 d0, {d2-d3}, d6 | |
582 | vtbx.8 d1, {d2-d3}, d7 | |
583 | ||
584 | vst1.8 {q2}, [r4] @ overlapping stores | |
/* r4 = 0 makes the shared code at .Lxtsencctsout exit after this last block. */
585 | mov r4, #0 | |
586 | b .Lxtsencctsout | |
86464859 AB |
587 | ENDPROC(ce_aes_xts_encrypt) |
588 | ||
589 | ||
/*
 * XTS decryption with ciphertext stealing for a trailing partial block.
 * Register use: r0 = out, r1 = in, r2 = rk1, r3 = rounds,
 * r4 = bytes remaining, r5 = iv pointer, r6 = non-zero at start of a block
 * (all three set up by ce_aes_xts_init),
 * q4 = current tweak, q5-q7 = tweaks of blocks 2-4 in the 4x loop,
 * q15 = tweak mask.
 * On exit the last tweak held in q4 is written back to iv[].
 */
590 | ENTRY(ce_aes_xts_decrypt) | |
591 | push {r4-r6, lr} | |
592 | ||
593 | bl ce_aes_xts_init @ run shared prologue | |
594 | prepare_key r2, r3 | |
1dede02b | 595 | vmov q4, q0 |
86464859 | 596 | |
/*
 * When bytes is not a multiple of 16, hold back one full block so that it
 * is handled together with the partial tail in .Lxtsdeccts.
 */
c61b1607 AB |
597 | /* subtract 16 bytes if we are doing CTS */ |
598 | tst r4, #0xf | |
599 | subne r4, r4, #0x10 | |
600 | ||
86464859 | 601 | teq r6, #0 @ start of a block? |
1dede02b | 602 | bne .Lxtsdec4x |
86464859 | 603 | |
1dede02b AB |
604 | .Lxtsdecloop4x: |
605 | next_tweak q4, q4, q15, q10 | |
606 | .Lxtsdec4x: | |
c61b1607 | 607 | subs r4, r4, #64 |
86464859 | 608 | bmi .Lxtsdec1x |
1dede02b AB |
609 | vld1.8 {q0-q1}, [r1]! @ get 4 ct blocks |
610 | vld1.8 {q2-q3}, [r1]! | |
611 | next_tweak q5, q4, q15, q10 | |
612 | veor q0, q0, q4 | |
613 | next_tweak q6, q5, q15, q10 | |
614 | veor q1, q1, q5 | |
615 | next_tweak q7, q6, q15, q10 | |
616 | veor q2, q2, q6 | |
617 | veor q3, q3, q7 | |
618 | bl aes_decrypt_4x | |
619 | veor q0, q0, q4 | |
620 | veor q1, q1, q5 | |
621 | veor q2, q2, q6 | |
622 | veor q3, q3, q7 | |
623 | vst1.8 {q0-q1}, [r0]! @ write 4 pt blocks | |
624 | vst1.8 {q2-q3}, [r0]! | |
625 | vmov q4, q7 | |
86464859 AB |
626 | teq r4, #0 |
627 | beq .Lxtsdecout | |
1dede02b | 628 | b .Lxtsdecloop4x |
86464859 | 629 | .Lxtsdec1x: |
c61b1607 | 630 | adds r4, r4, #64 |
86464859 | 631 | beq .Lxtsdecout |
c61b1607 | 632 | subs r4, r4, #16 |
86464859 | 633 | .Lxtsdecloop: |
1465fb13 | 634 | vld1.8 {q0}, [r1]! |
/*
 * This bmi tests the flags of the preceding subs (the load does not change
 * them). Negative means q0 is the held-back block that precedes a partial
 * tail.
 */
c61b1607 AB |
635 | bmi .Lxtsdeccts |
636 | .Lxtsdecctsout: | |
1dede02b | 637 | veor q0, q0, q4 |
86464859 | 638 | bl aes_decrypt |
1dede02b | 639 | veor q0, q0, q4 |
1465fb13 | 640 | vst1.8 {q0}, [r0]! |
c61b1607 | 641 | teq r4, #0 |
86464859 | 642 | beq .Lxtsdecout |
c61b1607 | 643 | subs r4, r4, #16 |
1dede02b | 644 | next_tweak q4, q4, q15, q6 |
86464859 AB |
645 | b .Lxtsdecloop |
646 | .Lxtsdecout: | |
1dede02b | 647 | vst1.8 {q4}, [r5] |
86464859 | 648 | pop {r4-r6, pc} |
c61b1607 AB |
649 | |
650 | .Lxtsdeccts: | |
651 | movw ip, :lower16:.Lcts_permute_table | |
652 | movt ip, :upper16:.Lcts_permute_table | |
653 | ||
/* r4 is negative here (tail length - 16). */
654 | add r1, r1, r4 @ rewind input pointer | |
655 | add r4, r4, #16 @ # bytes in final block | |
656 | add lr, ip, #32 | |
657 | add ip, ip, r4 | |
658 | sub lr, lr, r4 | |
659 | add r4, r0, r4 @ output address of final block | |
660 | ||
/*
 * The held-back full block in q0 is decrypted with the *following* tweak
 * (q5); the merged final block is then decrypted with the current tweak
 * (q4) by the shared code at .Lxtsdecctsout.
 */
661 | next_tweak q5, q4, q15, q6 | |
662 | ||
663 | vld1.8 {q1}, [r1] @ load final partial block | |
664 | vld1.8 {q2}, [ip] | |
665 | vld1.8 {q3}, [lr] | |
666 | ||
667 | veor q0, q0, q5 | |
668 | bl aes_decrypt | |
669 | veor q0, q0, q5 | |
670 | ||
/*
 * q2 = decrypted block permuted for the truncated final output (vtbl zeroes
 * the 0xff lanes); q0 = that block with the final ciphertext bytes merged
 * in (vtbx leaves the 0xff lanes untouched).
 */
671 | vtbl.8 d4, {d0-d1}, d4 | |
672 | vtbl.8 d5, {d0-d1}, d5 | |
673 | vtbx.8 d0, {d2-d3}, d6 | |
674 | vtbx.8 d1, {d2-d3}, d7 | |
675 | ||
676 | vst1.8 {q2}, [r4] @ overlapping stores | |
/* r4 = 0 makes the shared code at .Lxtsdecctsout exit after this last block. */
677 | mov r4, #0 | |
678 | b .Lxtsdecctsout | |
86464859 AB |
679 | ENDPROC(ce_aes_xts_decrypt) |
680 | ||
681 | /* | |
682 | * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the | |
683 | * AES sbox substitution on each byte in | |
684 | * 'input' | |
685 | */ | |
/*
 * In:  r0 = 32-bit input word
 * Out: r0 = input with the AES S-box applied to each byte
 * Clobbers q0 and q1.
 */
686 | ENTRY(ce_aes_sub) | |
/* Replicate the input into all four 32-bit lanes of q1. */
687 | vdup.32 q1, r0 | |
/* Zero state: the AddRoundKey step inside aese then yields q1 itself. */
688 | veor q0, q0, q0 | |
689 | aese.8 q0, q1 | |
690 | vmov r0, s0 | |
691 | bx lr | |
692 | ENDPROC(ce_aes_sub) | |
693 | ||
694 | /* | |
695 | * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns | |
696 | * operation on round key *src | |
697 | */ | |
/*
 * In:  r0 = dst, r1 = src (one 16-byte round key each)
 * Loads the round key at src, applies aesimc (Inverse MixColumns) and
 * stores the result at dst. Clobbers q0.
 */
698 | ENTRY(ce_aes_invert) | |
fafb1dca | 699 | vld1.32 {q0}, [r1] |
86464859 | 700 | aesimc.8 q0, q0 |
fafb1dca | 701 | vst1.32 {q0}, [r0] |
86464859 AB |
702 | bx lr |
703 | ENDPROC(ce_aes_invert) | |
c61b1607 AB |
704 | |
/*
 * Permute vectors for the ciphertext stealing code: 16 bytes of 0xff, the
 * identity indices 0..15, then 16 more bytes of 0xff. The CTS code loads
 * 16-byte windows at table + n and table + 32 - n and uses them as
 * vtbl/vtbx index vectors. An index of 0xff is out of range, so vtbl
 * writes zero for that lane and vtbx leaves the lane unchanged.
 */
705 | .section ".rodata", "a" | |
706 | .align 6 | |
707 | .Lcts_permute_table: | |
708 | .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff | |
709 | .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff | |
710 | .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 | |
711 | .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf | |
712 | .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff | |
713 | .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff |