Commit | Line | Data |
---|---|---|
d2912cb1 | 1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
f1e866b1 | 2 | /* |
3759ee05 | 3 | * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions. |
f1e866b1 | 4 | * |
b575b5a1 AB |
5 | * Copyright (C) 2015 - 2017 Linaro Ltd. |
6 | * Copyright (C) 2023 Google LLC. <ardb@google.com> | |
f1e866b1 AB |
7 | */ |
8 | ||
9 | #include <linux/linkage.h> | |
10 | #include <asm/assembler.h> | |
11 | ||
7548bf8c SA |
12 | .arch armv8-a |
13 | .fpu crypto-neon-fp-armv8 | |
14 | ||
	// GHASH state registers. Note the deliberate overlaps below:
	// register live ranges never intersect, so q/d registers are
	// reused under multiple names.
	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4	// overlaps XH — never live at the same time

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	// scratch registers for the __pmull_p8 (vmull.p8 based) fallback
	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	XH2		.req	q9	// overlaps t4q (p64 path only)

	// byte-rotated copies of the hash key halves, precomputed by
	// pmull_ghash_update_p8 and consumed by __pmull_p8
	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28	// reduction constant 0xe1 << 57 (p64 path)
	SHASH2_p8	.req	d28	// overlaps MASK (p8 path only)

	// byte masks used by __pmull_p8 to clear carry bytes
	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31	// overlaps k48 (p64 path only)

	// additional key-derived values loaded from the key struct for
	// 4-way aggregation (presumably higher powers of H — confirm
	// against the C side that populates the key struct)
	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13	// folded halves of HH3/HH4 (Karatsuba middle term)

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	// second accumulator set for the 4-way aggregated p64 path;
	// overlaps the p8 scratch registers above
	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17
f1e866b1 | 94 | .text |
f1e866b1 | 95 | |
3759ee05 AB |
// __pmull_p64 - 64x64 -> 128 bit carryless (polynomial) multiply using
// a single vmull.p64 instruction (ARMv8 Crypto Extensions).
// The b1..b4 arguments are ignored; they exist only so the macro has
// the same signature as __pmull_p8 and can be invoked as __pmull_\pn.
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm
99 | ||
/*
 * This implementation of 64x64 -> 128 bit polynomial multiplication
 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
 * "Fast Software Polynomial Multiplication on ARM Processors Using
 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
 *
 * It has been slightly tweaked for in-order performance, and to allow
 * 'rq' to overlap with 'ad' or 'bd'.
 *
 * b1..b4 default to scratch registers, in which case the rotated
 * copies of bd are produced here with vext; a caller may instead pass
 * precomputed rotations (s1l..s4h), and the .ifc guards then skip the
 * corresponding vext steps.
 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48		@ mask off bytes that carried out
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15	@ shift partial products into place
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q		@ accumulate everything into rq
	veor		\rq, \rq, t2q
	.endm
157 | ||
//
// PMULL (64x64->128) based reduction for CPUs that can do
// it in a single instruction.
//
// On entry, XL:XH hold the 256-bit product and XM the Karatsuba
// middle term; MASK must hold the reduction constant (0xe1 << 57).
// The caller completes the fold with two veors after the macro.
//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H	// fold middle term into XH/XL
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK		// second reduction step
	.endm
172 | ||
//
// Alternative reduction for CPUs that lack support for the
// 64x64->128 PMULL instruction
//
// Implements the same reduction using shifts and XORs only: the
// shift amounts (#57, #62, #63 and #1, #2 via #6, #7) correspond to
// multiplication by the GHASH reduction polynomial.
//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L	// fold middle term into XH/XL
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57		// multiply by the polynomial:
	vshl.i64	T2, XL, #62		// 64 - 7, 64 - 2, 64 - 1
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1		// second step: shift right
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm
195 | ||
// ghash_update - main GHASH (and optionally GCM en/decrypt) loop.
//
// Arguments:
//   pn        - multiply flavour: 'p64' or 'p8' (selects __pmull_\pn)
//   enc       - if non-blank, name prefix of the per-block cipher
//               macros to invoke ('enc' or 'dec' -> enc_1x/enc_4x etc.)
//   aggregate - if 1, process 4 blocks at a time on the p64 path
//   head      - if 1, an optional head block pointer is taken from [sp]
//
// Register contract (established by the callers in this file):
//   r0 = #blocks, r1 = dg[] (GHASH state), r2 = src, r3 = key struct.
// Expects SHASH (and, for p64, MASK/SHASH2_p64 unless \enc sets them
// up inside the loop) to be preloaded.
	.macro		ghash_update, pn, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0			// Z flag steers the final bne 0b
	b		3f
	.endif

0:	.ifc		\pn, p64
	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	// the cipher call may clobber our constants; reload the extra
	// key values and rebuild the folded terms and reduction mask
	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2		// GHASH operates on big-endian data
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8	// fold current state into block 0
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T1, T3
	vrev64.8	T3, T2

	// 4-way aggregated multiply: blocks 0..3 times HH4/HH3/HH/SHASH,
	// each as a 3-multiply Karatsuba, accumulated into XL/XM/XH
	vmull.p64	XH, HH4_H, XL_H		// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L		// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L	// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H	// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L		// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H		// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L	// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H	// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f			// last 4 blocks: reduce and exit

	vld1.8		{XL2-XM2}, [r2]!	// prefetch next pair of blocks

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
	.endif

	// single-block path (also used to drain blocks until a multiple
	// of 4 remains on the aggregated path)
2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	// rebuild constants that the cipher call may have clobbered
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b			// more blocks remaining?
	.endm
326 | ||
/*
 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 *			   struct ghash_key const *k, const char *head)
 *
 * p64 variant: uses the vmull.p64 instruction. Loads the hash key and
 * the extra precomputed key values from the key struct, builds the
 * folded Karatsuba terms and the reduction mask, then runs the main
 * GHASH loop (with 4-way aggregation).
 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1		// reduction constant 0xe1 << 57
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
	vst1.64		{XL}, [r1]		// store updated digest

	bx		lr
ENDPROC(pmull_ghash_update_p64)
349 | ||
// p8 fallback variant of pmull_ghash_update (same C signature as the
// p64 variant above) for CPUs without vmull.p64: precomputes the
// byte-rotated copies of the hash key and the carry masks consumed by
// __pmull_p8, then runs the main loop without aggregation macros.
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	// precompute the four byte rotations of each key half so
	// __pmull_p8 can skip its vext steps on every block
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	// byte masks used to discard carried-out bytes in __pmull_p8
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
	vst1.64		{XL}, [r1]		// store updated digest

	bx		lr
ENDPROC(pmull_ghash_update_p8)
b575b5a1 AB |
372 | |
	// AES/CTR registers for the GCM code below. e0..e3 hold up to
	// four keystream (encrypted counter) blocks; they overlap the
	// GHASH scratch registers above, which is safe because the
	// cipher macros run before the GHASH constants are (re)built.
	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	ctr		.req	q13	// counter block (IV || counter)
	ctr0		.req	d26
	ctr1		.req	d27

	// round-key double buffer for aes_encrypt
	ek0		.req	q14
	ek1		.req	q15
389 | ||
// round - apply one full AES round (AddRoundKey+SubBytes+ShiftRows via
// aese, then MixColumns via aesmc) with round key \rk to every state
// register in \regs.
	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm
396 | ||
// aes_encrypt - encrypt the blocks in \regs in place with the key
// schedule at \rkp. \rounds selects the variant: <12 -> AES-128,
// ==12 -> AES-192, >12 -> AES-256; the extra rounds are executed
// before the common tail at .L\@. Round keys are streamed through the
// ek0/ek1 double buffer so loads overlap computation.
// Clobbers: ek0, ek1, advances \rkp.
	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]	// final round key

	// last round: aese without aesmc, then final AddRoundKey
	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm
431 | ||
// pmull_aes_encrypt - produce one block of CTR keystream in e0.
// In:  r3 = key struct (round keys assumed at offset 64 — matches the
//      other users of r3+64 in this file), r5 = 12-byte IV,
//      r6 = #rounds, r7 = counter (host order).
// Out: e0 = encrypted counter block; r7 incremented.
// Clobbers: ip, r8, ctr, ek0, ek1.
pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7			// counter is big-endian on the wire
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8		// splice counter into the block
	vmov		e0, ctr

	add		ip, r3, #64		// round keys at key struct + 64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)
446 | ||
// pmull_aes_encrypt_4x - produce four consecutive blocks of CTR
// keystream in e0..e3. Same register contract as pmull_aes_encrypt;
// r7 is incremented by 4. The byte-reversed counter is respliced into
// ctr1[1] before snapshotting each block.
// Clobbers: ip, r8, ctr, ek0, ek1.
pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr			// counter + 0
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr			// counter + 1
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr			// counter + 2
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr			// counter + 3

	add		ip, r3, #64		// round keys at key struct + 64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)
473 | ||
// pmull_aes_encrypt_final - produce the keystream for the tail block
// (e0, using the current counter) and for the tag (e1, using counter
// value 1 as GCM prescribes for the J0 block).
// Same register contract as pmull_aes_encrypt; r7 is overwritten.
// Clobbers: ip, r8, ctr, ek0, ek1.
pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr			// keystream for the tail block
	vmov.32		ctr1[1], r7
	vmov		e1, ctr			// keystream for the tag

	add		ip, r3, #64		// round keys at key struct + 64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)
490 | ||
// enc_1x/enc_4x - encrypt: XOR keystream into the input register(s)
// in place, so \in* afterwards hold the CIPHERTEXT (which is what the
// surrounding ghash_update loop hashes), and store it to dst (r4).
// dec_1x/dec_4x - decrypt: XOR into the keystream registers instead,
// leaving \in* holding the original CIPHERTEXT for hashing while the
// recovered plaintext is stored to dst (r4).
	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm
526 | ||
/*
 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
 *			  struct gcm_key const *k, char *dst,
 *			  char *iv, int rounds, u32 counter)
 *
 * Fused GCM encrypt: runs ghash_update with the enc_1x/enc_4x hooks so
 * each block is CTR-encrypted and then folded into the GHASH digest.
 * Stack args (iv, rounds, counter) land in r4-r7 for the cipher
 * helpers; the remaining GHASH constants are built inside the loop.
 */
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]	// dst, iv
	ldrd		r6, r7, [sp, #32]	// rounds, counter

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
	vst1.64		{XL}, [r1]		// store updated digest

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)
544 | ||
/*
 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
 *			  struct gcm_key const *k, char *dst,
 *			  char *iv, int rounds, u32 counter)
 *
 * Fused GCM decrypt: identical to pmull_gcm_encrypt except that the
 * dec_1x/dec_4x hooks hash the incoming ciphertext while storing the
 * recovered plaintext.
 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]	// dst, iv
	ldrd		r6, r7, [sp, #32]	// rounds, counter

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
	vst1.64		{XL}, [r1]		// store updated digest

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)
562 | ||
/*
 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
 *			    struct gcm_key const *k, char *head,
 *			    char *iv, int rounds, u32 counter)
 *
 * Finish a GCM encryption: encrypt the partial tail block (if
 * r0 = bytes != 0), fold it into the digest, and produce the tag.
 * The .Lpermute table is indexed relative to its midpoint to build
 * vtbl permute vectors that shift/select the valid tail bytes.
 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]	// head, iv
	ldrd		r6, r7, [sp, #32]	// rounds, counter

	bl		pmull_aes_encrypt_final	// e0 = tail keystream, e1 = tag keystream

	cmp		r0, #0			// any tail bytes? (flags reused by bne below)
	beq		.Lenc_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0		// rewind to the tail block
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l		// align keystream with the tail bytes
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l		// zero-pad tail for ghash input
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1			// single block; mov leaves flags intact
	bne		3f			// process head block first
						// (label 3 is inside the ghash_update
						// expansion below; NE = tail was handled)
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL			// convert digest to tag byte order
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1		// encrypt with the tag keystream

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)
617 | ||
/*
 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
 *			   struct gcm_key const *k, char *head,
 *			   char *iv, int rounds, u32 counter,
 *			   const char *otag, int authsize)
 *
 * Finish a GCM decryption: decrypt the partial tail block (hashing the
 * ciphertext, unlike the encrypt path which hashes after XOR), compute
 * the tag and compare it against otag in constant time (whole-vector
 * compare + vtbl masking + vpmin fold, no data-dependent branches).
 * Returns 0 in r0 if the first authsize bytes of the tags match.
 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]	// head, iv
	ldrd		r6, r7, [sp, #32]	// rounds, counter

	bl		pmull_aes_encrypt_final	// e0 = tail keystream, e1 = tag keystream

	cmp		r0, #0			// any tail bytes? (flags reused by bne below)
	beq		.Ldec_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0		// rewind to the tail block
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l		// align keystream with the tail bytes
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l		// hash the CIPHERTEXT tail (before XOR)
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3		// decrypt tail block
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1			// single block; mov leaves flags intact
	bne		3f			// process head block first
						// (label 3 is inside the ghash_update
						// expansion below; NE = tail was handled)
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL			// convert digest to tag byte order
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1		// encrypt with the tag keystream

	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)
686 | ||
	// Sliding permute table for vtbl: 16 bytes of 0xff (vtbl yields
	// zero for out-of-range indices), the identity permutation
	// 0x00..0x0f, then 16 more 0xff bytes. Loading 16 bytes at
	// .Lpermute + n (or .Lpermute + 32 - n) produces a vector that
	// shifts/selects exactly n valid bytes, zeroing the rest — used
	// by the *_final routines for partial tail blocks and tag masking.
	.section	".rodata", "a", %progbits
	.align		5
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff