Commit | Line | Data |
---|---|---|
f1e866b1 | 1 | /* |
3759ee05 | 2 | * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions. |
f1e866b1 | 3 | * |
3759ee05 | 4 | * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> |
f1e866b1 AB |
5 | * |
6 | * This program is free software; you can redistribute it and/or modify it | |
7 | * under the terms of the GNU General Public License version 2 as published | |
8 | * by the Free Software Foundation. | |
9 | */ | |
10 | ||
11 | #include <linux/linkage.h> | |
12 | #include <asm/assembler.h> | |
13 | ||
// Register aliases.  Several logical names share a physical NEON
// register; overlapping aliases are never live at the same time and
// are noted below.

SHASH		.req	q0		@ hash key H (first qword of the key)
T1		.req	q1		@ temporary / current input block
XL		.req	q2		@ low half of the 256-bit product / digest
XM		.req	q3		@ middle (Karatsuba) partial product
XH		.req	q4		@ high half of the 256-bit product
IN1		.req	q4		@ overlaps XH (q4)

SHASH_L		.req	d0		@ d-register views of the low/high
SHASH_H		.req	d1		@ 64-bit halves of the q aliases above
T1_L		.req	d2
T1_H		.req	d3
XL_L		.req	d4
XL_H		.req	d5
XM_L		.req	d6
XM_H		.req	d7
XH_L		.req	d8

t0l		.req	d10		@ scratch registers for __pmull_p8
t0h		.req	d11
t1l		.req	d12
t1h		.req	d13
t2l		.req	d14
t2h		.req	d15
t3l		.req	d16
t3h		.req	d17
t4l		.req	d18
t4h		.req	d19

t0q		.req	q5		@ q-register views of the scratch
t1q		.req	q6		@ registers above
t2q		.req	q7
t3q		.req	q8
t4q		.req	q9
T2		.req	q9		@ overlaps t4q (q9)

s1l		.req	d20		@ pre-rotated copies of the key, set up
s1h		.req	d21		@ by pmull_ghash_update_p8 and consumed
s2l		.req	d22		@ by __pmull_p8 (p8 path only)
s2h		.req	d23
s3l		.req	d24
s3h		.req	d25
s4l		.req	d26
s4h		.req	d27

MASK		.req	d28		@ reduction constant (p64 path)
SHASH2_p8	.req	d28		@ overlaps MASK (d28; p8 path)

k16		.req	d29		@ bit masks used by __pmull_p8
k32		.req	d30
k48		.req	d31
SHASH2_p64	.req	d31		@ overlaps k48 (d31; p64 path)

HH		.req	q10		@ additional key values for the 4-way
HH3		.req	q11		@ p64 path; overlap s1l..s4h (q10-q13),
HH4		.req	q12		@ which are only used on the p8 path
HH34		.req	q13

HH_L		.req	d20		@ d-register views of HH/HH3/HH4/HH34
HH_H		.req	d21
HH3_L		.req	d22
HH3_H		.req	d23
HH4_L		.req	d24
HH4_H		.req	d25
HH34_L		.req	d26
HH34_H		.req	d27
SHASH2_H	.req	d29		@ overlaps k16 (d29)

XL2		.req	q5		@ second accumulator set for the 4-way
XM2		.req	q6		@ path; overlap t0q-t2q (q5-q7)
XH2		.req	q7
T3		.req	q8		@ overlaps t3q (q8)

XL2_L		.req	d10
XL2_H		.req	d11
XM2_L		.req	d12
XM2_H		.req	d13
T3_L		.req	d16
T3_H		.req	d17

	.text
	.fpu		crypto-neon-fp-armv8
3759ee05 AB |
96 | .macro __pmull_p64, rd, rn, rm, b1, b2, b3, b4 |
97 | vmull.p64 \rd, \rn, \rm | |
98 | .endm | |
99 | ||
	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 *
	 * \rq receives the 128-bit product of the 64-bit operands \ad and
	 * \bd.  \b1..\b4 may be passed as pre-rotated copies of \bd (see
	 * pmull_ghash_update_p8); when left at their defaults, the .ifc
	 * guards below compute the rotations inline instead.  Requires
	 * k16/k32/k48 to hold their mask constants.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48		@ mask off bits that belong to
	vand		t1h, t1h, k32		@ the neighbouring partial product
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15	@ shift partial products into
	veor		t2l, t2l, t2h		@ their final bit positions
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q		@ combine everything into D
	veor		\rq, \rq, t2q
	.endm
157 | ||
	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	// Folds XM into XH:XL and reduces modulo the GHASH polynomial.
	// MASK must hold the reduction constant (0xe1 shifted left by 57,
	// as set up in pmull_ghash_update_p64).  The caller completes the
	// reduction afterwards with 'veor T1, T1, XH; veor XL, XL, T1'.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H	@ fold the middle partial
	vext.8		T1, T1, T1, #8		@ product XM into XH:XL
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK		@ second folding step
	.endm
172 | ||
	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	// Same contract as __pmull_reduce_p64, but the carryless multiplies
	// by the reduction constant are open-coded as shifts and XORs, so no
	// MASK register is needed.
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L	@ fold the middle partial
	veor		XH_L, XH_L, XM_H	@ product XM into XH:XL

	vshl.i64	T1, XL, #57		@ T1 = XL times x^57 + x^62 + x^63
	vshl.i64	T2, XL, #62		@ (shift/xor carryless multiply)
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1		@ second folding step
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm
195 | ||
	//
	// Core GHASH update loop, shared by both entry points below.
	//
	//	r0   : number of 16-byte input blocks
	//	r1   : pointer to the 128-bit digest (loaded into XL, stored
	//	       back on exit)
	//	r2   : pointer to the input data
	//	[sp] : optional pointer to a single 'head' block (0 if none)
	//	\pn  : 'p64' or 'p8' - selects the multiply/reduce flavour
	//
	// The caller must have loaded SHASH and SHASH2_\pn (and, for the
	// p64 flavour, HH/HH3/HH4/HH34 and MASK) before expanding this
	// macro.  Register/argument mapping follows the C prototype
	// documented at pmull_ghash_update_p64 below.
	//
	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]		@ load running digest

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0			@ Z set if head is the only block;
	b		3f			@ consumed by 'bne 0b' below

0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	// 4-way p64 path: fold four blocks per iteration against the
	// precomputed key values HH4/HH3/HH/SHASH (presumably H^4..H^1;
	// layout per struct ghash_key - confirm against the C side).
	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2		@ byteswap blocks 0 and 1
	vrev64.8	XM2, XM2

	subs		r0, r0, #4		@ flags consumed by 'beq 4f' below

	vext.8		T1, XL2, XL2, #8	@ fold the digest into block 0
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3			@ byteswap blocks 2 and 3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H		// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L		// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L	// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H	// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2		@ accumulate block 1
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L		// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H		// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2		@ accumulate block 2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L	// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H	// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2		@ accumulate block 3
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f			@ last 4 blocks: reduce and exit

	vld1.8		{XL2-XM2}, [r2]!	@ preload the next two blocks

	veor		T1, XL, XH		@ Karatsuba fixup: XM += XL + XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH		@ complete the reduction
	veor		XL, XL, T1

	b		1b
	.endif

	// One-block-at-a-time path (all p8 traffic, and the p64 remainder
	// until the block count is a multiple of 4).
2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1		@ flags consumed by 'bne 0b' below

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1		@ fold digest into the block

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH		@ Karatsuba fixup: XM += XL + XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH		@ complete the reduction
	veor		XL, XL, T1

	bne		0b			@ more blocks remaining?

	vst1.64		{XL}, [r1]		@ store updated digest
	bx		lr
	.endm
301 | ||
	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 *
	 * p64 flavour: requires the 64x64->128 vmull.p64 instruction.
	 * Loads the key schedule from r3 (SHASH, then HH, HH3, HH4),
	 * precomputes the Karatsuba (a1 + a0) operands and the reduction
	 * constant, then runs the shared update loop.
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!		@ key schedule: SHASH first,
	vld1.64		{HH}, [r3]!		@ then the additional values
	vld1.64		{HH3-HH4}, [r3]		@ used by the 4-way path

	veor		SHASH2_p64, SHASH_L, SHASH_H	@ (a1 + a0) operands for
	veor		SHASH2_H, HH_L, HH_H		@ the Karatsuba multiplies
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1		@ GHASH reduction constant:
	vshl.u64	MASK, MASK, #57		@ 0xe1 << 57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)
321 | ||
	/*
	 * Same prototype as pmull_ghash_update_p64 above, for CPUs that
	 * only implement vmull.p8.  Only the first qword of the key (SHASH)
	 * is used; the rotated copies and mask constants consumed by
	 * __pmull_p8 are precomputed here so they are hoisted out of the
	 * per-block loop.
	 */
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H	@ (a1 + a0) operand

	vext.8		s1l, SHASH_L, SHASH_L, #1	@ pre-rotated key bytes,
	vext.8		s2l, SHASH_L, SHASH_L, #2	@ letting __pmull_p8 skip
	vext.8		s3l, SHASH_L, SHASH_L, #3	@ its B1..B4 vext steps
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff		@ masks used by __pmull_p8 to
	vmov.i64	k32, #0xffffffff	@ discard bits belonging to the
	vmov.i64	k48, #0xffffffffffff	@ neighbouring partial product

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)