arch/arm/crypto/ghash-ce-core.S

/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	XH2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text
	.fpu		crypto-neon-fp-armv8

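	//
	// On CPUs that implement the v8 Crypto Extensions, vmull.p64
	// performs the whole 64x64 -> 128 bit polynomial multiply in a
	// single instruction; the unused b1..b4 arguments only exist so
	// that both multiply variants can be invoked with the same
	// signature from the ghash_update macro below.
	//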
	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

f1e866b1 100 /*
3759ee05
AB
101 * This implementation of 64x64 -> 128 bit polynomial multiplication
102 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
103 * "Fast Software Polynomial Multiplication on ARM Processors Using
104 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
105 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
106 *
107 * It has been slightly tweaked for in-order performance, and to allow
108 * 'rq' to overlap with 'ad' or 'bd'.
f1e866b1 109 */
3759ee05
AB
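	//
	// For reference, a minimal portable C model of the operation this
	// macro computes (an illustrative sketch, not kernel code):
	// multiply two 64-bit polynomials over GF(2) into a 128-bit
	// product.
	//
	//	static void pmull64(uint64_t a, uint64_t b,
	//			    uint64_t *lo, uint64_t *hi)
	//	{
	//		uint64_t l = 0, h = 0;
	//
	//		for (int i = 0; i < 64; i++) {
	//			if ((b >> i) & 1) {
	//				l ^= a << i;		    /* low half */
	//				h ^= i ? a >> (64 - i) : 0; /* bits carried
	//							       into high half */
	//			}
	//		}
	//		*lo = l;
	//		*hi = h;
	//	}
	//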
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

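	//
	// When the second operand is the hash key, the rotated B1..B4
	// values are loop invariant, so the caller can precompute them
	// once (s1l..s4h in pmull_ghash_update_p8 below) and pass them in
	// as b1..b4, skipping the .ifc-guarded vext.8 instructions above.
	//
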
	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

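	//
	// GHASH multiplies in GF(2^128) modulo g(x) = x^128 + x^7 + x^2 +
	// x + 1, with the bits of each block in GCM's reflected order.
	// MASK holds the reflected reduction constant (0xe1 << 57), and
	// the two vmull.p64 instructions above fold the 256-bit product
	// back into 128 bits, one 64-bit half at a time.
	//
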
	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

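	//
	// This variant replaces the multiplications by the reduction
	// constant with plain shifts and XORs, exploiting the fact that
	// 0xe1 << 57 has only three bits set (bits 57, 62 and 63).
	//
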
	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0			// NEON ops leave the flags intact,
	b		3f			// so 'bne 0b' below still sees this

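	//
	// 4-way aggregation: with digest X and blocks C0..C3, the four
	// chained per-block multiplications by H
	//
	//	((((X + C0).H + C1).H + C2).H + C3).H
	//
	// distribute (over GF(2), + is XOR) into
	//
	//	(X + C0).H^4 + C1.H^3 + C2.H^2 + C3.H
	//
	// so the loop below needs the precomputed powers of H but only a
	// single reduction per four blocks.
	//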
0:	.ifc		\pn, p64
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T3-T2}, [r2]!
	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T3, T3
	vrev64.8	T1, T2

	vmull.p64	XH, HH4_H, XL_H		// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L		// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L	// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H	// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L		// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H		// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L	// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H	// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

2:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

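	// Karatsuba fixup: the middle coefficient is (a1 + a0)(b1 + b0) +
	// a1 * b1 + a0 * b0, and over GF(2) those additions are XORs, so
	// the veor pair below folds XL and XH back into XM.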
4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
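	//
	// For illustration only: C glue code is expected to call these
	// routines from within a NEON context, along the lines of the
	// sketch below. This is an assumption, not the actual glue
	// source; only kernel_neon_begin/end and the prototype above are
	// taken as given.
	//
	//	kernel_neon_begin();
	//	pmull_ghash_update_p64(blocks, dg, src, key, head);
	//	kernel_neon_end();
	//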
ENTRY(pmull_ghash_update_p64)
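	// r3 points to the key schedule: H, then H^2, then H^3 and H^4;
	// the veor instructions below derive the folded (hi + lo) halves
	// used for the Karatsuba middle products.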
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

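	// precompute the rotated copies of the key bytes (B1..B4 in
	// __pmull_p8) once here, instead of once per block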
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

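	// bit masks used by __pmull_p8 to truncate the high halves of the
	// combined partial products before they are shifted into place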
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)