/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

        SHASH           .req    q0
        T1              .req    q1
        XL              .req    q2
        XM              .req    q3
        XH              .req    q4
        IN1             .req    q4

        SHASH_L         .req    d0
        SHASH_H         .req    d1
        T1_L            .req    d2
        T1_H            .req    d3
        XL_L            .req    d4
        XL_H            .req    d5
        XM_L            .req    d6
        XM_H            .req    d7
        XH_L            .req    d8

        t0l             .req    d10
        t0h             .req    d11
        t1l             .req    d12
        t1h             .req    d13
        t2l             .req    d14
        t2h             .req    d15
        t3l             .req    d16
        t3h             .req    d17
        t4l             .req    d18
        t4h             .req    d19

        t0q             .req    q5
        t1q             .req    q6
        t2q             .req    q7
        t3q             .req    q8
        t4q             .req    q9
        T2              .req    q9

        s1l             .req    d20
        s1h             .req    d21
        s2l             .req    d22
        s2h             .req    d23
        s3l             .req    d24
        s3h             .req    d25
        s4l             .req    d26
        s4h             .req    d27

        MASK            .req    d28
        SHASH2_p8       .req    d28

        k16             .req    d29
        k32             .req    d30
        k48             .req    d31
        SHASH2_p64      .req    d31

        HH              .req    q10
        HH3             .req    q11
        HH4             .req    q12
        HH34            .req    q13

        HH_L            .req    d20
        HH_H            .req    d21
        HH3_L           .req    d22
        HH3_H           .req    d23
        HH4_L           .req    d24
        HH4_H           .req    d25
        HH34_L          .req    d26
        HH34_H          .req    d27
        SHASH2_H        .req    d29

        XL2             .req    q5
        XM2             .req    q6
        XH2             .req    q7
        T3              .req    q8

        XL2_L           .req    d10
        XL2_H           .req    d11
        XM2_L           .req    d12
        XM2_H           .req    d13
        T3_L            .req    d16
        T3_H            .req    d17

        .text
        .arch           armv8-a
        .fpu            crypto-neon-fp-armv8

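        //
        // 64x64 -> 128 bit carryless multiply using a single vmull.p64
        // instruction. The b1-b4 arguments are ignored here; they only
        // exist so that ghash_update can invoke this macro and __pmull_p8
        // below with the same argument list.
        //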
        .macro          __pmull_p64, rd, rn, rm, b1, b2, b3, b4
        vmull.p64       \rd, \rn, \rm
        .endm

        /*
         * This implementation of 64x64 -> 128 bit polynomial multiplication
         * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
         * "Fast Software Polynomial Multiplication on ARM Processors Using
         * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
         * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
         *
         * It has been slightly tweaked for in-order performance, and to allow
         * 'rq' to overlap with 'ad' or 'bd'.
         */
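        /*
         * Sketch of the scheme (see the P0..P7 annotations below): vmull.p8
         * computes eight byte-wise products in parallel, so the full 64x64
         * product is assembled from the straight byte-wise product D = A*B
         * plus four correction terms built from byte-rotated copies of the
         * operands, shifted into place by 8, 16, 24 and 32 bits.
         */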
        .macro          __pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
        vext.8          t0l, \ad, \ad, #1       @ A1
        .ifc            \b1, t4l
        vext.8          t4l, \bd, \bd, #1       @ B1
        .endif
        vmull.p8        t0q, t0l, \bd           @ F = A1*B
        vext.8          t1l, \ad, \ad, #2       @ A2
        vmull.p8        t4q, \ad, \b1           @ E = A*B1
        .ifc            \b2, t3l
        vext.8          t3l, \bd, \bd, #2       @ B2
        .endif
        vmull.p8        t1q, t1l, \bd           @ H = A2*B
        vext.8          t2l, \ad, \ad, #3       @ A3
        vmull.p8        t3q, \ad, \b2           @ G = A*B2
        veor            t0q, t0q, t4q           @ L = E + F
        .ifc            \b3, t4l
        vext.8          t4l, \bd, \bd, #3       @ B3
        .endif
        vmull.p8        t2q, t2l, \bd           @ J = A3*B
        veor            t0l, t0l, t0h           @ t0 = (L) (P0 + P1) << 8
        veor            t1q, t1q, t3q           @ M = G + H
        .ifc            \b4, t3l
        vext.8          t3l, \bd, \bd, #4       @ B4
        .endif
        vmull.p8        t4q, \ad, \b3           @ I = A*B3
        veor            t1l, t1l, t1h           @ t1 = (M) (P2 + P3) << 16
        vmull.p8        t3q, \ad, \b4           @ K = A*B4
        vand            t0h, t0h, k48
        vand            t1h, t1h, k32
        veor            t2q, t2q, t4q           @ N = I + J
        veor            t0l, t0l, t0h
        veor            t1l, t1l, t1h
        veor            t2l, t2l, t2h           @ t2 = (N) (P4 + P5) << 24
        vand            t2h, t2h, k16
        veor            t3l, t3l, t3h           @ t3 = (K) (P6 + P7) << 32
        vmov.i64        t3h, #0
        vext.8          t0q, t0q, t0q, #15
        veor            t2l, t2l, t2h
        vext.8          t1q, t1q, t1q, #14
        vmull.p8        \rq, \ad, \bd           @ D = A*B
        vext.8          t2q, t2q, t2q, #13
        vext.8          t3q, t3q, t3q, #12
        veor            t0q, t0q, t1q
        veor            t2q, t2q, t3q
        veor            \rq, \rq, t0q
        veor            \rq, \rq, t2q
        .endm

        //
        // PMULL (64x64->128) based reduction for CPUs that can do
        // it in a single instruction.
        //
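        // The 256-bit product is held in XH:XL with the middle Karatsuba
        // term in XM; the reduction folds it back to 128 bits modulo the
        // GHASH polynomial x^128 + x^7 + x^2 + x + 1, using MASK
        // (0xe1 << 57, set up in the p64 entry point below) for the two
        // folding multiplies.  The caller xors in the remaining terms
        // after the macro returns.
        //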
        .macro          __pmull_reduce_p64
        vmull.p64       T1, XL_L, MASK

        veor            XH_L, XH_L, XM_H
        vext.8          T1, T1, T1, #8
        veor            XL_H, XL_H, XM_L
        veor            T1, T1, XL

        vmull.p64       XL, T1_H, MASK
        .endm

        //
        // Alternative reduction for CPUs that lack support for the
        // 64x64->128 PMULL instruction
        //
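        // This performs roughly the same folding as __pmull_reduce_p64,
        // but with the multiplies by the reduction constant replaced by
        // sequences of 64-bit shifts and xors, so only plain NEON is
        // required.
        //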
        .macro          __pmull_reduce_p8
        veor            XL_H, XL_H, XM_L
        veor            XH_L, XH_L, XM_H

        vshl.i64        T1, XL, #57
        vshl.i64        T2, XL, #62
        veor            T1, T1, T2
        vshl.i64        T2, XL, #63
        veor            T1, T1, T2
        veor            XL_H, XL_H, T1_L
        veor            XH_L, XH_L, T1_H

        vshr.u64        T1, XL, #1
        veor            XH, XH, XL
        veor            XL, XL, T1
        vshr.u64        T1, T1, #6
        vshr.u64        XL, XL, #1
        .endm

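        //
        // Main GHASH update loop.  \pn selects the multiply/reduce
        // primitives: p64 uses the single-instruction vmull.p64 versions
        // (and the 4-way aggregated fast path below), while p8 falls back
        // to the vmull.p8 based versions.
        //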
        .macro          ghash_update, pn
        vld1.64         {XL}, [r1]

        /* do the head block first, if supplied */
        ldr             ip, [sp]
        teq             ip, #0
        beq             0f
        vld1.64         {T1}, [ip]
        teq             r0, #0
        b               3f

0:      .ifc            \pn, p64
        tst             r0, #3                  // skip until #blocks is a
        bne             2f                      // round multiple of 4

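        // 4-way aggregated path: each iteration consumes four blocks and
        // multiplies them by the precomputed powers of H from the key
        // (HH4, HH3, HH and SHASH), so only one reduction is needed per
        // four blocks of input.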
        vld1.8          {XL2-XM2}, [r2]!
1:      vld1.8          {T3-T2}, [r2]!
        vrev64.8        XL2, XL2
        vrev64.8        XM2, XM2

        subs            r0, r0, #4

        vext.8          T1, XL2, XL2, #8
        veor            XL2_H, XL2_H, XL_L
        veor            XL, XL, T1

        vrev64.8        T3, T3
        vrev64.8        T1, T2

        vmull.p64       XH, HH4_H, XL_H                 // a1 * b1
        veor            XL2_H, XL2_H, XL_H
        vmull.p64       XL, HH4_L, XL_L                 // a0 * b0
        vmull.p64       XM, HH34_H, XL2_H               // (a1 + a0)(b1 + b0)

        vmull.p64       XH2, HH3_H, XM2_L               // a1 * b1
        veor            XM2_L, XM2_L, XM2_H
        vmull.p64       XL2, HH3_L, XM2_H               // a0 * b0
        vmull.p64       XM2, HH34_L, XM2_L              // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, HH_H, T3_L                 // a1 * b1
        veor            T3_L, T3_L, T3_H
        vmull.p64       XL2, HH_L, T3_H                 // a0 * b0
        vmull.p64       XM2, SHASH2_H, T3_L             // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        vmull.p64       XH2, SHASH_H, T1_L              // a1 * b1
        veor            T1_L, T1_L, T1_H
        vmull.p64       XL2, SHASH_L, T1_H              // a0 * b0
        vmull.p64       XM2, SHASH2_p64, T1_L           // (a1 + a0)(b1 + b0)

        veor            XH, XH, XH2
        veor            XL, XL, XL2
        veor            XM, XM, XM2

        beq             4f

        vld1.8          {XL2-XM2}, [r2]!

        veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_p64

        veor            T1, T1, XH
        veor            XL, XL, T1

        b               1b
        .endif

2:      vld1.64         {T1}, [r2]!
        subs            r0, r0, #1

3:      /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
        vrev64.8        T1, T1
#endif
        vext.8          IN1, T1, T1, #8
        veor            T1_L, T1_L, XL_H
        veor            XL, XL, IN1

        __pmull_\pn     XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h   @ a1 * b1
        veor            T1, T1, XL
        __pmull_\pn     XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l   @ a0 * b0
        __pmull_\pn     XM, T1_L, SHASH2_\pn                    @ (a1+a0)(b1+b0)

4:      veor            T1, XL, XH
        veor            XM, XM, T1

        __pmull_reduce_\pn

        veor            T1, T1, XH
        veor            XL, XL, T1

        bne             0b

        vst1.64         {XL}, [r1]
        bx              lr
        .endm

        /*
         * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
         *                         struct ghash_key const *k, const char *head)
         */
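        /*
         * Per AAPCS: r0 = blocks, r1 = dg, r2 = src, r3 = k, and the
         * head pointer is passed on the stack.
         */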
ENTRY(pmull_ghash_update_p64)
        vld1.64         {SHASH}, [r3]!
        vld1.64         {HH}, [r3]!
        vld1.64         {HH3-HH4}, [r3]

        veor            SHASH2_p64, SHASH_L, SHASH_H
        veor            SHASH2_H, HH_L, HH_H
        veor            HH34_L, HH3_L, HH3_H
        veor            HH34_H, HH4_L, HH4_H

        vmov.i8         MASK, #0xe1
        vshl.u64        MASK, MASK, #57

        ghash_update    p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
        vld1.64         {SHASH}, [r3]
        veor            SHASH2_p8, SHASH_L, SHASH_H

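        /* precompute the byte-rotated copies of H used by __pmull_p8 */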
        vext.8          s1l, SHASH_L, SHASH_L, #1
        vext.8          s2l, SHASH_L, SHASH_L, #2
        vext.8          s3l, SHASH_L, SHASH_L, #3
        vext.8          s4l, SHASH_L, SHASH_L, #4
        vext.8          s1h, SHASH_H, SHASH_H, #1
        vext.8          s2h, SHASH_H, SHASH_H, #2
        vext.8          s3h, SHASH_H, SHASH_H, #3
        vext.8          s4h, SHASH_H, SHASH_H, #4

        vmov.i64        k16, #0xffff
        vmov.i64        k32, #0xffffffff
        vmov.i64        k48, #0xffffffffffff

        ghash_update    p8
ENDPROC(pmull_ghash_update_p8)