@ arch/arm/crypto/poly1305-core.S_shipped
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define poly1305_init   poly1305_init_arm
# define poly1305_blocks poly1305_blocks_arm
# define poly1305_emit   poly1305_emit_arm
.globl  poly1305_blocks_neon
#endif

#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.text

.globl  poly1305_emit
.globl  poly1305_blocks
.globl  poly1305_init
.type   poly1305_init,%function
.align  5
poly1305_init:
.Lpoly1305_init:
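        @ Calling convention (a sketch, inferred from the kernel glue code and
        @ the OpenSSL wrapper; not spelled out in this file):
        @   r0 = poly1305 state/context
        @   r1 = 32-byte key, or NULL to merely zero the hash value
        @   r2 = two-entry function table, filled with the blocks/emit entry
        @        points in non-kernel builds when NEON is available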
        stmdb   sp!,{r4-r11}

        eor     r3,r3,r3
        cmp     r1,#0
        str     r3,[r0,#0]              @ zero hash value
        str     r3,[r0,#4]
        str     r3,[r0,#8]
        str     r3,[r0,#12]
        str     r3,[r0,#16]
        str     r3,[r0,#36]             @ clear is_base2_26
        add     r0,r0,#20

#ifdef  __thumb2__
        it      eq
#endif
        moveq   r0,#0
        beq     .Lno_key

#if     __ARM_MAX_ARCH__>=7
        mov     r3,#-1
        str     r3,[r0,#28]             @ impossible key power value
# ifndef __KERNEL__
        adr     r11,.Lpoly1305_init
        ldr     r12,.LOPENSSL_armcap
# endif
#endif
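        @ Load the 16-byte "r" half of the key and clamp it as the Poly1305
        @ spec requires: the four little-endian words are masked with
        @ 0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc respectively.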
        ldrb    r4,[r1,#0]
        mov     r10,#0x0fffffff
        ldrb    r5,[r1,#1]
        and     r3,r10,#-4              @ 0x0ffffffc
        ldrb    r6,[r1,#2]
        ldrb    r7,[r1,#3]
        orr     r4,r4,r5,lsl#8
        ldrb    r5,[r1,#4]
        orr     r4,r4,r6,lsl#16
        ldrb    r6,[r1,#5]
        orr     r4,r4,r7,lsl#24
        ldrb    r7,[r1,#6]
        and     r4,r4,r10

#if     __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if !defined(_WIN32)
        ldr     r12,[r11,r12]           @ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
        ldr     r12,[r12]
# endif
#endif
        ldrb    r8,[r1,#7]
        orr     r5,r5,r6,lsl#8
        ldrb    r6,[r1,#8]
        orr     r5,r5,r7,lsl#16
        ldrb    r7,[r1,#9]
        orr     r5,r5,r8,lsl#24
        ldrb    r8,[r1,#10]
        and     r5,r5,r3

#if     __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        tst     r12,#ARMV7_NEON         @ check for NEON
# ifdef __thumb2__
        adr     r9,.Lpoly1305_blocks_neon
        adr     r11,.Lpoly1305_blocks
        it      ne
        movne   r11,r9
        adr     r12,.Lpoly1305_emit
        orr     r11,r11,#1              @ thumb-ify addresses
        orr     r12,r12,#1
# else
        add     r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
        ite     eq
        addeq   r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
        addne   r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
        ldrb    r9,[r1,#11]
        orr     r6,r6,r7,lsl#8
        ldrb    r7,[r1,#12]
        orr     r6,r6,r8,lsl#16
        ldrb    r8,[r1,#13]
        orr     r6,r6,r9,lsl#24
        ldrb    r9,[r1,#14]
        and     r6,r6,r3

        ldrb    r10,[r1,#15]
        orr     r7,r7,r8,lsl#8
        str     r4,[r0,#0]
        orr     r7,r7,r9,lsl#16
        str     r5,[r0,#4]
        orr     r7,r7,r10,lsl#24
        str     r6,[r0,#8]
        and     r7,r7,r3
        str     r7,[r0,#12]
#if     __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        stmia   r2,{r11,r12}            @ fill functions table
        mov     r0,#1
#else
        mov     r0,#0
#endif
.Lno_key:
        ldmia   sp!,{r4-r11}
#if     __ARM_ARCH__>=5
        bx      lr                              @ bx    lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_init,.-poly1305_init
.type   poly1305_blocks,%function
.align  5
poly1305_blocks:
.Lpoly1305_blocks:
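        @ Assumed arguments, matching the kernel glue prototype
        @ poly1305_blocks_arm(state, inp, len, hibit):
        @   r0 = state, r1 = input, r2 = length in bytes (only whole 16-byte
        @   blocks are processed), r3 = padbit (1 for full blocks, 0 when the
        @   caller has already padded a final partial block)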
        stmdb   sp!,{r3-r11,lr}

        ands    r2,r2,#-16
        beq     .Lno_data

        add     r2,r2,r1                @ end pointer
        sub     sp,sp,#32

#if __ARM_ARCH__<7
        ldmia   r0,{r4-r12}             @ load context
        add     r0,r0,#20
        str     r2,[sp,#16]             @ offload stuff
        str     r0,[sp,#12]
#else
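        @ If the state was last written by the NEON code it is kept in base
        @ 2^26; compute the base 2^32 form as well, let the is_base2_26 flag
        @ select whichever representation is current, and clear the flag
        @ (the scalar code below stores the result in base 2^32).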
        ldr     lr,[r0,#36]             @ is_base2_26
        ldmia   r0!,{r4-r8}             @ load hash value
        str     r2,[sp,#16]             @ offload stuff
        str     r0,[sp,#12]

        adds    r9,r4,r5,lsl#26 @ base 2^26 -> base 2^32
        mov     r10,r5,lsr#6
        adcs    r10,r10,r6,lsl#20
        mov     r11,r6,lsr#12
        adcs    r11,r11,r7,lsl#14
        mov     r12,r7,lsr#18
        adcs    r12,r12,r8,lsl#8
        mov     r2,#0
        teq     lr,#0
        str     r2,[r0,#16]             @ clear is_base2_26
        adc     r2,r2,r8,lsr#24

        itttt   ne
        movne   r4,r9                   @ choose between radixes
        movne   r5,r10
        movne   r6,r11
        movne   r7,r12
        ldmia   r0,{r9-r12}             @ load key
        it      ne
        movne   r8,r2
#endif

        mov     lr,r1
        cmp     r3,#0
        str     r10,[sp,#20]
        str     r11,[sp,#24]
        str     r12,[sp,#28]
        b       .Loop

.align  4
.Loop:
#if __ARM_ARCH__<7
        ldrb    r0,[lr],#16             @ load input
# ifdef __thumb2__
        it      hi
# endif
        addhi   r8,r8,#1                @ 1<<128
        ldrb    r1,[lr,#-15]
        ldrb    r2,[lr,#-14]
        ldrb    r3,[lr,#-13]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-12]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-11]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-10]
        adds    r4,r4,r3                @ accumulate input

        ldrb    r3,[lr,#-9]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-8]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-7]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-6]
        adcs    r5,r5,r3

        ldrb    r3,[lr,#-5]
        orr     r1,r0,r1,lsl#8
        ldrb    r0,[lr,#-4]
        orr     r2,r1,r2,lsl#16
        ldrb    r1,[lr,#-3]
        orr     r3,r2,r3,lsl#24
        ldrb    r2,[lr,#-2]
        adcs    r6,r6,r3

        ldrb    r3,[lr,#-1]
        orr     r1,r0,r1,lsl#8
        str     lr,[sp,#8]              @ offload input pointer
        orr     r2,r1,r2,lsl#16
        add     r10,r10,r10,lsr#2
        orr     r3,r2,r3,lsl#24
#else
        ldr     r0,[lr],#16             @ load input
        it      hi
        addhi   r8,r8,#1                @ padbit
        ldr     r1,[lr,#-12]
        ldr     r2,[lr,#-8]
        ldr     r3,[lr,#-4]
# ifdef __ARMEB__
        rev     r0,r0
        rev     r1,r1
        rev     r2,r2
        rev     r3,r3
# endif
        adds    r4,r4,r0                @ accumulate input
        str     lr,[sp,#8]              @ offload input pointer
        adcs    r5,r5,r1
        add     r10,r10,r10,lsr#2
        adcs    r6,r6,r2
#endif
        add     r11,r11,r11,lsr#2
        adcs    r7,r7,r3
        add     r12,r12,r12,lsr#2

        umull   r2,r3,r5,r9
         adc    r8,r8,#0
        umull   r0,r1,r4,r9
        umlal   r2,r3,r8,r10
        umlal   r0,r1,r7,r10
        ldr     r10,[sp,#20]            @ reload r10
        umlal   r2,r3,r6,r12
        umlal   r0,r1,r5,r12
        umlal   r2,r3,r7,r11
        umlal   r0,r1,r6,r11
        umlal   r2,r3,r4,r10
        str     r0,[sp,#0]              @ future r4
         mul    r0,r11,r8
        ldr     r11,[sp,#24]            @ reload r11
        adds    r2,r2,r1                @ d1+=d0>>32
         eor    r1,r1,r1
        adc     lr,r3,#0                @ future r6
        str     r2,[sp,#4]              @ future r5

        mul     r2,r12,r8
        eor     r3,r3,r3
        umlal   r0,r1,r7,r12
        ldr     r12,[sp,#28]            @ reload r12
        umlal   r2,r3,r7,r9
        umlal   r0,r1,r6,r9
        umlal   r2,r3,r6,r10
        umlal   r0,r1,r5,r10
        umlal   r2,r3,r5,r11
        umlal   r0,r1,r4,r11
        umlal   r2,r3,r4,r12
        ldr     r4,[sp,#0]
        mul     r8,r9,r8
        ldr     r5,[sp,#4]

        adds    r6,lr,r0                @ d2+=d1>>32
        ldr     lr,[sp,#8]              @ reload input pointer
        adc     r1,r1,#0
        adds    r7,r2,r1                @ d3+=d2>>32
        ldr     r0,[sp,#16]             @ reload end pointer
        adc     r3,r3,#0
        add     r8,r8,r3                @ h4+=d3>>32

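        @ Partial reduction: p = 2^130-5, so the bits of h at 2^130 and above
        @ (i.e. h4>>2) fold back into the low end multiplied by 5.  Below,
        @ r1 = h4&~3 = 4*(h4>>2), and r1+(r1>>2) = 5*(h4>>2).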
        and     r1,r8,#-4
        and     r8,r8,#3
        add     r1,r1,r1,lsr#2          @ *=5
        adds    r4,r4,r1
        adcs    r5,r5,#0
        adcs    r6,r6,#0
        adcs    r7,r7,#0
        adc     r8,r8,#0

        cmp     r0,lr                   @ done yet?
        bhi     .Loop

        ldr     r0,[sp,#12]
        add     sp,sp,#32
        stmdb   r0,{r4-r8}              @ store the result

.Lno_data:
#if     __ARM_ARCH__>=5
        ldmia   sp!,{r3-r11,pc}
#else
        ldmia   sp!,{r3-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_blocks,.-poly1305_blocks
.type   poly1305_emit,%function
.align  5
poly1305_emit:
.Lpoly1305_emit:
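        @ Assumed arguments (kernel glue: poly1305_emit_arm(state, mac, nonce)):
        @   r0 = state, r1 = 16-byte output tag, r2 = 16-byte nonce that is
        @   added to h mod p to form the tag.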
        stmdb   sp!,{r4-r11}

        ldmia   r0,{r3-r7}

#if __ARM_ARCH__>=7
        ldr     ip,[r0,#36]             @ is_base2_26

        adds    r8,r3,r4,lsl#26 @ base 2^26 -> base 2^32
        mov     r9,r4,lsr#6
        adcs    r9,r9,r5,lsl#20
        mov     r10,r5,lsr#12
        adcs    r10,r10,r6,lsl#14
        mov     r11,r6,lsr#18
        adcs    r11,r11,r7,lsl#8
        mov     r0,#0
        adc     r0,r0,r7,lsr#24

        tst     ip,ip
        itttt   ne
        movne   r3,r8
        movne   r4,r9
        movne   r5,r10
        movne   r6,r11
        it      ne
        movne   r7,r0
#endif

        adds    r8,r3,#5                @ compare to modulus
        adcs    r9,r4,#0
        adcs    r10,r5,#0
        adcs    r11,r6,#0
        adc     r0,r7,#0
        tst     r0,#4                   @ did it carry/borrow?

#ifdef  __thumb2__
        it      ne
#endif
        movne   r3,r8
        ldr     r8,[r2,#0]
#ifdef  __thumb2__
        it      ne
#endif
        movne   r4,r9
        ldr     r9,[r2,#4]
#ifdef  __thumb2__
        it      ne
#endif
        movne   r5,r10
        ldr     r10,[r2,#8]
#ifdef  __thumb2__
        it      ne
#endif
        movne   r6,r11
        ldr     r11,[r2,#12]

        adds    r3,r3,r8
        adcs    r4,r4,r9
        adcs    r5,r5,r10
        adc     r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
        rev     r3,r3
        rev     r4,r4
        rev     r5,r5
        rev     r6,r6
# endif
        str     r3,[r1,#0]
        str     r4,[r1,#4]
        str     r5,[r1,#8]
        str     r6,[r1,#12]
#else
        strb    r3,[r1,#0]
        mov     r3,r3,lsr#8
        strb    r4,[r1,#4]
        mov     r4,r4,lsr#8
        strb    r5,[r1,#8]
        mov     r5,r5,lsr#8
        strb    r6,[r1,#12]
        mov     r6,r6,lsr#8

        strb    r3,[r1,#1]
        mov     r3,r3,lsr#8
        strb    r4,[r1,#5]
        mov     r4,r4,lsr#8
        strb    r5,[r1,#9]
        mov     r5,r5,lsr#8
        strb    r6,[r1,#13]
        mov     r6,r6,lsr#8

        strb    r3,[r1,#2]
        mov     r3,r3,lsr#8
        strb    r4,[r1,#6]
        mov     r4,r4,lsr#8
        strb    r5,[r1,#10]
        mov     r5,r5,lsr#8
        strb    r6,[r1,#14]
        mov     r6,r6,lsr#8

        strb    r3,[r1,#3]
        strb    r4,[r1,#7]
        strb    r5,[r1,#11]
        strb    r6,[r1,#15]
#endif
        ldmia   sp!,{r4-r11}
#if     __ARM_ARCH__>=5
        bx      lr                              @ bx    lr
#else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
#endif
.size   poly1305_emit,.-poly1305_emit
#if     __ARM_MAX_ARCH__>=7
.fpu    neon

.type   poly1305_init_neon,%function
.align  5
poly1305_init_neon:
.Lpoly1305_init_neon:
        ldr     r3,[r0,#48]             @ first table element
        cmp     r3,#-1                  @ is value impossible?
        bne     .Lno_init_neon

        ldr     r4,[r0,#20]             @ load key base 2^32
        ldr     r5,[r0,#24]
        ldr     r6,[r0,#28]
        ldr     r7,[r0,#32]

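        @ The NEON code works in base 2^26: the 130-bit values are split into
        @ five 26-bit limbs so that the 32x32->64-bit vmull/vmlal products and
        @ their sums fit comfortably in 64-bit lanes.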
        and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
        mov     r3,r4,lsr#26
        mov     r4,r5,lsr#20
        orr     r3,r3,r5,lsl#6
        mov     r5,r6,lsr#14
        orr     r4,r4,r6,lsl#12
        mov     r6,r7,lsr#8
        orr     r5,r5,r7,lsl#18
        and     r3,r3,#0x03ffffff
        and     r4,r4,#0x03ffffff
        and     r5,r5,#0x03ffffff

        vdup.32 d0,r2                   @ r^1 in both lanes
        add     r2,r3,r3,lsl#2          @ *5
        vdup.32 d1,r3
        add     r3,r4,r4,lsl#2
        vdup.32 d2,r2
        vdup.32 d3,r4
        add     r4,r5,r5,lsl#2
        vdup.32 d4,r3
        vdup.32 d5,r5
        add     r5,r6,r6,lsl#2
        vdup.32 d6,r4
        vdup.32 d7,r6
        vdup.32 d8,r5

        mov     r5,#2           @ counter

.Lsquare_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
        @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
        @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
        @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
        @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

        vmull.u32       q5,d0,d0[1]
        vmull.u32       q6,d1,d0[1]
        vmull.u32       q7,d3,d0[1]
        vmull.u32       q8,d5,d0[1]
        vmull.u32       q9,d7,d0[1]

        vmlal.u32       q5,d7,d2[1]
        vmlal.u32       q6,d0,d1[1]
        vmlal.u32       q7,d1,d1[1]
        vmlal.u32       q8,d3,d1[1]
        vmlal.u32       q9,d5,d1[1]

        vmlal.u32       q5,d5,d4[1]
        vmlal.u32       q6,d7,d4[1]
        vmlal.u32       q8,d1,d3[1]
        vmlal.u32       q7,d0,d3[1]
        vmlal.u32       q9,d3,d3[1]

        vmlal.u32       q5,d3,d6[1]
        vmlal.u32       q8,d0,d5[1]
        vmlal.u32       q6,d5,d6[1]
        vmlal.u32       q7,d7,d6[1]
        vmlal.u32       q9,d1,d5[1]

        vmlal.u32       q8,d7,d8[1]
        vmlal.u32       q5,d1,d8[1]
        vmlal.u32       q6,d3,d8[1]
        vmlal.u32       q7,d5,d8[1]
        vmlal.u32       q9,d0,d7[1]

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
        @ and P. Schwabe
        @
        @ H0>>+H1>>+H2>>+H3>>+H4
        @ H3>>+H4>>*5+H0>>+H1
        @
        @ Trivia.
        @
        @ The product of an n-bit number and an m-bit number is n+m bits
        @ wide. However! Even though 2^n is an n+1-bit number, an m-bit
        @ number multiplied by 2^n is still n+m bits wide.
        @
        @ The sum of two n-bit numbers is n+1 bits wide, a sum of three is
        @ n+2 bits, and so is a sum of four. The sum of 2^m (n-m)-bit
        @ numbers and one n-bit number is n+1 bits wide.
        @
        @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
        @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
        @ can be 27. However! In cases when their width exceeds 26 bits
        @ they are still bounded by 2^26+2^6. This in turn means that the
        @ *sum* of the products with these values can still be viewed as
        @ a sum of 52-bit numbers as long as the number of addends is not
        @ a power of 2. For example,
        @
        @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
        @
        @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
        @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
        @ 8 * (2^52) or 2^55. However, the value is then multiplied by 5,
        @ so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), which
        @ is less than 32 * (2^52) or 2^57. And when processing data we
        @ are looking at three times as many addends...
        @
        @ In the key setup procedure the pre-reduced H0 is limited to
        @ 5*4+1 52-bit addends, and 5*H4 to 5*5 of them, or 57 bits. But
        @ when hashing input, H0 is limited to (5*4+1)*3 addends, or 58
        @ bits, while 5*H4 to 5*5*3, or 59[!] bits. How is this relevant?
        @ The vmlal.u32 instruction accepts 2x32-bit input and writes a
        @ 2x64-bit result. This means that the result of the reduction
        @ has to be compressed upon loop wrap-around. This can be done in
        @ the process of reduction to minimize the number of instructions
        @ [as well as the number of 128-bit instructions, which benefits
        @ low-end processors], but one has to watch for H2 (which is
        @ narrower than H0) and 5*H4 not being wider than 58 bits, so that
        @ the result of the right shift by 26 bits fits in 32 bits. This
        @ is also useful on x86, because it allows paddd to be used in
        @ place of paddq, which benefits Atom, where paddq is ridiculously
        @ slow.

        vshr.u64        q15,q8,#26
        vmovn.i64       d16,q8
         vshr.u64       q4,q5,#26
         vmovn.i64      d10,q5
        vadd.i64        q9,q9,q15               @ h3 -> h4
        vbic.i32        d16,#0xfc000000 @ &=0x03ffffff
         vadd.i64       q6,q6,q4                @ h0 -> h1
         vbic.i32       d10,#0xfc000000

        vshrn.u64       d30,q9,#26
        vmovn.i64       d18,q9
         vshr.u64       q4,q6,#26
         vmovn.i64      d12,q6
         vadd.i64       q7,q7,q4                @ h1 -> h2
        vbic.i32        d18,#0xfc000000
         vbic.i32       d12,#0xfc000000

        vadd.i32        d10,d10,d30
        vshl.u32        d30,d30,#2
         vshrn.u64      d8,q7,#26
         vmovn.i64      d14,q7
        vadd.i32        d10,d10,d30     @ h4 -> h0
         vadd.i32       d16,d16,d8      @ h2 -> h3
         vbic.i32       d14,#0xfc000000

        vshr.u32        d30,d10,#26
        vbic.i32        d10,#0xfc000000
         vshr.u32       d8,d16,#26
         vbic.i32       d16,#0xfc000000
        vadd.i32        d12,d12,d30     @ h0 -> h1
         vadd.i32       d18,d18,d8      @ h3 -> h4

        subs            r5,r5,#1
        beq             .Lsquare_break_neon

        add             r6,r0,#(48+0*9*4)
        add             r7,r0,#(48+1*9*4)

        vtrn.32         d0,d10          @ r^2:r^1
        vtrn.32         d3,d14
        vtrn.32         d5,d16
        vtrn.32         d1,d12
        vtrn.32         d7,d18

        vshl.u32        d4,d3,#2                @ *5
        vshl.u32        d6,d5,#2
        vshl.u32        d2,d1,#2
        vshl.u32        d8,d7,#2
        vadd.i32        d4,d4,d3
        vadd.i32        d2,d2,d1
        vadd.i32        d6,d6,d5
        vadd.i32        d8,d8,d7

        vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r6]!
        vst4.32         {d0[1],d1[1],d2[1],d3[1]},[r7]!
        vst4.32         {d4[0],d5[0],d6[0],d7[0]},[r6]!
        vst4.32         {d4[1],d5[1],d6[1],d7[1]},[r7]!
        vst1.32         {d8[0]},[r6,:32]
        vst1.32         {d8[1]},[r7,:32]

        b               .Lsquare_neon

.align  4
.Lsquare_break_neon:
        add             r6,r0,#(48+2*4*9)
        add             r7,r0,#(48+3*4*9)

        vmov            d0,d10          @ r^4:r^3
        vshl.u32        d2,d12,#2               @ *5
        vmov            d1,d12
        vshl.u32        d4,d14,#2
        vmov            d3,d14
        vshl.u32        d6,d16,#2
        vmov            d5,d16
        vshl.u32        d8,d18,#2
        vmov            d7,d18
        vadd.i32        d2,d2,d12
        vadd.i32        d4,d4,d14
        vadd.i32        d6,d6,d16
        vadd.i32        d8,d8,d18

        vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r6]!
        vst4.32         {d0[1],d1[1],d2[1],d3[1]},[r7]!
        vst4.32         {d4[0],d5[0],d6[0],d7[0]},[r6]!
        vst4.32         {d4[1],d5[1],d6[1],d7[1]},[r7]!
        vst1.32         {d8[0]},[r6]
        vst1.32         {d8[1]},[r7]

.Lno_init_neon:
        bx      lr                              @ bx    lr
.size   poly1305_init_neon,.-poly1305_init_neon

.type   poly1305_blocks_neon,%function
.align  5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
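        @ Same arguments as poly1305_blocks.  Inputs shorter than 64 bytes are
        @ simply handed to the scalar poly1305_blocks above; otherwise, on
        @ first use the hash is converted to base 2^26 and is_base2_26 is set.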
        ldr     ip,[r0,#36]             @ is_base2_26

        cmp     r2,#64
        blo     .Lpoly1305_blocks

        stmdb   sp!,{r4-r7}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so

        tst     ip,ip                   @ is_base2_26?
        bne     .Lbase2_26_neon

        stmdb   sp!,{r1-r3,lr}
        bl      .Lpoly1305_init_neon

        ldr     r4,[r0,#0]              @ load hash value base 2^32
        ldr     r5,[r0,#4]
        ldr     r6,[r0,#8]
        ldr     r7,[r0,#12]
        ldr     ip,[r0,#16]

        and     r2,r4,#0x03ffffff       @ base 2^32 -> base 2^26
        mov     r3,r4,lsr#26
         veor   d10,d10,d10
        mov     r4,r5,lsr#20
        orr     r3,r3,r5,lsl#6
         veor   d12,d12,d12
        mov     r5,r6,lsr#14
        orr     r4,r4,r6,lsl#12
         veor   d14,d14,d14
        mov     r6,r7,lsr#8
        orr     r5,r5,r7,lsl#18
         veor   d16,d16,d16
        and     r3,r3,#0x03ffffff
        orr     r6,r6,ip,lsl#24
         veor   d18,d18,d18
        and     r4,r4,#0x03ffffff
        mov     r1,#1
        and     r5,r5,#0x03ffffff
        str     r1,[r0,#36]             @ set is_base2_26

        vmov.32 d10[0],r2
        vmov.32 d12[0],r3
        vmov.32 d14[0],r4
        vmov.32 d16[0],r5
        vmov.32 d18[0],r6
        adr     r5,.Lzeros

        ldmia   sp!,{r1-r3,lr}
        b       .Lhash_loaded

.align  4
.Lbase2_26_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ load hash value

        veor            d10,d10,d10
        veor            d12,d12,d12
        veor            d14,d14,d14
        veor            d16,d16,d16
        veor            d18,d18,d18
        vld4.32         {d10[0],d12[0],d14[0],d16[0]},[r0]!
        adr             r5,.Lzeros
        vld1.32         {d18[0]},[r0]
        sub             r0,r0,#16               @ rewind

.Lhash_loaded:
        add             r4,r1,#32
        mov             r3,r3,lsl#24
        tst             r2,#31
        beq             .Leven

        vld4.32         {d20[0],d22[0],d24[0],d26[0]},[r1]!
        vmov.32         d28[0],r3
        sub             r2,r2,#16
        add             r4,r1,#32

# ifdef __ARMEB__
        vrev32.8        q10,q10
        vrev32.8        q13,q13
        vrev32.8        q11,q11
        vrev32.8        q12,q12
# endif
        vsri.u32        d28,d26,#8      @ base 2^32 -> base 2^26
        vshl.u32        d26,d26,#18

        vsri.u32        d26,d24,#14
        vshl.u32        d24,d24,#12
        vadd.i32        d29,d28,d18     @ add hash value and move to #hi

        vbic.i32        d26,#0xfc000000
        vsri.u32        d24,d22,#20
        vshl.u32        d22,d22,#6

        vbic.i32        d24,#0xfc000000
        vsri.u32        d22,d20,#26
        vadd.i32        d27,d26,d16

        vbic.i32        d20,#0xfc000000
        vbic.i32        d22,#0xfc000000
        vadd.i32        d25,d24,d14

        vadd.i32        d21,d20,d10
        vadd.i32        d23,d22,d12

        mov             r7,r5
        add             r6,r0,#48

        cmp             r2,r2
        b               .Long_tail

.align  4
.Leven:
        subs            r2,r2,#64
        it              lo
        movlo           r4,r5

        vmov.i32        q14,#1<<24              @ padbit, yes, always
        vld4.32         {d20,d22,d24,d26},[r1]  @ inp[0:1]
        add             r1,r1,#64
        vld4.32         {d21,d23,d25,d27},[r4]  @ inp[2:3] (or 0)
        add             r4,r4,#64
        itt             hi
        addhi           r7,r0,#(48+1*9*4)
        addhi           r6,r0,#(48+3*9*4)

# ifdef __ARMEB__
        vrev32.8        q10,q10
        vrev32.8        q13,q13
        vrev32.8        q11,q11
        vrev32.8        q12,q12
# endif
        vsri.u32        q14,q13,#8              @ base 2^32 -> base 2^26
        vshl.u32        q13,q13,#18

        vsri.u32        q13,q12,#14
        vshl.u32        q12,q12,#12

        vbic.i32        q13,#0xfc000000
        vsri.u32        q12,q11,#20
        vshl.u32        q11,q11,#6

        vbic.i32        q12,#0xfc000000
        vsri.u32        q11,q10,#26

        vbic.i32        q10,#0xfc000000
        vbic.i32        q11,#0xfc000000

        bls             .Lskip_loop

        vld4.32         {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
        vld4.32         {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
        vld4.32         {d4[1],d5[1],d6[1],d7[1]},[r7]!
        vld4.32         {d4[0],d5[0],d6[0],d7[0]},[r6]!
        b               .Loop_neon

.align  5
.Loop_neon:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
        @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
        @   ___________________/
        @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
        @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
        @   ___________________/ ____________________/
        @
        @ Note that we start with inp[2:3]*r^2. This is because it
        @ doesn't depend on reduction in previous iteration.
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
        @ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
        @ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
        @ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
        @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ inp[2:3]*r^2

        vadd.i32        d24,d24,d14     @ accumulate inp[0:1]
        vmull.u32       q7,d25,d0[1]
        vadd.i32        d20,d20,d10
        vmull.u32       q5,d21,d0[1]
        vadd.i32        d26,d26,d16
        vmull.u32       q8,d27,d0[1]
        vmlal.u32       q7,d23,d1[1]
        vadd.i32        d22,d22,d12
        vmull.u32       q6,d23,d0[1]

        vadd.i32        d28,d28,d18
        vmull.u32       q9,d29,d0[1]
        subs            r2,r2,#64
        vmlal.u32       q5,d29,d2[1]
        it              lo
        movlo           r4,r5
        vmlal.u32       q8,d25,d1[1]
        vld1.32         d8[1],[r7,:32]
        vmlal.u32       q6,d21,d1[1]
        vmlal.u32       q9,d27,d1[1]

        vmlal.u32       q5,d27,d4[1]
        vmlal.u32       q8,d23,d3[1]
        vmlal.u32       q9,d25,d3[1]
        vmlal.u32       q6,d29,d4[1]
        vmlal.u32       q7,d21,d3[1]

        vmlal.u32       q8,d21,d5[1]
        vmlal.u32       q5,d25,d6[1]
        vmlal.u32       q9,d23,d5[1]
        vmlal.u32       q6,d27,d6[1]
        vmlal.u32       q7,d29,d6[1]

        vmlal.u32       q8,d29,d8[1]
        vmlal.u32       q5,d23,d8[1]
        vmlal.u32       q9,d21,d7[1]
        vmlal.u32       q6,d25,d8[1]
        vmlal.u32       q7,d27,d8[1]

        vld4.32         {d21,d23,d25,d27},[r4]  @ inp[2:3] (or 0)
        add             r4,r4,#64

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ (hash+inp[0:1])*r^4 and accumulate

        vmlal.u32       q8,d26,d0[0]
        vmlal.u32       q5,d20,d0[0]
        vmlal.u32       q9,d28,d0[0]
        vmlal.u32       q6,d22,d0[0]
        vmlal.u32       q7,d24,d0[0]
        vld1.32         d8[0],[r6,:32]

        vmlal.u32       q8,d24,d1[0]
        vmlal.u32       q5,d28,d2[0]
        vmlal.u32       q9,d26,d1[0]
        vmlal.u32       q6,d20,d1[0]
        vmlal.u32       q7,d22,d1[0]

        vmlal.u32       q8,d22,d3[0]
        vmlal.u32       q5,d26,d4[0]
        vmlal.u32       q9,d24,d3[0]
        vmlal.u32       q6,d28,d4[0]
        vmlal.u32       q7,d20,d3[0]

        vmlal.u32       q8,d20,d5[0]
        vmlal.u32       q5,d24,d6[0]
        vmlal.u32       q9,d22,d5[0]
        vmlal.u32       q6,d26,d6[0]
        vmlal.u32       q8,d28,d8[0]

        vmlal.u32       q7,d28,d6[0]
        vmlal.u32       q5,d22,d8[0]
        vmlal.u32       q9,d20,d7[0]
        vmov.i32        q14,#1<<24              @ padbit, yes, always
        vmlal.u32       q6,d24,d8[0]
        vmlal.u32       q7,d26,d8[0]

        vld4.32         {d20,d22,d24,d26},[r1]  @ inp[0:1]
        add             r1,r1,#64
# ifdef __ARMEB__
        vrev32.8        q10,q10
        vrev32.8        q11,q11
        vrev32.8        q12,q12
        vrev32.8        q13,q13
# endif

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction interleaved with base 2^32 -> base 2^26 of
        @ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.

        vshr.u64        q15,q8,#26
        vmovn.i64       d16,q8
         vshr.u64       q4,q5,#26
         vmovn.i64      d10,q5
        vadd.i64        q9,q9,q15               @ h3 -> h4
        vbic.i32        d16,#0xfc000000
          vsri.u32      q14,q13,#8              @ base 2^32 -> base 2^26
         vadd.i64       q6,q6,q4                @ h0 -> h1
          vshl.u32      q13,q13,#18
         vbic.i32       d10,#0xfc000000

        vshrn.u64       d30,q9,#26
        vmovn.i64       d18,q9
         vshr.u64       q4,q6,#26
         vmovn.i64      d12,q6
         vadd.i64       q7,q7,q4                @ h1 -> h2
          vsri.u32      q13,q12,#14
        vbic.i32        d18,#0xfc000000
          vshl.u32      q12,q12,#12
         vbic.i32       d12,#0xfc000000

        vadd.i32        d10,d10,d30
        vshl.u32        d30,d30,#2
          vbic.i32      q13,#0xfc000000
         vshrn.u64      d8,q7,#26
         vmovn.i64      d14,q7
        vaddl.u32       q5,d10,d30      @ h4 -> h0 [widen for a sec]
          vsri.u32      q12,q11,#20
         vadd.i32       d16,d16,d8      @ h2 -> h3
          vshl.u32      q11,q11,#6
         vbic.i32       d14,#0xfc000000
          vbic.i32      q12,#0xfc000000

        vshrn.u64       d30,q5,#26              @ re-narrow
        vmovn.i64       d10,q5
          vsri.u32      q11,q10,#26
          vbic.i32      q10,#0xfc000000
         vshr.u32       d8,d16,#26
         vbic.i32       d16,#0xfc000000
        vbic.i32        d10,#0xfc000000
        vadd.i32        d12,d12,d30     @ h0 -> h1
         vadd.i32       d18,d18,d8      @ h3 -> h4
          vbic.i32      q11,#0xfc000000

        bhi             .Loop_neon

.Lskip_loop:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

        add             r7,r0,#(48+0*9*4)
        add             r6,r0,#(48+1*9*4)
        adds            r2,r2,#32
        it              ne
        movne           r2,#0
        bne             .Long_tail

        vadd.i32        d25,d24,d14     @ add hash value and move to #hi
        vadd.i32        d21,d20,d10
        vadd.i32        d27,d26,d16
        vadd.i32        d23,d22,d12
        vadd.i32        d29,d28,d18

.Long_tail:
        vld4.32         {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
        vld4.32         {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2

        vadd.i32        d24,d24,d14     @ can be redundant
        vmull.u32       q7,d25,d0
        vadd.i32        d20,d20,d10
        vmull.u32       q5,d21,d0
        vadd.i32        d26,d26,d16
        vmull.u32       q8,d27,d0
        vadd.i32        d22,d22,d12
        vmull.u32       q6,d23,d0
        vadd.i32        d28,d28,d18
        vmull.u32       q9,d29,d0

        vmlal.u32       q5,d29,d2
        vld4.32         {d4[1],d5[1],d6[1],d7[1]},[r7]!
        vmlal.u32       q8,d25,d1
        vld4.32         {d4[0],d5[0],d6[0],d7[0]},[r6]!
        vmlal.u32       q6,d21,d1
        vmlal.u32       q9,d27,d1
        vmlal.u32       q7,d23,d1

        vmlal.u32       q8,d23,d3
        vld1.32         d8[1],[r7,:32]
        vmlal.u32       q5,d27,d4
        vld1.32         d8[0],[r6,:32]
        vmlal.u32       q9,d25,d3
        vmlal.u32       q6,d29,d4
        vmlal.u32       q7,d21,d3

        vmlal.u32       q8,d21,d5
         it             ne
         addne          r7,r0,#(48+2*9*4)
        vmlal.u32       q5,d25,d6
         it             ne
         addne          r6,r0,#(48+3*9*4)
        vmlal.u32       q9,d23,d5
        vmlal.u32       q6,d27,d6
        vmlal.u32       q7,d29,d6

        vmlal.u32       q8,d29,d8
         vorn           q0,q0,q0        @ all-ones, can be redundant
        vmlal.u32       q5,d23,d8
         vshr.u64       q0,q0,#38
        vmlal.u32       q9,d21,d7
        vmlal.u32       q6,d25,d8
        vmlal.u32       q7,d27,d8

        beq             .Lshort_tail

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ (hash+inp[0:1])*r^4:r^3 and accumulate

        vld4.32         {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
        vld4.32         {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4

        vmlal.u32       q7,d24,d0
        vmlal.u32       q5,d20,d0
        vmlal.u32       q8,d26,d0
        vmlal.u32       q6,d22,d0
        vmlal.u32       q9,d28,d0

        vmlal.u32       q5,d28,d2
        vld4.32         {d4[1],d5[1],d6[1],d7[1]},[r7]!
        vmlal.u32       q8,d24,d1
        vld4.32         {d4[0],d5[0],d6[0],d7[0]},[r6]!
        vmlal.u32       q6,d20,d1
        vmlal.u32       q9,d26,d1
        vmlal.u32       q7,d22,d1

        vmlal.u32       q8,d22,d3
        vld1.32         d8[1],[r7,:32]
        vmlal.u32       q5,d26,d4
        vld1.32         d8[0],[r6,:32]
        vmlal.u32       q9,d24,d3
        vmlal.u32       q6,d28,d4
        vmlal.u32       q7,d20,d3

        vmlal.u32       q8,d20,d5
        vmlal.u32       q5,d24,d6
        vmlal.u32       q9,d22,d5
        vmlal.u32       q6,d26,d6
        vmlal.u32       q7,d28,d6

        vmlal.u32       q8,d28,d8
         vorn           q0,q0,q0        @ all-ones
        vmlal.u32       q5,d22,d8
         vshr.u64       q0,q0,#38
        vmlal.u32       q9,d20,d7
        vmlal.u32       q6,d24,d8
        vmlal.u32       q7,d26,d8

.Lshort_tail:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ horizontal addition

        vadd.i64        d16,d16,d17
        vadd.i64        d10,d10,d11
        vadd.i64        d18,d18,d19
        vadd.i64        d12,d12,d13
        vadd.i64        d14,d14,d15

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction, but without narrowing

        vshr.u64        q15,q8,#26
        vand.i64        q8,q8,q0
         vshr.u64       q4,q5,#26
         vand.i64       q5,q5,q0
        vadd.i64        q9,q9,q15               @ h3 -> h4
         vadd.i64       q6,q6,q4                @ h0 -> h1

        vshr.u64        q15,q9,#26
        vand.i64        q9,q9,q0
         vshr.u64       q4,q6,#26
         vand.i64       q6,q6,q0
         vadd.i64       q7,q7,q4                @ h1 -> h2

        vadd.i64        q5,q5,q15
        vshl.u64        q15,q15,#2
         vshr.u64       q4,q7,#26
         vand.i64       q7,q7,q0
        vadd.i64        q5,q5,q15               @ h4 -> h0
         vadd.i64       q8,q8,q4                @ h2 -> h3

        vshr.u64        q15,q5,#26
        vand.i64        q5,q5,q0
         vshr.u64       q4,q8,#26
         vand.i64       q8,q8,q0
        vadd.i64        q6,q6,q15               @ h0 -> h1
         vadd.i64       q9,q9,q4                @ h3 -> h4

        cmp             r2,#0
        bne             .Leven

        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ store hash value

        vst4.32         {d10[0],d12[0],d14[0],d16[0]},[r0]!
        vst1.32         {d18[0]},[r0]

        vldmia  sp!,{d8-d15}                    @ epilogue
        ldmia   sp!,{r4-r7}
        bx      lr                                      @ bx    lr
.size   poly1305_blocks_neon,.-poly1305_blocks_neon

.align  5
.Lzeros:
.long   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
#ifndef __KERNEL__
.LOPENSSL_armcap:
# ifdef _WIN32
.word   OPENSSL_armcap_P
# else
.word   OPENSSL_armcap_P-.Lpoly1305_init
# endif
.comm   OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
#endif
.asciz  "Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm"
.align  2