#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.35/+130%		3.00
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.85/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	this is for -march=armv6, i.e. with bunch of ldrb loading data;
# (**)	these are trade-off results, they can be improved by ~8% but at
#	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
#	to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
23 | $flavour = shift; | |
24 | if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } | |
25 | else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } | |
26 | ||
27 | if ($flavour && $flavour ne "void") { | |
28 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |
29 | ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or | |
30 | ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or | |
31 | die "can't locate arm-xlate.pl"; | |
32 | ||
33 | open STDOUT,"| \"$^X\" $xlate $flavour $output"; | |
34 | } else { | |
35 | open STDOUT,">$output"; | |
36 | } | |
37 | ||
38 | ($ctx,$inp,$len,$padbit)=map("r$_",(0..3)); | |
39 | ||
40 | $code.=<<___; | |
41 | #ifndef __KERNEL__ | |
42 | # include "arm_arch.h" | |
43 | #else | |
44 | # define __ARM_ARCH__ __LINUX_ARM_ARCH__ | |
45 | # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ | |
46 | # define poly1305_init poly1305_init_arm | |
47 | # define poly1305_blocks poly1305_blocks_arm | |
48 | # define poly1305_emit poly1305_emit_arm | |
49 | .globl poly1305_blocks_neon | |
50 | #endif | |
51 | ||
52 | #if defined(__thumb2__) | |
53 | .syntax unified | |
54 | .thumb | |
55 | #else | |
56 | .code 32 | |
57 | #endif | |
58 | ||
59 | .text | |
60 | ||
61 | .globl poly1305_emit | |
62 | .globl poly1305_blocks | |
63 | .globl poly1305_init | |
64 | .type poly1305_init,%function | |
65 | .align 5 | |
66 | poly1305_init: | |
67 | .Lpoly1305_init: | |
68 | stmdb sp!,{r4-r11} | |
69 | ||
70 | eor r3,r3,r3 | |
71 | cmp $inp,#0 | |
72 | str r3,[$ctx,#0] @ zero hash value | |
73 | str r3,[$ctx,#4] | |
74 | str r3,[$ctx,#8] | |
75 | str r3,[$ctx,#12] | |
76 | str r3,[$ctx,#16] | |
77 | str r3,[$ctx,#36] @ clear is_base2_26 | |
78 | add $ctx,$ctx,#20 | |
79 | ||
80 | #ifdef __thumb2__ | |
81 | it eq | |
82 | #endif | |
83 | moveq r0,#0 | |
84 | beq .Lno_key | |
85 | ||
86 | #if __ARM_MAX_ARCH__>=7 | |
87 | mov r3,#-1 | |
88 | str r3,[$ctx,#28] @ impossible key power value | |
89 | # ifndef __KERNEL__ | |
90 | adr r11,.Lpoly1305_init | |
91 | ldr r12,.LOPENSSL_armcap | |
92 | # endif | |
93 | #endif | |
94 | ldrb r4,[$inp,#0] | |
95 | mov r10,#0x0fffffff | |
96 | ldrb r5,[$inp,#1] | |
97 | and r3,r10,#-4 @ 0x0ffffffc | |
98 | ldrb r6,[$inp,#2] | |
99 | ldrb r7,[$inp,#3] | |
100 | orr r4,r4,r5,lsl#8 | |
101 | ldrb r5,[$inp,#4] | |
102 | orr r4,r4,r6,lsl#16 | |
103 | ldrb r6,[$inp,#5] | |
104 | orr r4,r4,r7,lsl#24 | |
105 | ldrb r7,[$inp,#6] | |
106 | and r4,r4,r10 | |
107 | ||
108 | #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) | |
109 | # if !defined(_WIN32) | |
110 | ldr r12,[r11,r12] @ OPENSSL_armcap_P | |
111 | # endif | |
112 | # if defined(__APPLE__) || defined(_WIN32) | |
113 | ldr r12,[r12] | |
114 | # endif | |
115 | #endif | |
116 | ldrb r8,[$inp,#7] | |
117 | orr r5,r5,r6,lsl#8 | |
118 | ldrb r6,[$inp,#8] | |
119 | orr r5,r5,r7,lsl#16 | |
120 | ldrb r7,[$inp,#9] | |
121 | orr r5,r5,r8,lsl#24 | |
122 | ldrb r8,[$inp,#10] | |
123 | and r5,r5,r3 | |
124 | ||
125 | #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) | |
126 | tst r12,#ARMV7_NEON @ check for NEON | |
127 | # ifdef __thumb2__ | |
128 | adr r9,.Lpoly1305_blocks_neon | |
129 | adr r11,.Lpoly1305_blocks | |
130 | it ne | |
131 | movne r11,r9 | |
132 | adr r12,.Lpoly1305_emit | |
133 | orr r11,r11,#1 @ thumb-ify addresses | |
134 | orr r12,r12,#1 | |
135 | # else | |
136 | add r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init) | |
137 | ite eq | |
138 | addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init) | |
139 | addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init) | |
140 | # endif | |
141 | #endif | |
142 | ldrb r9,[$inp,#11] | |
143 | orr r6,r6,r7,lsl#8 | |
144 | ldrb r7,[$inp,#12] | |
145 | orr r6,r6,r8,lsl#16 | |
146 | ldrb r8,[$inp,#13] | |
147 | orr r6,r6,r9,lsl#24 | |
148 | ldrb r9,[$inp,#14] | |
149 | and r6,r6,r3 | |
150 | ||
151 | ldrb r10,[$inp,#15] | |
152 | orr r7,r7,r8,lsl#8 | |
153 | str r4,[$ctx,#0] | |
154 | orr r7,r7,r9,lsl#16 | |
155 | str r5,[$ctx,#4] | |
156 | orr r7,r7,r10,lsl#24 | |
157 | str r6,[$ctx,#8] | |
158 | and r7,r7,r3 | |
159 | str r7,[$ctx,#12] | |
160 | #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) | |
161 | stmia r2,{r11,r12} @ fill functions table | |
162 | mov r0,#1 | |
163 | #else | |
164 | mov r0,#0 | |
165 | #endif | |
166 | .Lno_key: | |
167 | ldmia sp!,{r4-r11} | |
168 | #if __ARM_ARCH__>=5 | |
169 | ret @ bx lr | |
170 | #else | |
171 | tst lr,#1 | |
172 | moveq pc,lr @ be binary compatible with V4, yet | |
173 | bx lr @ interoperable with Thumb ISA:-) | |
174 | #endif | |
175 | .size poly1305_init,.-poly1305_init | |
176 | ___ | |
177 | { | |
178 | my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12)); | |
179 | my ($s1,$s2,$s3)=($r1,$r2,$r3); | |
180 | ||
181 | $code.=<<___; | |
182 | .type poly1305_blocks,%function | |
183 | .align 5 | |
184 | poly1305_blocks: | |
185 | .Lpoly1305_blocks: | |
186 | stmdb sp!,{r3-r11,lr} | |
187 | ||
188 | ands $len,$len,#-16 | |
189 | beq .Lno_data | |
190 | ||
191 | add $len,$len,$inp @ end pointer | |
192 | sub sp,sp,#32 | |
193 | ||
194 | #if __ARM_ARCH__<7 | |
195 | ldmia $ctx,{$h0-$r3} @ load context | |
196 | add $ctx,$ctx,#20 | |
197 | str $len,[sp,#16] @ offload stuff | |
198 | str $ctx,[sp,#12] | |
199 | #else | |
200 | ldr lr,[$ctx,#36] @ is_base2_26 | |
201 | ldmia $ctx!,{$h0-$h4} @ load hash value | |
202 | str $len,[sp,#16] @ offload stuff | |
203 | str $ctx,[sp,#12] | |
204 | ||
205 | adds $r0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 | |
206 | mov $r1,$h1,lsr#6 | |
207 | adcs $r1,$r1,$h2,lsl#20 | |
208 | mov $r2,$h2,lsr#12 | |
209 | adcs $r2,$r2,$h3,lsl#14 | |
210 | mov $r3,$h3,lsr#18 | |
211 | adcs $r3,$r3,$h4,lsl#8 | |
212 | mov $len,#0 | |
213 | teq lr,#0 | |
214 | str $len,[$ctx,#16] @ clear is_base2_26 | |
215 | adc $len,$len,$h4,lsr#24 | |
216 | ||
217 | itttt ne | |
218 | movne $h0,$r0 @ choose between radixes | |
219 | movne $h1,$r1 | |
220 | movne $h2,$r2 | |
221 | movne $h3,$r3 | |
222 | ldmia $ctx,{$r0-$r3} @ load key | |
223 | it ne | |
224 | movne $h4,$len | |
225 | #endif | |
226 | ||
227 | mov lr,$inp | |
228 | cmp $padbit,#0 | |
229 | str $r1,[sp,#20] | |
230 | str $r2,[sp,#24] | |
231 | str $r3,[sp,#28] | |
232 | b .Loop | |
233 | ||
234 | .align 4 | |
235 | .Loop: | |
236 | #if __ARM_ARCH__<7 | |
237 | ldrb r0,[lr],#16 @ load input | |
238 | # ifdef __thumb2__ | |
239 | it hi | |
240 | # endif | |
241 | addhi $h4,$h4,#1 @ 1<<128 | |
242 | ldrb r1,[lr,#-15] | |
243 | ldrb r2,[lr,#-14] | |
244 | ldrb r3,[lr,#-13] | |
245 | orr r1,r0,r1,lsl#8 | |
246 | ldrb r0,[lr,#-12] | |
247 | orr r2,r1,r2,lsl#16 | |
248 | ldrb r1,[lr,#-11] | |
249 | orr r3,r2,r3,lsl#24 | |
250 | ldrb r2,[lr,#-10] | |
251 | adds $h0,$h0,r3 @ accumulate input | |
252 | ||
253 | ldrb r3,[lr,#-9] | |
254 | orr r1,r0,r1,lsl#8 | |
255 | ldrb r0,[lr,#-8] | |
256 | orr r2,r1,r2,lsl#16 | |
257 | ldrb r1,[lr,#-7] | |
258 | orr r3,r2,r3,lsl#24 | |
259 | ldrb r2,[lr,#-6] | |
260 | adcs $h1,$h1,r3 | |
261 | ||
262 | ldrb r3,[lr,#-5] | |
263 | orr r1,r0,r1,lsl#8 | |
264 | ldrb r0,[lr,#-4] | |
265 | orr r2,r1,r2,lsl#16 | |
266 | ldrb r1,[lr,#-3] | |
267 | orr r3,r2,r3,lsl#24 | |
268 | ldrb r2,[lr,#-2] | |
269 | adcs $h2,$h2,r3 | |
270 | ||
271 | ldrb r3,[lr,#-1] | |
272 | orr r1,r0,r1,lsl#8 | |
273 | str lr,[sp,#8] @ offload input pointer | |
274 | orr r2,r1,r2,lsl#16 | |
275 | add $s1,$r1,$r1,lsr#2 | |
276 | orr r3,r2,r3,lsl#24 | |
277 | #else | |
278 | ldr r0,[lr],#16 @ load input | |
279 | it hi | |
280 | addhi $h4,$h4,#1 @ padbit | |
281 | ldr r1,[lr,#-12] | |
282 | ldr r2,[lr,#-8] | |
283 | ldr r3,[lr,#-4] | |
284 | # ifdef __ARMEB__ | |
285 | rev r0,r0 | |
286 | rev r1,r1 | |
287 | rev r2,r2 | |
288 | rev r3,r3 | |
289 | # endif | |
290 | adds $h0,$h0,r0 @ accumulate input | |
291 | str lr,[sp,#8] @ offload input pointer | |
292 | adcs $h1,$h1,r1 | |
293 | add $s1,$r1,$r1,lsr#2 | |
294 | adcs $h2,$h2,r2 | |
295 | #endif | |
296 | add $s2,$r2,$r2,lsr#2 | |
297 | adcs $h3,$h3,r3 | |
298 | add $s3,$r3,$r3,lsr#2 | |
299 | ||
300 | umull r2,r3,$h1,$r0 | |
301 | adc $h4,$h4,#0 | |
302 | umull r0,r1,$h0,$r0 | |
303 | umlal r2,r3,$h4,$s1 | |
304 | umlal r0,r1,$h3,$s1 | |
305 | ldr $r1,[sp,#20] @ reload $r1 | |
306 | umlal r2,r3,$h2,$s3 | |
307 | umlal r0,r1,$h1,$s3 | |
308 | umlal r2,r3,$h3,$s2 | |
309 | umlal r0,r1,$h2,$s2 | |
310 | umlal r2,r3,$h0,$r1 | |
311 | str r0,[sp,#0] @ future $h0 | |
312 | mul r0,$s2,$h4 | |
313 | ldr $r2,[sp,#24] @ reload $r2 | |
314 | adds r2,r2,r1 @ d1+=d0>>32 | |
315 | eor r1,r1,r1 | |
316 | adc lr,r3,#0 @ future $h2 | |
317 | str r2,[sp,#4] @ future $h1 | |
318 | ||
319 | mul r2,$s3,$h4 | |
320 | eor r3,r3,r3 | |
321 | umlal r0,r1,$h3,$s3 | |
322 | ldr $r3,[sp,#28] @ reload $r3 | |
323 | umlal r2,r3,$h3,$r0 | |
324 | umlal r0,r1,$h2,$r0 | |
325 | umlal r2,r3,$h2,$r1 | |
326 | umlal r0,r1,$h1,$r1 | |
327 | umlal r2,r3,$h1,$r2 | |
328 | umlal r0,r1,$h0,$r2 | |
329 | umlal r2,r3,$h0,$r3 | |
330 | ldr $h0,[sp,#0] | |
331 | mul $h4,$r0,$h4 | |
332 | ldr $h1,[sp,#4] | |
333 | ||
334 | adds $h2,lr,r0 @ d2+=d1>>32 | |
335 | ldr lr,[sp,#8] @ reload input pointer | |
336 | adc r1,r1,#0 | |
337 | adds $h3,r2,r1 @ d3+=d2>>32 | |
338 | ldr r0,[sp,#16] @ reload end pointer | |
339 | adc r3,r3,#0 | |
340 | add $h4,$h4,r3 @ h4+=d3>>32 | |
341 | ||
342 | and r1,$h4,#-4 | |
343 | and $h4,$h4,#3 | |
344 | add r1,r1,r1,lsr#2 @ *=5 | |
345 | adds $h0,$h0,r1 | |
346 | adcs $h1,$h1,#0 | |
347 | adcs $h2,$h2,#0 | |
348 | adcs $h3,$h3,#0 | |
349 | adc $h4,$h4,#0 | |
350 | ||
351 | cmp r0,lr @ done yet? | |
352 | bhi .Loop | |
353 | ||
354 | ldr $ctx,[sp,#12] | |
355 | add sp,sp,#32 | |
356 | stmdb $ctx,{$h0-$h4} @ store the result | |
357 | ||
358 | .Lno_data: | |
359 | #if __ARM_ARCH__>=5 | |
360 | ldmia sp!,{r3-r11,pc} | |
361 | #else | |
362 | ldmia sp!,{r3-r11,lr} | |
363 | tst lr,#1 | |
364 | moveq pc,lr @ be binary compatible with V4, yet | |
365 | bx lr @ interoperable with Thumb ISA:-) | |
366 | #endif | |
367 | .size poly1305_blocks,.-poly1305_blocks | |
368 | ___ | |
369 | } | |
370 | { | |
371 | my ($ctx,$mac,$nonce)=map("r$_",(0..2)); | |
372 | my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11)); | |
373 | my $g4=$ctx; | |
374 | ||
375 | $code.=<<___; | |
376 | .type poly1305_emit,%function | |
377 | .align 5 | |
378 | poly1305_emit: | |
379 | .Lpoly1305_emit: | |
380 | stmdb sp!,{r4-r11} | |
381 | ||
382 | ldmia $ctx,{$h0-$h4} | |
383 | ||
384 | #if __ARM_ARCH__>=7 | |
385 | ldr ip,[$ctx,#36] @ is_base2_26 | |
386 | ||
387 | adds $g0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32 | |
388 | mov $g1,$h1,lsr#6 | |
389 | adcs $g1,$g1,$h2,lsl#20 | |
390 | mov $g2,$h2,lsr#12 | |
391 | adcs $g2,$g2,$h3,lsl#14 | |
392 | mov $g3,$h3,lsr#18 | |
393 | adcs $g3,$g3,$h4,lsl#8 | |
394 | mov $g4,#0 | |
395 | adc $g4,$g4,$h4,lsr#24 | |
396 | ||
397 | tst ip,ip | |
398 | itttt ne | |
399 | movne $h0,$g0 | |
400 | movne $h1,$g1 | |
401 | movne $h2,$g2 | |
402 | movne $h3,$g3 | |
403 | it ne | |
404 | movne $h4,$g4 | |
405 | #endif | |
406 | ||
407 | adds $g0,$h0,#5 @ compare to modulus | |
408 | adcs $g1,$h1,#0 | |
409 | adcs $g2,$h2,#0 | |
410 | adcs $g3,$h3,#0 | |
411 | adc $g4,$h4,#0 | |
412 | tst $g4,#4 @ did it carry/borrow? | |
413 | ||
414 | #ifdef __thumb2__ | |
415 | it ne | |
416 | #endif | |
417 | movne $h0,$g0 | |
418 | ldr $g0,[$nonce,#0] | |
419 | #ifdef __thumb2__ | |
420 | it ne | |
421 | #endif | |
422 | movne $h1,$g1 | |
423 | ldr $g1,[$nonce,#4] | |
424 | #ifdef __thumb2__ | |
425 | it ne | |
426 | #endif | |
427 | movne $h2,$g2 | |
428 | ldr $g2,[$nonce,#8] | |
429 | #ifdef __thumb2__ | |
430 | it ne | |
431 | #endif | |
432 | movne $h3,$g3 | |
433 | ldr $g3,[$nonce,#12] | |
434 | ||
435 | adds $h0,$h0,$g0 | |
436 | adcs $h1,$h1,$g1 | |
437 | adcs $h2,$h2,$g2 | |
438 | adc $h3,$h3,$g3 | |
439 | ||
440 | #if __ARM_ARCH__>=7 | |
441 | # ifdef __ARMEB__ | |
442 | rev $h0,$h0 | |
443 | rev $h1,$h1 | |
444 | rev $h2,$h2 | |
445 | rev $h3,$h3 | |
446 | # endif | |
447 | str $h0,[$mac,#0] | |
448 | str $h1,[$mac,#4] | |
449 | str $h2,[$mac,#8] | |
450 | str $h3,[$mac,#12] | |
451 | #else | |
452 | strb $h0,[$mac,#0] | |
453 | mov $h0,$h0,lsr#8 | |
454 | strb $h1,[$mac,#4] | |
455 | mov $h1,$h1,lsr#8 | |
456 | strb $h2,[$mac,#8] | |
457 | mov $h2,$h2,lsr#8 | |
458 | strb $h3,[$mac,#12] | |
459 | mov $h3,$h3,lsr#8 | |
460 | ||
461 | strb $h0,[$mac,#1] | |
462 | mov $h0,$h0,lsr#8 | |
463 | strb $h1,[$mac,#5] | |
464 | mov $h1,$h1,lsr#8 | |
465 | strb $h2,[$mac,#9] | |
466 | mov $h2,$h2,lsr#8 | |
467 | strb $h3,[$mac,#13] | |
468 | mov $h3,$h3,lsr#8 | |
469 | ||
470 | strb $h0,[$mac,#2] | |
471 | mov $h0,$h0,lsr#8 | |
472 | strb $h1,[$mac,#6] | |
473 | mov $h1,$h1,lsr#8 | |
474 | strb $h2,[$mac,#10] | |
475 | mov $h2,$h2,lsr#8 | |
476 | strb $h3,[$mac,#14] | |
477 | mov $h3,$h3,lsr#8 | |
478 | ||
479 | strb $h0,[$mac,#3] | |
480 | strb $h1,[$mac,#7] | |
481 | strb $h2,[$mac,#11] | |
482 | strb $h3,[$mac,#15] | |
483 | #endif | |
484 | ldmia sp!,{r4-r11} | |
485 | #if __ARM_ARCH__>=5 | |
486 | ret @ bx lr | |
487 | #else | |
488 | tst lr,#1 | |
489 | moveq pc,lr @ be binary compatible with V4, yet | |
490 | bx lr @ interoperable with Thumb ISA:-) | |
491 | #endif | |
492 | .size poly1305_emit,.-poly1305_emit | |
493 | ___ | |
494 | { | |
495 | my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9)); | |
496 | my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14)); | |
497 | my ($T0,$T1,$MASK) = map("q$_",(15,4,0)); | |
498 | ||
499 | my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7)); | |
500 | ||
501 | $code.=<<___; | |
502 | #if __ARM_MAX_ARCH__>=7 | |
503 | .fpu neon | |
504 | ||
505 | .type poly1305_init_neon,%function | |
506 | .align 5 | |
507 | poly1305_init_neon: | |
508 | .Lpoly1305_init_neon: | |
509 | ldr r3,[$ctx,#48] @ first table element | |
510 | cmp r3,#-1 @ is value impossible? | |
511 | bne .Lno_init_neon | |
512 | ||
513 | ldr r4,[$ctx,#20] @ load key base 2^32 | |
514 | ldr r5,[$ctx,#24] | |
515 | ldr r6,[$ctx,#28] | |
516 | ldr r7,[$ctx,#32] | |
517 | ||
518 | and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 | |
519 | mov r3,r4,lsr#26 | |
520 | mov r4,r5,lsr#20 | |
521 | orr r3,r3,r5,lsl#6 | |
522 | mov r5,r6,lsr#14 | |
523 | orr r4,r4,r6,lsl#12 | |
524 | mov r6,r7,lsr#8 | |
525 | orr r5,r5,r7,lsl#18 | |
526 | and r3,r3,#0x03ffffff | |
527 | and r4,r4,#0x03ffffff | |
528 | and r5,r5,#0x03ffffff | |
529 | ||
530 | vdup.32 $R0,r2 @ r^1 in both lanes | |
531 | add r2,r3,r3,lsl#2 @ *5 | |
532 | vdup.32 $R1,r3 | |
533 | add r3,r4,r4,lsl#2 | |
534 | vdup.32 $S1,r2 | |
535 | vdup.32 $R2,r4 | |
536 | add r4,r5,r5,lsl#2 | |
537 | vdup.32 $S2,r3 | |
538 | vdup.32 $R3,r5 | |
539 | add r5,r6,r6,lsl#2 | |
540 | vdup.32 $S3,r4 | |
541 | vdup.32 $R4,r6 | |
542 | vdup.32 $S4,r5 | |
543 | ||
544 | mov $zeros,#2 @ counter | |
545 | ||
546 | .Lsquare_neon: | |
547 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
548 | @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 | |
549 | @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 | |
550 | @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 | |
551 | @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 | |
552 | @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 | |
553 | ||
554 | vmull.u32 $D0,$R0,${R0}[1] | |
555 | vmull.u32 $D1,$R1,${R0}[1] | |
556 | vmull.u32 $D2,$R2,${R0}[1] | |
557 | vmull.u32 $D3,$R3,${R0}[1] | |
558 | vmull.u32 $D4,$R4,${R0}[1] | |
559 | ||
560 | vmlal.u32 $D0,$R4,${S1}[1] | |
561 | vmlal.u32 $D1,$R0,${R1}[1] | |
562 | vmlal.u32 $D2,$R1,${R1}[1] | |
563 | vmlal.u32 $D3,$R2,${R1}[1] | |
564 | vmlal.u32 $D4,$R3,${R1}[1] | |
565 | ||
566 | vmlal.u32 $D0,$R3,${S2}[1] | |
567 | vmlal.u32 $D1,$R4,${S2}[1] | |
568 | vmlal.u32 $D3,$R1,${R2}[1] | |
569 | vmlal.u32 $D2,$R0,${R2}[1] | |
570 | vmlal.u32 $D4,$R2,${R2}[1] | |
571 | ||
572 | vmlal.u32 $D0,$R2,${S3}[1] | |
573 | vmlal.u32 $D3,$R0,${R3}[1] | |
574 | vmlal.u32 $D1,$R3,${S3}[1] | |
575 | vmlal.u32 $D2,$R4,${S3}[1] | |
576 | vmlal.u32 $D4,$R1,${R3}[1] | |
577 | ||
578 | vmlal.u32 $D3,$R4,${S4}[1] | |
579 | vmlal.u32 $D0,$R1,${S4}[1] | |
580 | vmlal.u32 $D1,$R2,${S4}[1] | |
581 | vmlal.u32 $D2,$R3,${S4}[1] | |
582 | vmlal.u32 $D4,$R0,${R4}[1] | |
583 | ||
584 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
585 | @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein | |
586 | @ and P. Schwabe | |
587 | @ | |
588 | @ H0>>+H1>>+H2>>+H3>>+H4 | |
589 | @ H3>>+H4>>*5+H0>>+H1 | |
590 | @ | |
591 | @ Trivia. | |
592 | @ | |
593 | @ Result of multiplication of n-bit number by m-bit number is | |
594 | @ n+m bits wide. However! Even though 2^n is a n+1-bit number, | |
595 | @ m-bit number multiplied by 2^n is still n+m bits wide. | |
596 | @ | |
597 | @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2, | |
598 | @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit | |
599 | @ one is n+1 bits wide. | |
600 | @ | |
601 | @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that | |
602 | @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4 | |
603 | @ can be 27. However! In cases when their width exceeds 26 bits | |
604 | @ they are limited by 2^26+2^6. This in turn means that *sum* | |
605 | @ of the products with these values can still be viewed as sum | |
606 | @ of 52-bit numbers as long as the amount of addends is not a | |
607 | @ power of 2. For example, | |
608 | @ | |
609 | @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4, | |
610 | @ | |
611 | @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or | |
612 | @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than | |
613 | @ 8 * (2^52) or 2^55. However, the value is then multiplied by | |
614 | @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), | |
615 | @ which is less than 32 * (2^52) or 2^57. And when processing | |
616 | @ data we are looking at triple as many addends... | |
617 | @ | |
618 | @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and | |
619 | @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the | |
620 | @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while | |
621 | @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32 | |
622 | @ instruction accepts 2x32-bit input and writes 2x64-bit result. | |
623 | @ This means that result of reduction have to be compressed upon | |
624 | @ loop wrap-around. This can be done in the process of reduction | |
625 | @ to minimize amount of instructions [as well as amount of | |
626 | @ 128-bit instructions, which benefits low-end processors], but | |
627 | @ one has to watch for H2 (which is narrower than H0) and 5*H4 | |
628 | @ not being wider than 58 bits, so that result of right shift | |
629 | @ by 26 bits fits in 32 bits. This is also useful on x86, | |
630 | @ because it allows to use paddd in place for paddq, which | |
631 | @ benefits Atom, where paddq is ridiculously slow. | |
632 | ||
633 | vshr.u64 $T0,$D3,#26 | |
634 | vmovn.i64 $D3#lo,$D3 | |
635 | vshr.u64 $T1,$D0,#26 | |
636 | vmovn.i64 $D0#lo,$D0 | |
637 | vadd.i64 $D4,$D4,$T0 @ h3 -> h4 | |
638 | vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff | |
639 | vadd.i64 $D1,$D1,$T1 @ h0 -> h1 | |
640 | vbic.i32 $D0#lo,#0xfc000000 | |
641 | ||
642 | vshrn.u64 $T0#lo,$D4,#26 | |
643 | vmovn.i64 $D4#lo,$D4 | |
644 | vshr.u64 $T1,$D1,#26 | |
645 | vmovn.i64 $D1#lo,$D1 | |
646 | vadd.i64 $D2,$D2,$T1 @ h1 -> h2 | |
647 | vbic.i32 $D4#lo,#0xfc000000 | |
648 | vbic.i32 $D1#lo,#0xfc000000 | |
649 | ||
650 | vadd.i32 $D0#lo,$D0#lo,$T0#lo | |
651 | vshl.u32 $T0#lo,$T0#lo,#2 | |
652 | vshrn.u64 $T1#lo,$D2,#26 | |
653 | vmovn.i64 $D2#lo,$D2 | |
654 | vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0 | |
655 | vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 | |
656 | vbic.i32 $D2#lo,#0xfc000000 | |
657 | ||
658 | vshr.u32 $T0#lo,$D0#lo,#26 | |
659 | vbic.i32 $D0#lo,#0xfc000000 | |
660 | vshr.u32 $T1#lo,$D3#lo,#26 | |
661 | vbic.i32 $D3#lo,#0xfc000000 | |
662 | vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 | |
663 | vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 | |
664 | ||
665 | subs $zeros,$zeros,#1 | |
666 | beq .Lsquare_break_neon | |
667 | ||
668 | add $tbl0,$ctx,#(48+0*9*4) | |
669 | add $tbl1,$ctx,#(48+1*9*4) | |
670 | ||
671 | vtrn.32 $R0,$D0#lo @ r^2:r^1 | |
672 | vtrn.32 $R2,$D2#lo | |
673 | vtrn.32 $R3,$D3#lo | |
674 | vtrn.32 $R1,$D1#lo | |
675 | vtrn.32 $R4,$D4#lo | |
676 | ||
677 | vshl.u32 $S2,$R2,#2 @ *5 | |
678 | vshl.u32 $S3,$R3,#2 | |
679 | vshl.u32 $S1,$R1,#2 | |
680 | vshl.u32 $S4,$R4,#2 | |
681 | vadd.i32 $S2,$S2,$R2 | |
682 | vadd.i32 $S1,$S1,$R1 | |
683 | vadd.i32 $S3,$S3,$R3 | |
684 | vadd.i32 $S4,$S4,$R4 | |
685 | ||
686 | vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! | |
687 | vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! | |
688 | vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! | |
689 | vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! | |
690 | vst1.32 {${S4}[0]},[$tbl0,:32] | |
691 | vst1.32 {${S4}[1]},[$tbl1,:32] | |
692 | ||
693 | b .Lsquare_neon | |
694 | ||
695 | .align 4 | |
696 | .Lsquare_break_neon: | |
697 | add $tbl0,$ctx,#(48+2*4*9) | |
698 | add $tbl1,$ctx,#(48+3*4*9) | |
699 | ||
700 | vmov $R0,$D0#lo @ r^4:r^3 | |
701 | vshl.u32 $S1,$D1#lo,#2 @ *5 | |
702 | vmov $R1,$D1#lo | |
703 | vshl.u32 $S2,$D2#lo,#2 | |
704 | vmov $R2,$D2#lo | |
705 | vshl.u32 $S3,$D3#lo,#2 | |
706 | vmov $R3,$D3#lo | |
707 | vshl.u32 $S4,$D4#lo,#2 | |
708 | vmov $R4,$D4#lo | |
709 | vadd.i32 $S1,$S1,$D1#lo | |
710 | vadd.i32 $S2,$S2,$D2#lo | |
711 | vadd.i32 $S3,$S3,$D3#lo | |
712 | vadd.i32 $S4,$S4,$D4#lo | |
713 | ||
714 | vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! | |
715 | vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! | |
716 | vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! | |
717 | vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! | |
718 | vst1.32 {${S4}[0]},[$tbl0] | |
719 | vst1.32 {${S4}[1]},[$tbl1] | |
720 | ||
721 | .Lno_init_neon: | |
722 | ret @ bx lr | |
723 | .size poly1305_init_neon,.-poly1305_init_neon | |
724 | ||
725 | .type poly1305_blocks_neon,%function | |
726 | .align 5 | |
727 | poly1305_blocks_neon: | |
728 | .Lpoly1305_blocks_neon: | |
729 | ldr ip,[$ctx,#36] @ is_base2_26 | |
730 | ||
731 | cmp $len,#64 | |
732 | blo .Lpoly1305_blocks | |
733 | ||
734 | stmdb sp!,{r4-r7} | |
735 | vstmdb sp!,{d8-d15} @ ABI specification says so | |
736 | ||
737 | tst ip,ip @ is_base2_26? | |
738 | bne .Lbase2_26_neon | |
739 | ||
740 | stmdb sp!,{r1-r3,lr} | |
741 | bl .Lpoly1305_init_neon | |
742 | ||
743 | ldr r4,[$ctx,#0] @ load hash value base 2^32 | |
744 | ldr r5,[$ctx,#4] | |
745 | ldr r6,[$ctx,#8] | |
746 | ldr r7,[$ctx,#12] | |
747 | ldr ip,[$ctx,#16] | |
748 | ||
749 | and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26 | |
750 | mov r3,r4,lsr#26 | |
751 | veor $D0#lo,$D0#lo,$D0#lo | |
752 | mov r4,r5,lsr#20 | |
753 | orr r3,r3,r5,lsl#6 | |
754 | veor $D1#lo,$D1#lo,$D1#lo | |
755 | mov r5,r6,lsr#14 | |
756 | orr r4,r4,r6,lsl#12 | |
757 | veor $D2#lo,$D2#lo,$D2#lo | |
758 | mov r6,r7,lsr#8 | |
759 | orr r5,r5,r7,lsl#18 | |
760 | veor $D3#lo,$D3#lo,$D3#lo | |
761 | and r3,r3,#0x03ffffff | |
762 | orr r6,r6,ip,lsl#24 | |
763 | veor $D4#lo,$D4#lo,$D4#lo | |
764 | and r4,r4,#0x03ffffff | |
765 | mov r1,#1 | |
766 | and r5,r5,#0x03ffffff | |
767 | str r1,[$ctx,#36] @ set is_base2_26 | |
768 | ||
769 | vmov.32 $D0#lo[0],r2 | |
770 | vmov.32 $D1#lo[0],r3 | |
771 | vmov.32 $D2#lo[0],r4 | |
772 | vmov.32 $D3#lo[0],r5 | |
773 | vmov.32 $D4#lo[0],r6 | |
774 | adr $zeros,.Lzeros | |
775 | ||
776 | ldmia sp!,{r1-r3,lr} | |
777 | b .Lhash_loaded | |
778 | ||
779 | .align 4 | |
780 | .Lbase2_26_neon: | |
781 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
782 | @ load hash value | |
783 | ||
784 | veor $D0#lo,$D0#lo,$D0#lo | |
785 | veor $D1#lo,$D1#lo,$D1#lo | |
786 | veor $D2#lo,$D2#lo,$D2#lo | |
787 | veor $D3#lo,$D3#lo,$D3#lo | |
788 | veor $D4#lo,$D4#lo,$D4#lo | |
789 | vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! | |
790 | adr $zeros,.Lzeros | |
791 | vld1.32 {$D4#lo[0]},[$ctx] | |
792 | sub $ctx,$ctx,#16 @ rewind | |
793 | ||
794 | .Lhash_loaded: | |
795 | add $in2,$inp,#32 | |
796 | mov $padbit,$padbit,lsl#24 | |
797 | tst $len,#31 | |
798 | beq .Leven | |
799 | ||
800 | vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]! | |
801 | vmov.32 $H4#lo[0],$padbit | |
802 | sub $len,$len,#16 | |
803 | add $in2,$inp,#32 | |
804 | ||
805 | # ifdef __ARMEB__ | |
806 | vrev32.8 $H0,$H0 | |
807 | vrev32.8 $H3,$H3 | |
808 | vrev32.8 $H1,$H1 | |
809 | vrev32.8 $H2,$H2 | |
810 | # endif | |
811 | vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26 | |
812 | vshl.u32 $H3#lo,$H3#lo,#18 | |
813 | ||
814 | vsri.u32 $H3#lo,$H2#lo,#14 | |
815 | vshl.u32 $H2#lo,$H2#lo,#12 | |
816 | vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi | |
817 | ||
818 | vbic.i32 $H3#lo,#0xfc000000 | |
819 | vsri.u32 $H2#lo,$H1#lo,#20 | |
820 | vshl.u32 $H1#lo,$H1#lo,#6 | |
821 | ||
822 | vbic.i32 $H2#lo,#0xfc000000 | |
823 | vsri.u32 $H1#lo,$H0#lo,#26 | |
824 | vadd.i32 $H3#hi,$H3#lo,$D3#lo | |
825 | ||
826 | vbic.i32 $H0#lo,#0xfc000000 | |
827 | vbic.i32 $H1#lo,#0xfc000000 | |
828 | vadd.i32 $H2#hi,$H2#lo,$D2#lo | |
829 | ||
830 | vadd.i32 $H0#hi,$H0#lo,$D0#lo | |
831 | vadd.i32 $H1#hi,$H1#lo,$D1#lo | |
832 | ||
833 | mov $tbl1,$zeros | |
834 | add $tbl0,$ctx,#48 | |
835 | ||
836 | cmp $len,$len | |
837 | b .Long_tail | |
838 | ||
839 | .align 4 | |
840 | .Leven: | |
841 | subs $len,$len,#64 | |
842 | it lo | |
843 | movlo $in2,$zeros | |
844 | ||
845 | vmov.i32 $H4,#1<<24 @ padbit, yes, always | |
846 | vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] | |
847 | add $inp,$inp,#64 | |
848 | vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) | |
849 | add $in2,$in2,#64 | |
850 | itt hi | |
851 | addhi $tbl1,$ctx,#(48+1*9*4) | |
852 | addhi $tbl0,$ctx,#(48+3*9*4) | |
853 | ||
854 | # ifdef __ARMEB__ | |
855 | vrev32.8 $H0,$H0 | |
856 | vrev32.8 $H3,$H3 | |
857 | vrev32.8 $H1,$H1 | |
858 | vrev32.8 $H2,$H2 | |
859 | # endif | |
860 | vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 | |
861 | vshl.u32 $H3,$H3,#18 | |
862 | ||
863 | vsri.u32 $H3,$H2,#14 | |
864 | vshl.u32 $H2,$H2,#12 | |
865 | ||
866 | vbic.i32 $H3,#0xfc000000 | |
867 | vsri.u32 $H2,$H1,#20 | |
868 | vshl.u32 $H1,$H1,#6 | |
869 | ||
870 | vbic.i32 $H2,#0xfc000000 | |
871 | vsri.u32 $H1,$H0,#26 | |
872 | ||
873 | vbic.i32 $H0,#0xfc000000 | |
874 | vbic.i32 $H1,#0xfc000000 | |
875 | ||
876 | bls .Lskip_loop | |
877 | ||
878 | vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2 | |
879 | vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 | |
880 | vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! | |
881 | vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! | |
882 | b .Loop_neon | |
883 | ||
884 | .align 5 | |
885 | .Loop_neon: | |
886 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
887 | @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 | |
888 | @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r | |
889 | @ \___________________/ | |
890 | @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 | |
891 | @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r | |
892 | @ \___________________/ \____________________/ | |
893 | @ | |
894 | @ Note that we start with inp[2:3]*r^2. This is because it | |
895 | @ doesn't depend on reduction in previous iteration. | |
896 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
897 | @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 | |
898 | @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 | |
899 | @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 | |
900 | @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 | |
901 | @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 | |
902 | ||
903 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
904 | @ inp[2:3]*r^2 | |
905 | ||
906 | vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1] | |
907 | vmull.u32 $D2,$H2#hi,${R0}[1] | |
908 | vadd.i32 $H0#lo,$H0#lo,$D0#lo | |
909 | vmull.u32 $D0,$H0#hi,${R0}[1] | |
910 | vadd.i32 $H3#lo,$H3#lo,$D3#lo | |
911 | vmull.u32 $D3,$H3#hi,${R0}[1] | |
912 | vmlal.u32 $D2,$H1#hi,${R1}[1] | |
913 | vadd.i32 $H1#lo,$H1#lo,$D1#lo | |
914 | vmull.u32 $D1,$H1#hi,${R0}[1] | |
915 | ||
916 | vadd.i32 $H4#lo,$H4#lo,$D4#lo | |
917 | vmull.u32 $D4,$H4#hi,${R0}[1] | |
918 | subs $len,$len,#64 | |
919 | vmlal.u32 $D0,$H4#hi,${S1}[1] | |
920 | it lo | |
921 | movlo $in2,$zeros | |
922 | vmlal.u32 $D3,$H2#hi,${R1}[1] | |
923 | vld1.32 ${S4}[1],[$tbl1,:32] | |
924 | vmlal.u32 $D1,$H0#hi,${R1}[1] | |
925 | vmlal.u32 $D4,$H3#hi,${R1}[1] | |
926 | ||
927 | vmlal.u32 $D0,$H3#hi,${S2}[1] | |
928 | vmlal.u32 $D3,$H1#hi,${R2}[1] | |
929 | vmlal.u32 $D4,$H2#hi,${R2}[1] | |
930 | vmlal.u32 $D1,$H4#hi,${S2}[1] | |
931 | vmlal.u32 $D2,$H0#hi,${R2}[1] | |
932 | ||
933 | vmlal.u32 $D3,$H0#hi,${R3}[1] | |
934 | vmlal.u32 $D0,$H2#hi,${S3}[1] | |
935 | vmlal.u32 $D4,$H1#hi,${R3}[1] | |
936 | vmlal.u32 $D1,$H3#hi,${S3}[1] | |
937 | vmlal.u32 $D2,$H4#hi,${S3}[1] | |
938 | ||
939 | vmlal.u32 $D3,$H4#hi,${S4}[1] | |
940 | vmlal.u32 $D0,$H1#hi,${S4}[1] | |
941 | vmlal.u32 $D4,$H0#hi,${R4}[1] | |
942 | vmlal.u32 $D1,$H2#hi,${S4}[1] | |
943 | vmlal.u32 $D2,$H3#hi,${S4}[1] | |
944 | ||
945 | vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0) | |
946 | add $in2,$in2,#64 | |
947 | ||
948 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
949 | @ (hash+inp[0:1])*r^4 and accumulate | |
950 | ||
951 | vmlal.u32 $D3,$H3#lo,${R0}[0] | |
952 | vmlal.u32 $D0,$H0#lo,${R0}[0] | |
953 | vmlal.u32 $D4,$H4#lo,${R0}[0] | |
954 | vmlal.u32 $D1,$H1#lo,${R0}[0] | |
955 | vmlal.u32 $D2,$H2#lo,${R0}[0] | |
956 | vld1.32 ${S4}[0],[$tbl0,:32] | |
957 | ||
958 | vmlal.u32 $D3,$H2#lo,${R1}[0] | |
959 | vmlal.u32 $D0,$H4#lo,${S1}[0] | |
960 | vmlal.u32 $D4,$H3#lo,${R1}[0] | |
961 | vmlal.u32 $D1,$H0#lo,${R1}[0] | |
962 | vmlal.u32 $D2,$H1#lo,${R1}[0] | |
963 | ||
964 | vmlal.u32 $D3,$H1#lo,${R2}[0] | |
965 | vmlal.u32 $D0,$H3#lo,${S2}[0] | |
966 | vmlal.u32 $D4,$H2#lo,${R2}[0] | |
967 | vmlal.u32 $D1,$H4#lo,${S2}[0] | |
968 | vmlal.u32 $D2,$H0#lo,${R2}[0] | |
969 | ||
970 | vmlal.u32 $D3,$H0#lo,${R3}[0] | |
971 | vmlal.u32 $D0,$H2#lo,${S3}[0] | |
972 | vmlal.u32 $D4,$H1#lo,${R3}[0] | |
973 | vmlal.u32 $D1,$H3#lo,${S3}[0] | |
974 | vmlal.u32 $D3,$H4#lo,${S4}[0] | |
975 | ||
976 | vmlal.u32 $D2,$H4#lo,${S3}[0] | |
977 | vmlal.u32 $D0,$H1#lo,${S4}[0] | |
978 | vmlal.u32 $D4,$H0#lo,${R4}[0] | |
979 | vmov.i32 $H4,#1<<24 @ padbit, yes, always | |
980 | vmlal.u32 $D1,$H2#lo,${S4}[0] | |
981 | vmlal.u32 $D2,$H3#lo,${S4}[0] | |
982 | ||
983 | vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1] | |
984 | add $inp,$inp,#64 | |
985 | # ifdef __ARMEB__ | |
986 | vrev32.8 $H0,$H0 | |
987 | vrev32.8 $H1,$H1 | |
988 | vrev32.8 $H2,$H2 | |
989 | vrev32.8 $H3,$H3 | |
990 | # endif | |
991 | ||
992 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
993 | @ lazy reduction interleaved with base 2^32 -> base 2^26 of | |
994 | @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4. | |
995 | ||
996 | vshr.u64 $T0,$D3,#26 | |
997 | vmovn.i64 $D3#lo,$D3 | |
998 | vshr.u64 $T1,$D0,#26 | |
999 | vmovn.i64 $D0#lo,$D0 | |
1000 | vadd.i64 $D4,$D4,$T0 @ h3 -> h4 | |
1001 | vbic.i32 $D3#lo,#0xfc000000 | |
1002 | vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26 | |
1003 | vadd.i64 $D1,$D1,$T1 @ h0 -> h1 | |
1004 | vshl.u32 $H3,$H3,#18 | |
1005 | vbic.i32 $D0#lo,#0xfc000000 | |
1006 | ||
1007 | vshrn.u64 $T0#lo,$D4,#26 | |
1008 | vmovn.i64 $D4#lo,$D4 | |
1009 | vshr.u64 $T1,$D1,#26 | |
1010 | vmovn.i64 $D1#lo,$D1 | |
1011 | vadd.i64 $D2,$D2,$T1 @ h1 -> h2 | |
1012 | vsri.u32 $H3,$H2,#14 | |
1013 | vbic.i32 $D4#lo,#0xfc000000 | |
1014 | vshl.u32 $H2,$H2,#12 | |
1015 | vbic.i32 $D1#lo,#0xfc000000 | |
1016 | ||
1017 | vadd.i32 $D0#lo,$D0#lo,$T0#lo | |
1018 | vshl.u32 $T0#lo,$T0#lo,#2 | |
1019 | vbic.i32 $H3,#0xfc000000 | |
1020 | vshrn.u64 $T1#lo,$D2,#26 | |
1021 | vmovn.i64 $D2#lo,$D2 | |
1022 | vaddl.u32 $D0,$D0#lo,$T0#lo @ h4 -> h0 [widen for a sec] | |
1023 | vsri.u32 $H2,$H1,#20 | |
1024 | vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3 | |
1025 | vshl.u32 $H1,$H1,#6 | |
1026 | vbic.i32 $D2#lo,#0xfc000000 | |
1027 | vbic.i32 $H2,#0xfc000000 | |
1028 | ||
1029 | vshrn.u64 $T0#lo,$D0,#26 @ re-narrow | |
1030 | vmovn.i64 $D0#lo,$D0 | |
1031 | vsri.u32 $H1,$H0,#26 | |
1032 | vbic.i32 $H0,#0xfc000000 | |
1033 | vshr.u32 $T1#lo,$D3#lo,#26 | |
1034 | vbic.i32 $D3#lo,#0xfc000000 | |
1035 | vbic.i32 $D0#lo,#0xfc000000 | |
1036 | vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1 | |
1037 | vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4 | |
1038 | vbic.i32 $H1,#0xfc000000 | |
1039 | ||
1040 | bhi .Loop_neon | |
1041 | ||
1042 | .Lskip_loop: | |
1043 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
1044 | @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 | |
1045 | ||
1046 | add $tbl1,$ctx,#(48+0*9*4) | |
1047 | add $tbl0,$ctx,#(48+1*9*4) | |
1048 | adds $len,$len,#32 | |
1049 | it ne | |
1050 | movne $len,#0 | |
1051 | bne .Long_tail | |
1052 | ||
1053 | vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi | |
1054 | vadd.i32 $H0#hi,$H0#lo,$D0#lo | |
1055 | vadd.i32 $H3#hi,$H3#lo,$D3#lo | |
1056 | vadd.i32 $H1#hi,$H1#lo,$D1#lo | |
1057 | vadd.i32 $H4#hi,$H4#lo,$D4#lo | |
1058 | ||
1059 | .Long_tail: | |
1060 | vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1 | |
1061 | vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2 | |
1062 | ||
1063 | vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant | |
1064 | vmull.u32 $D2,$H2#hi,$R0 | |
1065 | vadd.i32 $H0#lo,$H0#lo,$D0#lo | |
1066 | vmull.u32 $D0,$H0#hi,$R0 | |
1067 | vadd.i32 $H3#lo,$H3#lo,$D3#lo | |
1068 | vmull.u32 $D3,$H3#hi,$R0 | |
1069 | vadd.i32 $H1#lo,$H1#lo,$D1#lo | |
1070 | vmull.u32 $D1,$H1#hi,$R0 | |
1071 | vadd.i32 $H4#lo,$H4#lo,$D4#lo | |
1072 | vmull.u32 $D4,$H4#hi,$R0 | |
1073 | ||
1074 | vmlal.u32 $D0,$H4#hi,$S1 | |
1075 | vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! | |
1076 | vmlal.u32 $D3,$H2#hi,$R1 | |
1077 | vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! | |
1078 | vmlal.u32 $D1,$H0#hi,$R1 | |
1079 | vmlal.u32 $D4,$H3#hi,$R1 | |
1080 | vmlal.u32 $D2,$H1#hi,$R1 | |
1081 | ||
1082 | vmlal.u32 $D3,$H1#hi,$R2 | |
1083 | vld1.32 ${S4}[1],[$tbl1,:32] | |
1084 | vmlal.u32 $D0,$H3#hi,$S2 | |
1085 | vld1.32 ${S4}[0],[$tbl0,:32] | |
1086 | vmlal.u32 $D4,$H2#hi,$R2 | |
1087 | vmlal.u32 $D1,$H4#hi,$S2 | |
1088 | vmlal.u32 $D2,$H0#hi,$R2 | |
1089 | ||
1090 | vmlal.u32 $D3,$H0#hi,$R3 | |
1091 | it ne | |
1092 | addne $tbl1,$ctx,#(48+2*9*4) | |
1093 | vmlal.u32 $D0,$H2#hi,$S3 | |
1094 | it ne | |
1095 | addne $tbl0,$ctx,#(48+3*9*4) | |
1096 | vmlal.u32 $D4,$H1#hi,$R3 | |
1097 | vmlal.u32 $D1,$H3#hi,$S3 | |
1098 | vmlal.u32 $D2,$H4#hi,$S3 | |
1099 | ||
1100 | vmlal.u32 $D3,$H4#hi,$S4 | |
1101 | vorn $MASK,$MASK,$MASK @ all-ones, can be redundant | |
1102 | vmlal.u32 $D0,$H1#hi,$S4 | |
1103 | vshr.u64 $MASK,$MASK,#38 | |
1104 | vmlal.u32 $D4,$H0#hi,$R4 | |
1105 | vmlal.u32 $D1,$H2#hi,$S4 | |
1106 | vmlal.u32 $D2,$H3#hi,$S4 | |
1107 | ||
1108 | beq .Lshort_tail | |
1109 | ||
1110 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
1111 | @ (hash+inp[0:1])*r^4:r^3 and accumulate | |
1112 | ||
1113 | vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3 | |
1114 | vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4 | |
1115 | ||
1116 | vmlal.u32 $D2,$H2#lo,$R0 | |
1117 | vmlal.u32 $D0,$H0#lo,$R0 | |
1118 | vmlal.u32 $D3,$H3#lo,$R0 | |
1119 | vmlal.u32 $D1,$H1#lo,$R0 | |
1120 | vmlal.u32 $D4,$H4#lo,$R0 | |
1121 | ||
1122 | vmlal.u32 $D0,$H4#lo,$S1 | |
1123 | vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]! | |
1124 | vmlal.u32 $D3,$H2#lo,$R1 | |
1125 | vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]! | |
1126 | vmlal.u32 $D1,$H0#lo,$R1 | |
1127 | vmlal.u32 $D4,$H3#lo,$R1 | |
1128 | vmlal.u32 $D2,$H1#lo,$R1 | |
1129 | ||
1130 | vmlal.u32 $D3,$H1#lo,$R2 | |
1131 | vld1.32 ${S4}[1],[$tbl1,:32] | |
1132 | vmlal.u32 $D0,$H3#lo,$S2 | |
1133 | vld1.32 ${S4}[0],[$tbl0,:32] | |
1134 | vmlal.u32 $D4,$H2#lo,$R2 | |
1135 | vmlal.u32 $D1,$H4#lo,$S2 | |
1136 | vmlal.u32 $D2,$H0#lo,$R2 | |
1137 | ||
1138 | vmlal.u32 $D3,$H0#lo,$R3 | |
1139 | vmlal.u32 $D0,$H2#lo,$S3 | |
1140 | vmlal.u32 $D4,$H1#lo,$R3 | |
1141 | vmlal.u32 $D1,$H3#lo,$S3 | |
1142 | vmlal.u32 $D2,$H4#lo,$S3 | |
1143 | ||
1144 | vmlal.u32 $D3,$H4#lo,$S4 | |
1145 | vorn $MASK,$MASK,$MASK @ all-ones | |
1146 | vmlal.u32 $D0,$H1#lo,$S4 | |
1147 | vshr.u64 $MASK,$MASK,#38 | |
1148 | vmlal.u32 $D4,$H0#lo,$R4 | |
1149 | vmlal.u32 $D1,$H2#lo,$S4 | |
1150 | vmlal.u32 $D2,$H3#lo,$S4 | |
1151 | ||
1152 | .Lshort_tail: | |
1153 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
1154 | @ horizontal addition | |
1155 | ||
1156 | vadd.i64 $D3#lo,$D3#lo,$D3#hi | |
1157 | vadd.i64 $D0#lo,$D0#lo,$D0#hi | |
1158 | vadd.i64 $D4#lo,$D4#lo,$D4#hi | |
1159 | vadd.i64 $D1#lo,$D1#lo,$D1#hi | |
1160 | vadd.i64 $D2#lo,$D2#lo,$D2#hi | |
1161 | ||
1162 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
1163 | @ lazy reduction, but without narrowing | |
1164 | ||
1165 | vshr.u64 $T0,$D3,#26 | |
1166 | vand.i64 $D3,$D3,$MASK | |
1167 | vshr.u64 $T1,$D0,#26 | |
1168 | vand.i64 $D0,$D0,$MASK | |
1169 | vadd.i64 $D4,$D4,$T0 @ h3 -> h4 | |
1170 | vadd.i64 $D1,$D1,$T1 @ h0 -> h1 | |
1171 | ||
1172 | vshr.u64 $T0,$D4,#26 | |
1173 | vand.i64 $D4,$D4,$MASK | |
1174 | vshr.u64 $T1,$D1,#26 | |
1175 | vand.i64 $D1,$D1,$MASK | |
1176 | vadd.i64 $D2,$D2,$T1 @ h1 -> h2 | |
1177 | ||
1178 | vadd.i64 $D0,$D0,$T0 | |
1179 | vshl.u64 $T0,$T0,#2 | |
1180 | vshr.u64 $T1,$D2,#26 | |
1181 | vand.i64 $D2,$D2,$MASK | |
1182 | vadd.i64 $D0,$D0,$T0 @ h4 -> h0 | |
1183 | vadd.i64 $D3,$D3,$T1 @ h2 -> h3 | |
1184 | ||
1185 | vshr.u64 $T0,$D0,#26 | |
1186 | vand.i64 $D0,$D0,$MASK | |
1187 | vshr.u64 $T1,$D3,#26 | |
1188 | vand.i64 $D3,$D3,$MASK | |
1189 | vadd.i64 $D1,$D1,$T0 @ h0 -> h1 | |
1190 | vadd.i64 $D4,$D4,$T1 @ h3 -> h4 | |
1191 | ||
1192 | cmp $len,#0 | |
1193 | bne .Leven | |
1194 | ||
1195 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
1196 | @ store hash value | |
1197 | ||
1198 | vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]! | |
1199 | vst1.32 {$D4#lo[0]},[$ctx] | |
1200 | ||
1201 | vldmia sp!,{d8-d15} @ epilogue | |
1202 | ldmia sp!,{r4-r7} | |
1203 | ret @ bx lr | |
1204 | .size poly1305_blocks_neon,.-poly1305_blocks_neon | |
1205 | ||
1206 | .align 5 | |
1207 | .Lzeros: | |
1208 | .long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |
1209 | #ifndef __KERNEL__ | |
1210 | .LOPENSSL_armcap: | |
1211 | # ifdef _WIN32 | |
1212 | .word OPENSSL_armcap_P | |
1213 | # else | |
1214 | .word OPENSSL_armcap_P-.Lpoly1305_init | |
1215 | # endif | |
1216 | .comm OPENSSL_armcap_P,4,4 | |
1217 | .hidden OPENSSL_armcap_P | |
1218 | #endif | |
1219 | #endif | |
1220 | ___ | |
1221 | } } | |
# Append the trailing identification string and alignment directive
# emitted at the end of every generated assembly file.  NOTE: the
# heredoc body is literal assembler output — "\@" escapes the sigil so
# the generated .asciz contains a plain '@'; do not reformat this text.
$code.=<<___;
.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
.align	2
___
1226 | ||
# Emit the generated source: run every line of $code through a small
# set of textual rewrites, then print it to STDOUT (which may be a
# pipe into arm-xlate.pl — see the open() in the preamble).
for my $line (split /\n/, $code) {
	# Evaluate any `...` span as a Perl expression (constant folding
	# of offsets/immediates embedded in the assembly template).
	$line =~ s/\`([^\`]*)\`/eval $1/geo;

	# Rewrite "qN#lo"/"qN#hi" pseudo-syntax to the matching 64-bit
	# half of the NEON register: d(2N) for #lo, d(2N+1) for #hi.
	# Only if no such pseudo-register was present, turn "ret" into
	# "bx lr"; and only if that too left the line untouched, encode
	# a literal "bx lr" as a raw opcode word so the output still
	# assembles with -march=armv4.
	$line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo
	    or $line =~ s/\bret\b/bx lr/go
	    or $line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/go;

	print $line,"\n";
}
close STDOUT;	# enforce flush (STDOUT may be a pipe to the translator)