//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//	Erdinc Ozturk <erdinc.ozturk@intel.com>
//	Vinodh Gopal <vinodh.gopal@intel.com>
//	James Guilford <james.guilford@intel.com>
//	Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
//	UINT16 crc_t10dif_pcl(
//		UINT16 init_crc,          // initial CRC value, 16 bits
//		const unsigned char *buf, // buffer pointer to calculate CRC on
//		UINT64 len                // buffer length in bytes (64-bit data)
//	);
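//
// In this arm64 port the actual entry points are crc_t10dif_pmull_p8 and
// crc_t10dif_pmull_p64 (see the ENTRY() directives below); both follow the
// API above. A matching C-side declaration might look like this (an
// illustrative sketch; the exact name and types of the kernel glue
// prototype are an assumption, not taken from this file):
//
//	u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, u64 len);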
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.cpu		generic+crypto

	arg1_low32	.req	w19
	arg2		.req	x20
	arg3		.req	x21

	vzr		.req	v13

	ad		.req	v14
	bd		.req	v10

	k00_16		.req	v15
	k32_48		.req	v16

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

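// The p64 variants of the macros below are empty: when the 64x64 bit PMULL
// instruction from the Crypto Extensions is available, no setup is needed.
// The p8 variants emulate the same carryless multiply on top of the 8x8 bit
// PMULL that baseline NEON provides, and therefore need the masks and
// permutation vectors prepared by __pmull_init_p8.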
	.macro		__pmull_init_p64
	.endm

	.macro		__pmull_pre_p64, bd
	.endm

	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

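// __pmull_pre_p8: populate bd1-bd4 with copies of the polynomial in \bd,
// byte-rotated by 1 to 4 positions within each 64-bit half. These serve as
// the B1-B4 partial-product operands consumed by __pmull_p8_core below.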
	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm

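// __pmull_p8_core: carryless 64x64 -> 128 bit multiply using only 8x8 bit
// PMULL instructions. Entry at .L__pmull_p8_core operates on the low halves
// of ad and bd, entry at .L__pmull_p8_core2 on the high halves (the pmull2
// case). The byte-rotated copies of A computed here and of B (bd1-bd4,
// prepared by __pmull_pre_p8) yield the partial products E/F/G/H/I/J/K,
// which are masked, realigned with ext and combined with eor; the calling
// macro __pmull_p8 adds the final D = A*B term.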
__pmull_p8_core:
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, bd.8b			// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, bd.8b			// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, bd.8b			// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, bd.16b			// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, bd.16b			// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, bd.16b			// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
ENDPROC(__pmull_p8_core)

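// __pmull_p8: 64x64 bit carryless multiply of \ad by \bd via
// __pmull_p8_core. \bd must be v10: bd1-bd4 were derived from v10 by
// __pmull_pre_p8, so any other register is rejected with an assembly-time
// error by the .ifnc check.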
	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, v10
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, bd.8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, bd.16b		// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm

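// fold64: fold two 16-byte lanes of the running state (\reg1, \reg2) over
// the next 32 bytes of input: each lane is carryless-multiplied by the
// folding constants held in v10 and xored with the newly loaded,
// byte-reflected data.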
	.macro		fold64, p, reg1, reg2
	ldp		q11, q12, [arg2], #0x20

	__pmull_\p	v8, \reg1, v10, 2
	__pmull_\p	\reg1, \reg1, v10

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, v10, 2
	__pmull_\p	\reg2, \reg2, v10

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

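// fold16: fold one 16-byte register into the v7 accumulator. When \rk is
// given, the constant pair for the next fold16 invocation is loaded into
// v10 after the multiplies have consumed the current one.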
	.macro		fold16, p, reg, rk
	__pmull_\p	v8, \reg, v10
	__pmull_\p	\reg, \reg, v10, 2
	.ifnb		\rk
	ldr_l		q10, \rk, x8
	__pmull_pre_\p	v10
	.endif
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, \reg\().16b
	.endm

	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm

	.macro		crc_t10dif_pmull, p
	frame_push	3, 128

	mov		arg1_low32, w0
	mov		arg2, x1
	mov		arg3, x2

	movi		vzr.16b, #0		// init zero register

	__pmull_init_\p

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// check if smaller than 256
	cmp		arg3, #256

	// for sizes less than 256 bytes, we can't fold 128 bytes at a time
	b.lt		.L_less_than_128_\@

	// load the initial crc value
	// crc value does not need to be byte-reflected, but it needs
	// to be moved to the high part of the register.
	// because data will be byte-reflected and will align with
	// initial crc at correct place.
	movi		v10.16b, #0
	mov		v10.s[3], arg1_low32		// initial crc

	// receive the initial 128 bytes of data, xor the initial crc value
	ldp		q0, q1, [arg2]
	ldp		q2, q3, [arg2, #0x20]
	ldp		q4, q5, [arg2, #0x40]
	ldp		q6, q7, [arg2, #0x60]
	add		arg2, arg2, #0x80

CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)

CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the initial_crc value
	eor		v0.16b, v0.16b, v10.16b

	ldr_l		q10, rk3, x8	// v10 has rk3 and rk4
					// type of pmull instruction
					// will determine which constant to use
	__pmull_pre_\p	v10

	//
	// we subtract 256 instead of 128 to save one instruction from the loop
	//
	sub		arg3, arg3, #256
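	// (128 bytes of input are already live in v0-v7 and each loop
	// iteration consumes another 128, so biasing the count by 256 up
	// front lets the single subs/b.lt pair inside the loop double as
	// the exit test.)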

	// at this section of the code, there is 128*x+y (0 <= y < 128) bytes
	// of buffer. The .L_fold_64_B_loop will fold 128 bytes at a time
	// until we have 128+y bytes of buffer

	// fold 128 bytes at a time. This section of the code folds 8 vector
	// registers in parallel
.L_fold_64_B_loop_\@:

	fold64		\p, v0, v1
	fold64		\p, v2, v3
	fold64		\p, v4, v5
	fold64		\p, v6, v7

	subs		arg3, arg3, #128

	// check if there is another 128 bytes in the buffer to be able to fold
	b.lt		.L_fold_64_B_end_\@

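	// Before looping again, check whether the kernel wants the NEON
	// unit back; if so, spill the 128 bytes of fold state to the frame,
	// yield, then reload the state and reinitialize the clobbered
	// constants.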
	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
	do_cond_yield_neon
	ldp		q0, q1, [sp, #.Lframe_local_offset]
	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
	ldr_l		q10, rk3, x8
	movi		vzr.16b, #0		// init zero register
	__pmull_init_\p
	__pmull_pre_\p	v10
	endif_yield_neon

	b		.L_fold_64_B_loop_\@

.L_fold_64_B_end_\@:
	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer, and the 128 bytes of folded data are in 8 of the
	// vector registers: v0-v7

	// fold the 8 vector registers to 1 vector register with different
	// constants

	ldr_l		q10, rk9, x8
	__pmull_pre_\p	v10

	fold16		\p, v0, rk11
	fold16		\p, v1, rk13
	fold16		\p, v2, rk15
	fold16		\p, v3, rk17
	fold16		\p, v4, rk19
	fold16		\p, v5, rk1
	fold16		\p, v6

	// instead of 16, we add 128-16 to the loop counter to save 1
	// instruction from the loop; instead of a cmp instruction, we use
	// the sign flag with the b.lt instruction
	adds		arg3, arg3, #(128-16)
	b.lt		.L_final_reduction_for_128_\@

	// now we have 16+y bytes left to reduce. 16 bytes are in register v7
	// and the rest is in memory. We can fold 16 bytes at a time if y >= 16
	// continue folding 16 bytes at a time

.L_16B_reduction_loop_\@:
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b

	ldr		q0, [arg2], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags with the
	// b.ge instruction, equivalent of: cmp arg3, 16-16
	// check if there is any more 16 bytes in the buffer to be able to fold
	b.ge		.L_16B_reduction_loop_\@

	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the v7 register

.L_final_reduction_for_128_\@:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	b.eq		.L_128_done_\@

	// here we are getting data that is less than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer before the actual point, to receive
	// exactly 16 bytes. after that the registers need to be adjusted.
.L_get_last_two_regs_\@:
	add		arg2, arg2, arg3
	ldr		q1, [arg2, #-16]
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)

	// get rid of the extra data that was loaded before
	// load the shift constant
	adr_l		x4, tbl_shf_table + 16
	sub		x4, x4, arg3
	ld1		{v0.16b}, [x4]

	// shift v2 to the left by arg3 bytes
	tbl		v2.16b, {v7.16b}, v0.16b

	// shift v7 to the right by 16-arg3 bytes
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b

	// blend
	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
	bsl		v0.16b, v2.16b, v1.16b

	// fold 16 bytes
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, v0.16b

.L_128_done_\@:
	// compute crc of a 128-bit value
	ldr_l		q10, rk5, x8		// rk5 and rk6 in v10
	__pmull_pre_\p	v10

	// 64b fold
	ext		v0.16b, vzr.16b, v7.16b, #8
	mov		v7.d[0], v7.d[1]
	__pmull_\p	v7, v7, v10
	eor		v7.16b, v7.16b, v0.16b

	// 32b fold
	ext		v0.16b, v7.16b, vzr.16b, #4
	mov		v7.s[3], vzr.s[0]
	__pmull_\p	v0, v0, v10, 2
	eor		v7.16b, v7.16b, v0.16b

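	// Barrett reduction: divide the remaining value by Q to obtain the
	// final CRC. rk7 = floor(2^64/Q) gives an approximation of the
	// quotient via one carryless multiply; a second multiply by Q (rk8,
	// packed in the same 16-byte constant) and an xor then leave the
	// remainder, i.e. the CRC, in v7.s[1].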
	// barrett reduction
	ldr_l		q10, rk7, x8
	__pmull_pre_\p	v10
	mov		v0.d[0], v7.d[1]

	__pmull_\p	v0, v0, v10
	ext		v0.16b, vzr.16b, v0.16b, #12
	__pmull_\p	v0, v0, v10, 2
	ext		v0.16b, vzr.16b, v0.16b, #12
	eor		v7.16b, v7.16b, v0.16b
	mov		w0, v7.s[1]

.L_cleanup_\@:
	// scale the result back to 16 bits
	lsr		x0, x0, #16
	frame_pop
	ret

.L_less_than_128_\@:
	cbz		arg3, .L_cleanup_\@

	movi		v0.16b, #0
	mov		v0.s[3], arg1_low32		// get the initial crc value

	ldr		q7, [arg2], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b		// xor the initial crc value

	cmp		arg3, #16
	b.eq		.L_128_done_\@			// exactly 16 left
	b.lt		.L_less_than_16_left_\@

	ldr_l		q10, rk1, x8			// rk1 and rk2 in v10
	__pmull_pre_\p	v10

	// update the counter. subtract 32 instead of 16 to save one
	// instruction from the loop
	subs		arg3, arg3, #32
	b.ge		.L_16B_reduction_loop_\@

	add		arg3, arg3, #16
	b		.L_get_last_two_regs_\@

.L_less_than_16_left_\@:
	// shift v7 to the right by 16-arg3 bytes, discarding the bytes
	// beyond the arg3 valid ones and zero-filling from the top
	adr_l		x0, tbl_shf_table + 16
	sub		x0, x0, arg3
	ld1		{v0.16b}, [x0]
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b
	b		.L_128_done_\@
	.endm

ENTRY(crc_t10dif_pmull_p8)
	crc_t10dif_pmull	p8
ENDPROC(crc_t10dif_pmull_p8)

	.align		5
ENTRY(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
ENDPROC(crc_t10dif_pmull_p64)

// precomputed constants
// these constants are precomputed from the poly:
//	0x8bb70000 (0x8bb7 scaled to 32 bits)
	.section	".rodata", "a"
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q

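// Each .octa below packs two constants into one 16-byte slot: the labelled
// constant in the low 64 bits and its partner (rk2, rk4, ...) in the high
// 64 bits, so a single ldr_l q10 fetches a whole folding pair.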
rk1:	.octa		0x06df0000000000002d56000000000000
rk3:	.octa		0x7cf50000000000009d9d000000000000
rk5:	.octa		0x13680000000000002d56000000000000
rk7:	.octa		0x000000018bb7000000000001f65a57f8
rk9:	.octa		0xbfd6000000000000ceae000000000000
rk11:	.octa		0x713c0000000000001e16000000000000
rk13:	.octa		0x80a6000000000000f7f9000000000000
rk15:	.octa		0xe658000000000000044c000000000000
rk17:	.octa		0xa497000000000000ad18000000000000
rk19:	.octa		0xe7b50000000000006ee3000000000000

tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
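//
// Table indices with the top bit set (0x81-0x8f) make tbl substitute a zero
// byte, so loading 16 bytes from tbl_shf_table + 16 - n produces a mask
// that shifts a vector left by n bytes; xoring that mask with 0x80 flips
// every top bit and yields the complementary right-shift by 16 - n.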

	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0