//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
// UINT16 crc_t10dif_pcl(
//               UINT16 init_crc, //initial CRC value, 16 bits
//               const unsigned char *buf, //buffer pointer to calculate CRC on
//               UINT64 len //buffer length in bytes (64-bit data)
// );
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//
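
// A note on the method (a sketch, following the paper referenced above):
// if the message is split as M(x) = A(x)*x^T + B(x), then
//
//	M(x) mod P(x) = ((A(x) * (x^T mod P(x))) + B(x)) mod P(x)
//
// so a leading chunk can be "folded" into data T bits further along with
// a single carryless multiply by the precomputed constant x^T mod P(x).
// The rk* constants at the end of this file are such precomputed powers
// for the various fold distances used below.
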
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.cpu		generic+crypto
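	// the .cpu directive above only lets the assembler accept the
	// Crypto Extensions mnemonics; at runtime, the C glue code is
	// expected to pick the _p8 or _p64 entry point to match the CPU's
	// actual features (an assumption about the caller, not something
	// enforced in this file)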
	arg1_low32	.req	w19
	arg2		.req	x20
	arg3		.req	x21

	vzr		.req	v13
	ad		.req	v14
	bd		.req	v10

	k00_16		.req	v15
	k32_48		.req	v16

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31
	.macro		__pmull_init_p64
	.endm

	.macro		__pmull_pre_p64, bd
	.endm
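
	// the _p64 variants above are intentionally empty: the Crypto
	// Extensions provide a native 64x64->128 bit carryless multiply,
	// so no setup work is needed before using it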

	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm
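
	// bd1..bd4 now hold the bytes of \bd rotated by 1..4 positions;
	// the 8-bit multiply fallback below combines partial products
	// against these rotated copies to build a wider carryless multiply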
139
140__pmull_p8_core:
141.L__pmull_p8_core:
142 ext t4.8b, ad.8b, ad.8b, #1 // A1
143 ext t5.8b, ad.8b, ad.8b, #2 // A2
144 ext t6.8b, ad.8b, ad.8b, #3 // A3
145
146 pmull t4.8h, t4.8b, bd.8b // F = A1*B
147 pmull t8.8h, ad.8b, bd1.8b // E = A*B1
148 pmull t5.8h, t5.8b, bd.8b // H = A2*B
149 pmull t7.8h, ad.8b, bd2.8b // G = A*B2
150 pmull t6.8h, t6.8b, bd.8b // J = A3*B
151 pmull t9.8h, ad.8b, bd3.8b // I = A*B3
152 pmull t3.8h, ad.8b, bd4.8b // K = A*B4
153 b 0f
154
155.L__pmull_p8_core2:
156 tbl t4.16b, {ad.16b}, perm1.16b // A1
157 tbl t5.16b, {ad.16b}, perm2.16b // A2
158 tbl t6.16b, {ad.16b}, perm3.16b // A3
159
160 pmull2 t4.8h, t4.16b, bd.16b // F = A1*B
161 pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
162 pmull2 t5.8h, t5.16b, bd.16b // H = A2*B
163 pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
164 pmull2 t6.8h, t6.16b, bd.16b // J = A3*B
165 pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
166 pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
167
1680: eor t4.16b, t4.16b, t8.16b // L = E + F
169 eor t5.16b, t5.16b, t7.16b // M = G + H
170 eor t6.16b, t6.16b, t9.16b // N = I + J
171
172 uzp1 t8.2d, t4.2d, t5.2d
173 uzp2 t4.2d, t4.2d, t5.2d
174 uzp1 t7.2d, t6.2d, t3.2d
175 uzp2 t6.2d, t6.2d, t3.2d
176
177 // t4 = (L) (P0 + P1) << 8
178 // t5 = (M) (P2 + P3) << 16
179 eor t8.16b, t8.16b, t4.16b
180 and t4.16b, t4.16b, k32_48.16b
181
182 // t6 = (N) (P4 + P5) << 24
183 // t7 = (K) (P6 + P7) << 32
184 eor t7.16b, t7.16b, t6.16b
185 and t6.16b, t6.16b, k00_16.16b
186
187 eor t8.16b, t8.16b, t4.16b
188 eor t7.16b, t7.16b, t6.16b
189
190 zip2 t5.2d, t8.2d, t4.2d
191 zip1 t4.2d, t8.2d, t4.2d
192 zip2 t3.2d, t7.2d, t6.2d
193 zip1 t6.2d, t7.2d, t6.2d
194
195 ext t4.16b, t4.16b, t4.16b, #15
196 ext t5.16b, t5.16b, t5.16b, #14
197 ext t6.16b, t6.16b, t6.16b, #13
198 ext t3.16b, t3.16b, t3.16b, #12
199
200 eor t4.16b, t4.16b, t5.16b
201 eor t6.16b, t6.16b, t3.16b
202 ret
203ENDPROC(__pmull_p8_core)
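
// A sketch of the scheme above: with only the 8x8->16 bit polynomial
// multiply available, the wide product is assembled from partial
// products of A and B against byte-rotated copies of each other
// (A1*B + A*B1 shifted by 8 bits, A2*B + A*B2 shifted by 16 bits, and
// so on), with the k00_16/k32_48 masks clearing the terms that wrapped
// around; the ext instructions with #15/#14/#13/#12 perform the byte
// shifts.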

	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, v10
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, bd.8b	// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, bd.16b	// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm
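
	// the B operand must be passed in v10 (bd): __pmull_pre_p8 derived
	// bd1..bd4 from v10, and the .ifnc check above enforces this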

	.macro		fold64, p, reg1, reg2
	ldp		q11, q12, [arg2], #0x20

	__pmull_\p	v8, \reg1, v10, 2
	__pmull_\p	\reg1, \reg1, v10

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, v10, 2
	__pmull_\p	\reg2, \reg2, v10

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm
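
	// each fold64 computes, for both of its registers,
	//	reg := (reg_lo * rk3) xor (reg_hi * rk4) xor new_data
	// folding the accumulated remainder onto the data loaded one full
	// loop iteration (128 bytes) further along the buffer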

	.macro		fold16, p, reg, rk
	__pmull_\p	v8, \reg, v10
	__pmull_\p	\reg, \reg, v10, 2
	.ifnb		\rk
	ldr_l		q10, \rk, x8
	__pmull_pre_\p	v10
	.endif
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, \reg\().16b
	.endm
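
	// same fold at a 16-byte distance, used to chain v0..v7 down into
	// the single accumulator v7; when \rk is given, the constant pair
	// for the next link of the chain is preloaded into v10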

	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm
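
	// pmull/pmull2 with the .1q/.1d/.2d arrangements perform a full
	// 64x64->128 bit carryless multiply; this is the Crypto Extensions
	// instruction that the p8 code path above emulates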

	.macro		crc_t10dif_pmull, p
	frame_push	3, 128

	mov		arg1_low32, w0
	mov		arg2, x1
	mov		arg3, x2

	movi		vzr.16b, #0		// init zero register

	__pmull_init_\p

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// check if smaller than 256
	cmp		arg3, #256

	// for sizes less than 256 bytes, we can't fold 128B at a time
	b.lt		.L_less_than_128_\@

	// load the initial crc value. The crc value does not need to be
	// byte-reflected, but it needs to be moved to the high part of the
	// register, because the data will be byte-reflected and will align
	// with the initial crc in the correct place.
	movi		v10.16b, #0
	mov		v10.s[3], arg1_low32		// initial crc

	// receive the initial 128B data, xor the initial crc value
	ldp		q0, q1, [arg2]
	ldp		q2, q3, [arg2, #0x20]
	ldp		q4, q5, [arg2, #0x40]
	ldp		q6, q7, [arg2, #0x60]
	add		arg2, arg2, #0x80

CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)

CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
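
	// on little-endian, the rev64 + ext #8 pairs byte-reverse each
	// 16-byte block, giving the byte order the folding math assumes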

	// XOR the initial_crc value
	eor		v0.16b, v0.16b, v10.16b

	ldr_l		q10, rk3, x8		// v10 has rk3 and rk4
						// type of pmull instruction
						// will determine which constant to use
	__pmull_pre_\p	v10

	//
	// we subtract 256 instead of 128 to save one instruction from the loop
	//
	sub		arg3, arg3, #256

	// at this section of the code, there are 128*x+y (0 <= y < 128)
	// bytes of buffer. The fold_64_B_loop will fold 128B at a time
	// until we have 128+y bytes of buffer

	// fold 128B at a time. This section of the code folds 8 vector
	// registers in parallel
.L_fold_64_B_loop_\@:

	fold64		\p, v0, v1
	fold64		\p, v2, v3
	fold64		\p, v4, v5
	fold64		\p, v6, v7

	subs		arg3, arg3, #128

	// check if there is another 128B in the buffer to be able to fold
	b.lt		.L_fold_64_B_end_\@
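
	// preserve the 128 bytes of live folded state across a voluntary
	// preemption yield: the NEON registers are not preserved over the
	// yield, so spill them to the frame and reload the constants after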
	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
	do_cond_yield_neon
	ldp		q0, q1, [sp, #.Lframe_local_offset]
	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
	ldr_l		q10, rk3, x8
	movi		vzr.16b, #0		// init zero register
	__pmull_init_\p
	__pmull_pre_\p	v10
	endif_yield_neon

	b		.L_fold_64_B_loop_\@

.L_fold_64_B_end_\@:
	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer, and the 128B of folded data are in 8 of the vector
	// registers: v0-v7

	// fold the 8 vector registers to 1 vector register with different
	// constants

	ldr_l		q10, rk9, x8
	__pmull_pre_\p	v10

	fold16		\p, v0, rk11
	fold16		\p, v1, rk13
	fold16		\p, v2, rk15
	fold16		\p, v3, rk17
	fold16		\p, v4, rk19
	fold16		\p, v5, rk1
	fold16		\p, v6

	// instead of 128, we add 112 (128-16) to the loop counter to save
	// one instruction from the loop; instead of a cmp instruction, we
	// use the negative flag with the b.lt instruction
	adds		arg3, arg3, #(128-16)
	b.lt		.L_final_reduction_for_128_\@

	// now we have 16+y bytes left to reduce. 16 bytes are in register
	// v7 and the rest is in memory. We can fold 16 bytes at a time if
	// y >= 16; continue folding 16B at a time

.L_16B_reduction_loop_\@:
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b

	ldr		q0, [arg2], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags with the
	// b.ge instruction, the equivalent of: cmp arg3, 16-16
	// check if there is any more 16B in the buffer to be able to fold
	b.ge		.L_16B_reduction_loop_\@

	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the v7 register

.L_final_reduction_for_128_\@:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	b.eq		.L_128_done_\@

	// here we are getting data that is less than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer before the actual point, to receive
	// exactly 16 bytes. after that the registers need to be adjusted.
.L_get_last_two_regs_\@:
	add		arg2, arg2, arg3
	ldr		q1, [arg2, #-16]
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)

	// get rid of the extra data that was loaded before
	// load the shift constant
	adr_l		x4, tbl_shf_table + 16
	sub		x4, x4, arg3
	ld1		{v0.16b}, [x4]

	// shift v2 to the left by arg3 bytes
	tbl		v2.16b, {v7.16b}, v0.16b

	// shift v7 to the right by 16-arg3 bytes
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b

	// blend
	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
	bsl		v0.16b, v2.16b, v1.16b
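
	// note: tbl returns zero for any index byte with its top bit set,
	// so the 0x8x table entries select zero bytes; flipping bit 7 with
	// the eor above gives the complementary selection, and the top bit
	// of each table byte doubles as the bsl blend mask after the sshr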

	// fold 16 Bytes
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, v0.16b

.L_128_done_\@:
	// compute crc of a 128-bit value
	ldr_l		q10, rk5, x8		// rk5 and rk6 in v10
	__pmull_pre_\p	v10

	// 64b fold
	ext		v0.16b, vzr.16b, v7.16b, #8
	mov		v7.d[0], v7.d[1]
	__pmull_\p	v7, v7, v10
	eor		v7.16b, v7.16b, v0.16b

	// 32b fold
	ext		v0.16b, v7.16b, vzr.16b, #4
	mov		v7.s[3], vzr.s[0]
	__pmull_\p	v0, v0, v10, 2
	eor		v7.16b, v7.16b, v0.16b

	// barrett reduction
	ldr_l		q10, rk7, x8
	__pmull_pre_\p	v10
	mov		v0.d[0], v7.d[1]

	__pmull_\p	v0, v0, v10
	ext		v0.16b, vzr.16b, v0.16b, #12
	__pmull_\p	v0, v0, v10, 2
	ext		v0.16b, vzr.16b, v0.16b, #12
	eor		v7.16b, v7.16b, v0.16b
	mov		w0, v7.s[1]
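
	// the barrett reduction above follows the reference paper: the
	// upper half of the remainder is multiplied by rk7 = floor(2^64/Q),
	// the (shifted) result by rk8 = Q, and the final xor leaves the
	// 32-bit-scaled CRC in v7.s[1]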

.L_cleanup_\@:
	// scale the result back to 16 bits
	lsr		x0, x0, #16
	frame_pop
	ret

.L_less_than_128_\@:
	cbz		arg3, .L_cleanup_\@

	movi		v0.16b, #0
	mov		v0.s[3], arg1_low32	// get the initial crc value

	ldr		q7, [arg2], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value

	cmp		arg3, #16
	b.eq		.L_128_done_\@		// exactly 16 left
	b.lt		.L_less_than_16_left_\@
	ldr_l		q10, rk1, x8		// rk1 and rk2 in v10
	__pmull_pre_\p	v10

	// update the counter. subtract 32 instead of 16 to save one
	// instruction from the loop
	subs		arg3, arg3, #32
	b.ge		.L_16B_reduction_loop_\@

	add		arg3, arg3, #16
	b		.L_get_last_two_regs_\@
.L_less_than_16_left_\@:
	// shift v7 to the right by 16-arg3 bytes, using the shift table
	adr_l		x0, tbl_shf_table + 16
	sub		x0, x0, arg3
	ld1		{v0.16b}, [x0]
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b
	b		.L_128_done_\@
	.endm
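
// Two entry points with identical logic: the _p64 variant uses the
// 64x64->128 bit carryless multiply from the Crypto Extensions, while
// the _p8 variant synthesizes it from the baseline 8-bit pmull for
// CPUs that lack the PMULL feature.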
ENTRY(crc_t10dif_pmull_p8)
	crc_t10dif_pmull	p8
ENDPROC(crc_t10dif_pmull_p8)

	.align		5
ENTRY(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
ENDPROC(crc_t10dif_pmull_p64)

// precomputed constants
// these constants are precomputed from the poly:
//	0x8bb70000 (0x8bb7 scaled to 32 bits)
	.section	".rodata", "a"
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q

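// each .octa below packs a pair of 64-bit constants, e.g. the rk1 entry
// holds rk1 in its low half and rk2 in its high half; rk9 and up are the
// extra fold-distance constants consumed by the fold16 chain above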
rk1:	.octa		0x06df0000000000002d56000000000000
rk3:	.octa		0x7cf50000000000009d9d000000000000
rk5:	.octa		0x13680000000000002d56000000000000
rk7:	.octa		0x000000018bb7000000000001f65a57f8
rk9:	.octa		0xbfd6000000000000ceae000000000000
rk11:	.octa		0x713c0000000000001e16000000000000
rk13:	.octa		0x80a6000000000000f7f9000000000000
rk15:	.octa		0xe658000000000000044c000000000000
rk17:	.octa		0xa497000000000000ad18000000000000
rk19:	.octa		0xe7b50000000000006ee3000000000000

tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15

	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0