/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.file "cast5-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km	0
#define kr	(16*4)
#define rr	((16*4)+16)
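
/*
 * These offsets match struct cast5_ctx (include/crypto/cast5.h):
 * sixteen u32 masking keys Km, then sixteen u8 rotation keys Kr,
 * then the int rr flag (non-zero selects the 12-round variant).
 */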
25 | ||
26 | /* s-boxes */ | |
044ab525 JK |
27 | #define s1 cast_s1 |
28 | #define s2 cast_s2 | |
29 | #define s3 cast_s3 | |
30 | #define s4 cast_s4 | |
4d6d6a2c JG |
31 | |
32 | /********************************************************************** | |
33 | 16-way AVX cast5 | |
34 | **********************************************************************/ | |
#define CTX %r15

#define RL1 %xmm0
#define RR1 %xmm1
#define RL2 %xmm2
#define RR2 %xmm3
#define RL3 %xmm4
#define RR3 %xmm5
#define RL4 %xmm6
#define RR4 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12

#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	shrq $16, src; \
	movl s1(, RID1, 4), dst ## d; \
	op1 s2(, RID2, 4), dst ## d; \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	interleave_op(il_reg); \
	op2 s3(, RID1, 4), dst ## d; \
	op3 s4(, RID2, 4), dst ## d;
96 | ||
ddaea786 JK |
97 | #define dummy(d) /* do nothing */ |
98 | ||
99 | #define shr_next(reg) \ | |
100 | shrq $16, reg; | |
101 | ||

#define F_head(a, x, gi1, gi2, op0) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vmovq x, gi1; \
	vpextrq $1, x, gi2;
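
/*
 * F_head computes I = (Km op0 D) <<< Kr for four blocks at once.  AVX
 * has no 32-bit vector rotate, so the rotation is a vpslld/vpsrld pair
 * OR'd together; the two 64-bit lanes are then moved out to
 * general-purpose registers for the scalar S-box lookups.
 */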
110 | ||
111 | #define F_tail(a, x, gi1, gi2, op1, op2, op3) \ | |
112 | lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \ | |
113 | lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \ | |
4d6d6a2c | 114 | \ |
ddaea786 JK |
115 | lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \ |
116 | shlq $32, RFS2; \ | |
117 | orq RFS1, RFS2; \ | |
118 | lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \ | |
119 | shlq $32, RFS1; \ | |
120 | orq RFS1, RFS3; \ | |
4d6d6a2c | 121 | \ |
ddaea786 | 122 | vmovq RFS2, x; \ |
4d6d6a2c JG |
123 | vpinsrq $1, RFS3, x, x; |
124 | ||
ddaea786 JK |
125 | #define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \ |
126 | F_head(b1, RX, RGI1, RGI2, op0); \ | |
127 | F_head(b2, RX, RGI3, RGI4, op0); \ | |
128 | \ | |
129 | F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \ | |
130 | F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \ | |
131 | \ | |
132 | vpxor a1, RX, a1; \ | |
133 | vpxor a2, RTMP, a2; | |
134 | ||
135 | #define F1_2(a1, b1, a2, b2) \ | |
136 | F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl) | |
137 | #define F2_2(a1, b1, a2, b2) \ | |
138 | F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl) | |
139 | #define F3_2(a1, b1, a2, b2) \ | |
140 | F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl) | |
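
/*
 * F1_2/F2_2/F3_2 are the three round function types of RFC 2144
 * section 2.2; e.g. type 1 is I = ((Km + D) <<< Kr) followed by
 * f = ((S1[Ia] ^ S2[Ib]) - S3[Ic]) + S4[Id].
 */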

#define subround(a1, b1, a2, b2, f) \
	F ## f ## _2(a1, b1, a2, b2);

#define round(l, r, n, f) \
	vbroadcastss (km+(4*n))(CTX), RKM; \
	vpand R1ST, RKR, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	vpsrldq $1, RKR, RKR; \
	subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
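
/*
 * In round(), RKRF is the current 5-bit rotate count (the low byte of
 * RKR masked with .Lfirst_mask), RKRR = 32 - RKRF is the matching
 * right-shift count (via .L32_mask), and vpsrldq $1 advances RKR to
 * the next round's Kr byte.
 */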
152 | ||
153 | #define enc_preload_rkr() \ | |
154 | vbroadcastss .L16_mask, RKR; \ | |
155 | /* add 16-bit rotation to key rotations (mod 32) */ \ | |
156 | vpxor kr(CTX), RKR, RKR; | |
4d6d6a2c | 157 | |
ddaea786 JK |
158 | #define dec_preload_rkr() \ |
159 | vbroadcastss .L16_mask, RKR; \ | |
160 | /* add 16-bit rotation to key rotations (mod 32) */ \ | |
161 | vpxor kr(CTX), RKR, RKR; \ | |
162 | vpshufb .Lbswap128_mask, RKR, RKR; | |
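
/*
 * Kr values are 5 bits wide, so XORing each byte with 16 adds 16
 * (mod 32) to every rotation count.  The extra quarter rotation moves
 * input bytes Ia/Ib into the low half of the rotated word, matching
 * the order in which lookup_32bit() consumes them.  For decryption the
 * Kr bytes are also reversed so that vpsrldq $1 walks them from round
 * 16 down to round 1.
 */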

#define transpose_2x4(x0, x1, t0, t1) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t1; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1;
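
/*
 * transpose_2x4 gathers the even dwords of x0/x1 into x0 and the odd
 * dwords into x1.
 */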
170 | ||
c12ab20b | 171 | #define inpack_blocks(x0, x1, t0, t1, rmask) \ |
ddaea786 JK |
172 | vpshufb rmask, x0, x0; \ |
173 | vpshufb rmask, x1, x1; \ | |
4d6d6a2c JG |
174 | \ |
175 | transpose_2x4(x0, x1, t0, t1) | |
176 | ||
c12ab20b | 177 | #define outunpack_blocks(x0, x1, t0, t1, rmask) \ |
4d6d6a2c JG |
178 | transpose_2x4(x0, x1, t0, t1) \ |
179 | \ | |
ddaea786 | 180 | vpshufb rmask, x0, x0; \ |
c12ab20b | 181 | vpshufb rmask, x1, x1; |
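
/*
 * After inpack_blocks(), each RLn register therefore holds the
 * (byteswapped) left halves of four consecutive 64-bit blocks and each
 * RRn the matching right halves, so one round() invocation advances
 * all 16 blocks; outunpack_blocks() reverses the layout.
 */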

.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section	.rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
.align 16
.Lbswap_iv_mask:
	.byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0

.section	.rodata.cst4.16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16
.section	.rodata.cst4.32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0
.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 16
__cast5_enc_blk16:
	/* input:
	 *	%rdi: ctx
	 *	RL1: blocks 1 and 2
	 *	RR1: blocks 3 and 4
	 *	RL2: blocks 5 and 6
	 *	RR2: blocks 7 and 8
	 *	RL3: blocks 9 and 10
	 *	RR3: blocks 11 and 12
	 *	RL4: blocks 13 and 14
	 *	RR4: blocks 15 and 16
	 * output:
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 */
233 | ||
4b156066 | 234 | pushq %r15; |
4d6d6a2c | 235 | pushq %rbx; |
4d6d6a2c | 236 | |
4b156066 JP |
237 | movq %rdi, CTX; |
238 | ||
ddaea786 JK |
239 | vmovdqa .Lbswap_mask, RKM; |
240 | vmovd .Lfirst_mask, R1ST; | |
241 | vmovd .L32_mask, R32; | |
242 | enc_preload_rkr(); | |
4d6d6a2c | 243 | |
c12ab20b JK |
244 | inpack_blocks(RL1, RR1, RTMP, RX, RKM); |
245 | inpack_blocks(RL2, RR2, RTMP, RX, RKM); | |
246 | inpack_blocks(RL3, RR3, RTMP, RX, RKM); | |
247 | inpack_blocks(RL4, RR4, RTMP, RX, RKM); | |
4d6d6a2c JG |
248 | |
249 | round(RL, RR, 0, 1); | |
250 | round(RR, RL, 1, 2); | |
251 | round(RL, RR, 2, 3); | |
252 | round(RR, RL, 3, 1); | |
253 | round(RL, RR, 4, 2); | |
254 | round(RR, RL, 5, 3); | |
255 | round(RL, RR, 6, 1); | |
256 | round(RR, RL, 7, 2); | |
257 | round(RL, RR, 8, 3); | |
258 | round(RR, RL, 9, 1); | |
259 | round(RL, RR, 10, 2); | |
260 | round(RR, RL, 11, 3); | |
261 | ||
	movzbl rr(CTX), %eax;
	testl %eax, %eax;
	jnz .L__skip_enc;

	round(RL, RR, 12, 1);
	round(RR, RL, 13, 2);
	round(RL, RR, 14, 3);
	round(RR, RL, 15, 1);

.L__skip_enc:
	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
	outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
	outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
	outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

	ret;
ENDPROC(__cast5_enc_blk16)

.align 16
__cast5_dec_blk16:
	/* input:
	 *	%rdi: ctx
	 *	RL1: encrypted blocks 1 and 2
	 *	RR1: encrypted blocks 3 and 4
	 *	RL2: encrypted blocks 5 and 6
	 *	RR2: encrypted blocks 7 and 8
	 *	RL3: encrypted blocks 9 and 10
	 *	RR3: encrypted blocks 11 and 12
	 *	RL4: encrypted blocks 13 and 14
	 *	RR4: encrypted blocks 15 and 16
	 * output:
	 *	RL1: decrypted blocks 1 and 2
	 *	RR1: decrypted blocks 3 and 4
	 *	RL2: decrypted blocks 5 and 6
	 *	RR2: decrypted blocks 7 and 8
	 *	RL3: decrypted blocks 9 and 10
	 *	RR3: decrypted blocks 11 and 12
	 *	RL4: decrypted blocks 13 and 14
	 *	RR4: decrypted blocks 15 and 16
	 */
307 | ||
4b156066 | 308 | pushq %r15; |
4d6d6a2c JG |
309 | pushq %rbx; |
310 | ||
4b156066 JP |
311 | movq %rdi, CTX; |
312 | ||
ddaea786 JK |
313 | vmovdqa .Lbswap_mask, RKM; |
314 | vmovd .Lfirst_mask, R1ST; | |
315 | vmovd .L32_mask, R32; | |
316 | dec_preload_rkr(); | |
4d6d6a2c | 317 | |
c12ab20b JK |
318 | inpack_blocks(RL1, RR1, RTMP, RX, RKM); |
319 | inpack_blocks(RL2, RR2, RTMP, RX, RKM); | |
320 | inpack_blocks(RL3, RR3, RTMP, RX, RKM); | |
321 | inpack_blocks(RL4, RR4, RTMP, RX, RKM); | |
4d6d6a2c | 322 | |
ddaea786 JK |
323 | movzbl rr(CTX), %eax; |
324 | testl %eax, %eax; | |
e17e209e | 325 | jnz .L__skip_dec; |
4d6d6a2c JG |
326 | |
327 | round(RL, RR, 15, 1); | |
328 | round(RR, RL, 14, 3); | |
329 | round(RL, RR, 13, 2); | |
330 | round(RR, RL, 12, 1); | |
331 | ||
e17e209e | 332 | .L__dec_tail: |
4d6d6a2c JG |
333 | round(RL, RR, 11, 3); |
334 | round(RR, RL, 10, 2); | |
335 | round(RL, RR, 9, 1); | |
336 | round(RR, RL, 8, 3); | |
337 | round(RL, RR, 7, 2); | |
338 | round(RR, RL, 6, 1); | |
339 | round(RL, RR, 5, 3); | |
340 | round(RR, RL, 4, 2); | |
341 | round(RL, RR, 3, 1); | |
342 | round(RR, RL, 2, 3); | |
343 | round(RL, RR, 1, 2); | |
344 | round(RR, RL, 0, 1); | |
345 | ||
ddaea786 | 346 | vmovdqa .Lbswap_mask, RKM; |
4d6d6a2c | 347 | popq %rbx; |
4b156066 | 348 | popq %r15; |
4d6d6a2c | 349 | |
c12ab20b JK |
350 | outunpack_blocks(RR1, RL1, RTMP, RX, RKM); |
351 | outunpack_blocks(RR2, RL2, RTMP, RX, RKM); | |
352 | outunpack_blocks(RR3, RL3, RTMP, RX, RKM); | |
353 | outunpack_blocks(RR4, RL4, RTMP, RX, RKM); | |
4d6d6a2c JG |
354 | |
355 | ret; | |
ddaea786 | 356 | |
e17e209e | 357 | .L__skip_dec: |
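	/*
	 * 12-round mode: RKR still holds all 16 (reversed) Kr bytes, so
	 * discard the four belonging to the unused rounds 16..13 before
	 * rejoining the common tail.
	 */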
	vpsrldq $4, RKR, RKR;
	jmp .L__dec_tail;
ENDPROC(__cast5_dec_blk16)
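
/*
 * The entry points below are called from the C glue code
 * (arch/x86/crypto/cast5_avx_glue.c), which declares them along the
 * lines of:
 *
 *	asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx,
 *					    u8 *dst, const u8 *src);
 *
 * cast5_ctr_16way() takes an additional __be64 *iv argument.
 */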

ENTRY(cast5_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_enc_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast5_ecb_enc_16way)

ENTRY(cast5_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	vmovdqu (0*4*4)(%rdx), RL1;
	vmovdqu (1*4*4)(%rdx), RR1;
	vmovdqu (2*4*4)(%rdx), RL2;
	vmovdqu (3*4*4)(%rdx), RR2;
	vmovdqu (4*4*4)(%rdx), RL3;
	vmovdqu (5*4*4)(%rdx), RR3;
	vmovdqu (6*4*4)(%rdx), RL4;
	vmovdqu (7*4*4)(%rdx), RR4;

	call __cast5_dec_blk16;

	vmovdqu RR1, (0*4*4)(%r11);
	vmovdqu RL1, (1*4*4)(%r11);
	vmovdqu RR2, (2*4*4)(%r11);
	vmovdqu RL2, (3*4*4)(%r11);
	vmovdqu RR3, (4*4*4)(%r11);
	vmovdqu RL3, (5*4*4)(%r11);
	vmovdqu RR4, (6*4*4)(%r11);
	vmovdqu RL4, (7*4*4)(%r11);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast5_ecb_dec_16way)

ENTRY(cast5_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	vmovdqu (0*16)(%rdx), RL1;
	vmovdqu (1*16)(%rdx), RR1;
	vmovdqu (2*16)(%rdx), RL2;
	vmovdqu (3*16)(%rdx), RR2;
	vmovdqu (4*16)(%rdx), RL3;
	vmovdqu (5*16)(%rdx), RR3;
	vmovdqu (6*16)(%rdx), RL4;
	vmovdqu (7*16)(%rdx), RR4;

	call __cast5_dec_blk16;

	/* xor with src */
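	/*
	 * Block 1's xor with the chaining IV is left to the C caller;
	 * the shuffle below pairs a zero qword with ciphertext block 1,
	 * so decrypted block 1 is xored with 0 and block 2 with
	 * ciphertext block 1.  Blocks 3..16 use the 8-byte-offset loads.
	 */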
	vmovq (%r12), RX;
	vpshufd $0x4f, RX, RX;
	vpxor RX, RR1, RR1;
	vpxor 0*16+8(%r12), RL1, RL1;
	vpxor 1*16+8(%r12), RR2, RR2;
	vpxor 2*16+8(%r12), RL2, RL2;
	vpxor 3*16+8(%r12), RR3, RR3;
	vpxor 4*16+8(%r12), RL3, RL3;
	vpxor 5*16+8(%r12), RR4, RR4;
	vpxor 6*16+8(%r12), RL4, RL4;

	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

	popq %r15;
	popq %r12;
	FRAME_END
	ret;
ENDPROC(cast5_cbc_dec_16way)

ENTRY(cast5_ctr_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (big endian, 64bit)
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	vpcmpeqd RTMP, RTMP, RTMP;
	vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */

	vpcmpeqd RKR, RKR, RKR;
	vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
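	/*
	 * Counters are advanced by subtracting these negative constants:
	 * x - (-1) increments only the low counter, x - (-2) steps both
	 * packed counters by two.
	 */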
	vmovdqa .Lbswap_iv_mask, R1ST;
	vmovdqa .Lbswap128_mask, RKM;

	/* load IV and byteswap */
	vmovq (%rcx), RX;
	vpshufb R1ST, RX, RX;

	/* construct IVs */
	vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
	vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
	vpsubq RKR, RX, RX;
	vpshufb RKM, RX, RR4; /* be: IV14, IV15 */

	/* store last IV */
	vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
	vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
	vmovq RX, (%rcx);

	call __cast5_enc_blk16;

	/* dst = src ^ iv */
	vpxor (0*16)(%r12), RR1, RR1;
	vpxor (1*16)(%r12), RL1, RL1;
	vpxor (2*16)(%r12), RR2, RR2;
	vpxor (3*16)(%r12), RL2, RL2;
	vpxor (4*16)(%r12), RR3, RR3;
	vpxor (5*16)(%r12), RL3, RL3;
	vpxor (6*16)(%r12), RR4, RR4;
	vpxor (7*16)(%r12), RL4, RL4;
	vmovdqu RR1, (0*16)(%r11);
	vmovdqu RL1, (1*16)(%r11);
	vmovdqu RR2, (2*16)(%r11);
	vmovdqu RL2, (3*16)(%r11);
	vmovdqu RR3, (4*16)(%r11);
	vmovdqu RL3, (5*16)(%r11);
	vmovdqu RR4, (6*16)(%r11);
	vmovdqu RL4, (7*16)(%r11);

	popq %r15;
	popq %r12;
	FRAME_END
	ret;
ENDPROC(cast5_ctr_16way)