Commit | Line | Data |
---|---|---|
1a59d1b8 | 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
eaf44088 JF |
2 | /*************************************************************************** |
3 | * Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> * | |
4 | * * | |
eaf44088 JF |
5 | ***************************************************************************/ |
6 | ||
7 | .file "twofish-x86_64-asm.S" | |
8 | .text | |
9 | ||
d3f5188d | 10 | #include <linux/linkage.h> |
eaf44088 JF |
11 | #include <asm/asm-offsets.h> |
12 | ||
/* Byte offsets of the words a, b, c and d within a 16-byte block; passed as the offset argument of the whitening macros (the code below only passes a_offset and c_offset, each covering a 64-bit pair of words) */
13 | #define a_offset 0 | |
14 | #define b_offset 4 | |
15 | #define c_offset 8 | |
16 | #define d_offset 12 | |
17 | ||
18 | /* Byte offsets of the fields of the crypto context struct, relative to the ctx address */ | |
19 | ||
20 | #define s0 0 /* S0 table, 256 words; indexed below as 4-byte entries, and because this offset is 0 the S0 lookups are written without a displacement */ | |
21 | #define s1 1024 /* S1 table */ | |
22 | #define s2 2048 /* S2 table */ | |
23 | #define s3 3072 /* S3 table */ | |
24 | #define w 4096 /* 8 whitening key words; input_whitening reads from w, output_whitening from w+16 */ | |
25 | #define k 4128 /* round key words; each round macro reads the two words at k+round and k+4+round */ | |
26 | ||
27 | /* Register aliases: Rn is the 64-bit register, RnD its low 32 bits, RnB its low byte and RnH bits 8-15. The round macros build these names by token pasting (a ## D, a ## B, a ## H), so they can be invoked with any of R0-R3 in any role. */ | |
28 | ||
29 | #define R0 %rax | |
30 | #define R0D %eax | |
31 | #define R0B %al | |
32 | #define R0H %ah | |
33 | ||
34 | #define R1 %rbx | |
35 | #define R1D %ebx | |
36 | #define R1B %bl | |
37 | #define R1H %bh | |
38 | ||
39 | #define R2 %rcx | |
40 | #define R2D %ecx | |
41 | #define R2B %cl | |
42 | #define R2H %ch | |
43 | ||
44 | #define R3 %rdx | |
45 | #define R3D %edx | |
46 | #define R3B %dl | |
47 | #define R3H %dh | |
48 | ||
49 | ||
50 | /* performs input whitening: XORs the key material at w+offset in the context into src (the operand width is that of src) */ | |
51 | #define input_whitening(src,context,offset)\ | |
52 | xor w+offset(context), src; | |
53 | ||
54 | /* performs output whitening: XORs the key material at w+16+offset in the context into src (the operand width is that of src) */ | |
55 | #define output_whitening(src,context,offset)\ | |
56 | xor w+16+offset(context), src; | |
57 | ||
58 | ||
59 | /* | |
 * encrypt_round: one encryption round on the 32-bit words a, b, c, d.
 * Expects the context address in %r11. %edi (as table index), %r8d and
 * %r9d are used as scratch and are overwritten.
 * round is the byte offset of this round's pair of key words inside k.
60 | * a input register containing a (rotated 16) | |
61 | * b input register containing b | |
62 | * c input register containing c | |
63 | * d input register containing d (already rol $1) | |
 * The bytes of b index s1, s2, s3, s0 (combined by XOR in %r8d) and the
 * bytes of a index s2, s3, s0, s1 (combined by XOR in %r9d). Then
 * %r9d += %r8d, %r8d += %r9d, the key words at k+round and k+4+round are
 * added, and %r9d is XORed into c, %r8d into d.
 * On exit a has been rotated right by 16, b right by 31 in total
 * (16 + 15, i.e. left by 1) and c left by 15 after its XOR; d is not rotated.
64 | * operations on a and b are interleaved to increase performance | |
65 | */ | |
66 | #define encrypt_round(a,b,c,d,round)\ | |
67 | movzx b ## B, %edi;\ | |
68 | mov s1(%r11,%rdi,4),%r8d;\ | |
69 | movzx a ## B, %edi;\ | |
70 | mov s2(%r11,%rdi,4),%r9d;\ | |
71 | movzx b ## H, %edi;\ | |
72 | ror $16, b ## D;\ | |
73 | xor s2(%r11,%rdi,4),%r8d;\ | |
74 | movzx a ## H, %edi;\ | |
75 | ror $16, a ## D;\ | |
76 | xor s3(%r11,%rdi,4),%r9d;\ | |
77 | movzx b ## B, %edi;\ | |
78 | xor s3(%r11,%rdi,4),%r8d;\ | |
79 | movzx a ## B, %edi;\ | |
80 | xor (%r11,%rdi,4), %r9d;\ | |
81 | movzx b ## H, %edi;\ | |
82 | ror $15, b ## D;\ | |
83 | xor (%r11,%rdi,4), %r8d;\ | |
84 | movzx a ## H, %edi;\ | |
85 | xor s1(%r11,%rdi,4),%r9d;\ | |
86 | add %r8d, %r9d;\ | |
87 | add %r9d, %r8d;\ | |
88 | add k+round(%r11), %r9d;\ | |
89 | xor %r9d, c ## D;\ | |
90 | rol $15, c ## D;\ | |
91 | add k+4+round(%r11),%r8d;\ | |
92 | xor %r8d, d ## D; | |
93 | ||
94 | /* | |
 * encrypt_last_round: same table lookups, sums and key additions as
 * encrypt_round, with different exit rotations and an extra result in %r10.
 * Expects the context address in %r11. Overwrites %edi, %r8d, %r9d, %r10.
 * round is the byte offset of this round's pair of key words inside k.
95 | * a input register containing a (rotated 16) | |
96 | * b input register containing b | |
97 | * c input register containing c | |
98 | * d input register containing d (already rol $1) | |
99 | * operations on a and b are interleaved to increase performance | |
100 | * during the round a and b are prepared for the output whitening | |
 * %r10 is built as (b as passed in) << 32 | (a after its ror $16).
 * The 64-bit xor of a into %r10 comes after the 32-bit ror on a, which
 * leaves the upper half of the register zero.
 * On exit a and b have each been rotated right by 16, c has been rotated
 * right by 1 after its XOR, and d is not rotated.
 * The macro body has no trailing ';' - the invocation supplies it.
101 | */ | |
102 | #define encrypt_last_round(a,b,c,d,round)\ | |
103 | mov b ## D, %r10d;\ | |
104 | shl $32, %r10;\ | |
105 | movzx b ## B, %edi;\ | |
106 | mov s1(%r11,%rdi,4),%r8d;\ | |
107 | movzx a ## B, %edi;\ | |
108 | mov s2(%r11,%rdi,4),%r9d;\ | |
109 | movzx b ## H, %edi;\ | |
110 | ror $16, b ## D;\ | |
111 | xor s2(%r11,%rdi,4),%r8d;\ | |
112 | movzx a ## H, %edi;\ | |
113 | ror $16, a ## D;\ | |
114 | xor s3(%r11,%rdi,4),%r9d;\ | |
115 | movzx b ## B, %edi;\ | |
116 | xor s3(%r11,%rdi,4),%r8d;\ | |
117 | movzx a ## B, %edi;\ | |
118 | xor (%r11,%rdi,4), %r9d;\ | |
119 | xor a, %r10;\ | |
120 | movzx b ## H, %edi;\ | |
121 | xor (%r11,%rdi,4), %r8d;\ | |
122 | movzx a ## H, %edi;\ | |
123 | xor s1(%r11,%rdi,4),%r9d;\ | |
124 | add %r8d, %r9d;\ | |
125 | add %r9d, %r8d;\ | |
126 | add k+round(%r11), %r9d;\ | |
127 | xor %r9d, c ## D;\ | |
128 | ror $1, c ## D;\ | |
129 | add k+4+round(%r11),%r8d;\ | |
130 | xor %r8d, d ## D | |
131 | ||
132 | /* | |
 * decrypt_round: one decryption round on the 32-bit words a, b, c, d.
 * Expects the context address in %r11. %edi (as table index), %r8d and
 * %r9d are used as scratch and are overwritten.
 * round is the byte offset of this round's pair of key words inside k.
133 | * a input register containing a | |
134 | * b input register containing b (rotated 16) | |
135 | * c input register containing c (already rol $1) | |
136 | * d input register containing d | |
 * The bytes of a index s0, s1, s2, s3 (combined by XOR in %r9d) and the
 * bytes of b index s3, s0, s1, s2 (combined by XOR in %r8d). Then
 * %r9d += %r8d, %r8d += %r9d, the key words at k+round and k+4+round are
 * added, and %r9d is XORed into c, %r8d into d.
 * On exit a has been rotated right by 31 in total (16 + 15, i.e. left
 * by 1), b right by 16 and d left by 15 after its XOR; c is not rotated.
137 | * operations on a and b are interleaved to increase performance | |
138 | */ | |
139 | #define decrypt_round(a,b,c,d,round)\ | |
140 | movzx a ## B, %edi;\ | |
141 | mov (%r11,%rdi,4), %r9d;\ | |
142 | movzx b ## B, %edi;\ | |
143 | mov s3(%r11,%rdi,4),%r8d;\ | |
144 | movzx a ## H, %edi;\ | |
145 | ror $16, a ## D;\ | |
146 | xor s1(%r11,%rdi,4),%r9d;\ | |
147 | movzx b ## H, %edi;\ | |
148 | ror $16, b ## D;\ | |
149 | xor (%r11,%rdi,4), %r8d;\ | |
150 | movzx a ## B, %edi;\ | |
151 | xor s2(%r11,%rdi,4),%r9d;\ | |
152 | movzx b ## B, %edi;\ | |
153 | xor s1(%r11,%rdi,4),%r8d;\ | |
154 | movzx a ## H, %edi;\ | |
155 | ror $15, a ## D;\ | |
156 | xor s3(%r11,%rdi,4),%r9d;\ | |
157 | movzx b ## H, %edi;\ | |
158 | xor s2(%r11,%rdi,4),%r8d;\ | |
159 | add %r8d, %r9d;\ | |
160 | add %r9d, %r8d;\ | |
161 | add k+round(%r11), %r9d;\ | |
162 | xor %r9d, c ## D;\ | |
163 | add k+4+round(%r11),%r8d;\ | |
164 | xor %r8d, d ## D;\ | |
165 | rol $15, d ## D; | |
166 | ||
167 | /* | |
 * decrypt_last_round: same table lookups, sums and key additions as
 * decrypt_round, with different exit rotations and an extra result in %r10.
 * Expects the context address in %r11. Overwrites %edi, %r8d, %r9d, %r10.
 * round is the byte offset of this round's pair of key words inside k.
168 | * a input register containing a | |
169 | * b input register containing b (rotated 16, as in decrypt_round) | |
170 | * c input register containing c (already rol $1) | |
171 | * d input register containing d | |
172 | * operations on a and b are interleaved to increase performance | |
173 | * during the round a and b are prepared for the output whitening | |
 * %r10 is built as (b after its ror $16) << 32 | (a as passed in).
 * The 64-bit xor of a into %r10 happens before a is touched, so it
 * relies on the upper 32 bits of a being zero on entry; that holds for
 * the invocation in twofish_dec_blk, where the register was last written
 * by 32-bit operations.
 * On exit a and b have each been rotated right by 16, d has been rotated
 * right by 1 after its XOR, and c is not rotated.
174 | */ | |
175 | #define decrypt_last_round(a,b,c,d,round)\ | |
176 | movzx a ## B, %edi;\ | |
177 | mov (%r11,%rdi,4), %r9d;\ | |
178 | movzx b ## B, %edi;\ | |
179 | mov s3(%r11,%rdi,4),%r8d;\ | |
180 | movzx b ## H, %edi;\ | |
181 | ror $16, b ## D;\ | |
182 | xor (%r11,%rdi,4), %r8d;\ | |
183 | movzx a ## H, %edi;\ | |
184 | mov b ## D, %r10d;\ | |
185 | shl $32, %r10;\ | |
186 | xor a, %r10;\ | |
187 | ror $16, a ## D;\ | |
188 | xor s1(%r11,%rdi,4),%r9d;\ | |
189 | movzx b ## B, %edi;\ | |
190 | xor s1(%r11,%rdi,4),%r8d;\ | |
191 | movzx a ## B, %edi;\ | |
192 | xor s2(%r11,%rdi,4),%r9d;\ | |
193 | movzx b ## H, %edi;\ | |
194 | xor s2(%r11,%rdi,4),%r8d;\ | |
195 | movzx a ## H, %edi;\ | |
196 | xor s3(%r11,%rdi,4),%r9d;\ | |
197 | add %r8d, %r9d;\ | |
198 | add %r9d, %r8d;\ | |
199 | add k+round(%r11), %r9d;\ | |
200 | xor %r9d, c ## D;\ | |
201 | add k+4+round(%r11),%r8d;\ | |
202 | xor %r8d, d ## D;\ | |
203 | ror $1, d ## D; | |
204 | ||
/*
 * twofish_enc_blk: encrypts one 16-byte block with 16 rounds.
 * In:  %rdi = ctx address, %rsi = output address, %rdx = input address
 * Out: 16 bytes stored at (%rsi); %eax is set to 1
 * %rbx (R1) is saved on entry and restored before returning.
 * Overwrites %rax, %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11.
 */
6dcc5627 | 205 | SYM_FUNC_START(twofish_enc_blk) |
eaf44088 JF |
/* R1 is %rbx; it is used as a working register below, so keep the caller's value on the stack */
206 | pushq R1 |
207 | ||
91d41f15 | 208 | /* %rdi contains the ctx address */ |
3ad2f3fb DM |
209 | /* %rsi contains the output address */ |
210 | /* %rdx contains the input address */ | |
3ad2f3fb | 211 | /* ctx address is moved to free one non-rex register |
eaf44088 JF |
212 | as target for the 8bit high operations */ |
213 | mov %rdi, %r11 | |
214 | ||
/* load the 16 input bytes; R3 is %rdx, the input pointer itself, so the load into R3 has to be the last one through it */
215 | movq (R3), R1 | |
216 | movq 8(R3), R3 | |
217 | input_whitening(R1,%r11,a_offset) | |
218 | input_whitening(R3,%r11,c_offset) | |
/* split the two 64-bit values into the form encrypt_round expects: R0 = a rotated by 16, R1 = b, R2 = c, R3 = d rotated left by 1 */
219 | mov R1D, R0D | |
220 | rol $16, R0D | |
221 | shr $32, R1 | |
222 | mov R3D, R2D | |
223 | shr $32, R3 | |
224 | rol $1, R3D | |
225 | ||
/* 16 rounds: the register pairs swap roles every round and the round argument advances by 8 bytes (two key words) each time */
226 | encrypt_round(R0,R1,R2,R3,0); | |
227 | encrypt_round(R2,R3,R0,R1,8); | |
228 | encrypt_round(R0,R1,R2,R3,2*8); | |
229 | encrypt_round(R2,R3,R0,R1,3*8); | |
230 | encrypt_round(R0,R1,R2,R3,4*8); | |
231 | encrypt_round(R2,R3,R0,R1,5*8); | |
232 | encrypt_round(R0,R1,R2,R3,6*8); | |
233 | encrypt_round(R2,R3,R0,R1,7*8); | |
234 | encrypt_round(R0,R1,R2,R3,8*8); | |
235 | encrypt_round(R2,R3,R0,R1,9*8); | |
236 | encrypt_round(R0,R1,R2,R3,10*8); | |
237 | encrypt_round(R2,R3,R0,R1,11*8); | |
238 | encrypt_round(R0,R1,R2,R3,12*8); | |
239 | encrypt_round(R2,R3,R0,R1,13*8); | |
240 | encrypt_round(R0,R1,R2,R3,14*8); | |
241 | encrypt_last_round(R2,R3,R0,R1,15*8); | |
242 | ||
243 | ||
/* %r10 was assembled by encrypt_last_round with R3 in its high half and R2 in its low half; it becomes the first 8 output bytes */
244 | output_whitening(%r10,%r11,a_offset) | |
245 | movq %r10, (%rsi) | |
246 | ||
/* pack the remaining two words, R1 in the high half and R0 in the low half, for the second 8 output bytes */
247 | shl $32, R1 | |
248 | xor R0, R1 | |
249 | ||
250 | output_whitening(R1,%r11,c_offset) | |
251 | movq R1, 8(%rsi) | |
252 | ||
253 | popq R1 |
a734b4a2 | 254 | movl $1,%eax |
f94909ce | 255 | RET |
6dcc5627 | 256 | SYM_FUNC_END(twofish_enc_blk) |
eaf44088 | 257 | |
/*
 * twofish_dec_blk: decrypts one 16-byte block with 16 rounds, walking the
 * round keys from offset 15*8 down to 0.
 * In:  %rdi = ctx address, %rsi = output address, %rdx = input address
 * Out: 16 bytes stored at (%rsi); %eax is set to 1
 * %rbx (R1) is saved on entry and restored before returning.
 * Overwrites %rax, %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11.
 */
6dcc5627 | 258 | SYM_FUNC_START(twofish_dec_blk) |
eaf44088 JF |
/* R1 is %rbx; it is used as a working register below, so keep the caller's value on the stack */
259 | pushq R1 |
260 | ||
91d41f15 | 261 | /* %rdi contains the ctx address */ |
3ad2f3fb DM |
262 | /* %rsi contains the output address */ |
263 | /* %rdx contains the input address */ | |
3ad2f3fb | 264 | /* ctx address is moved to free one non-rex register |
eaf44088 JF |
265 | as target for the 8bit high operations */ |
266 | mov %rdi, %r11 | |
267 | ||
/* load the 16 input bytes; R3 is %rdx, the input pointer itself, so the load into R3 has to be the last one through it */
268 | movq (R3), R1 | |
269 | movq 8(R3), R3 | |
/* the input is XORed with the output-whitening keys (w+16) first; the input-whitening keys (w) are applied after the rounds */
270 | output_whitening(R1,%r11,a_offset) | |
271 | output_whitening(R3,%r11,c_offset) | |
/* split the two 64-bit values into the form decrypt_round expects: R0 = a, R1 = b rotated by 16, R2 = c rotated left by 1, R3 = d */
272 | mov R1D, R0D | |
273 | shr $32, R1 | |
274 | rol $16, R1D | |
275 | mov R3D, R2D | |
276 | shr $32, R3 | |
277 | rol $1, R2D | |
278 | ||
/* 16 rounds: the register pairs swap roles every round and the round argument steps back by 8 bytes (two key words) each time */
279 | decrypt_round(R0,R1,R2,R3,15*8); | |
280 | decrypt_round(R2,R3,R0,R1,14*8); | |
281 | decrypt_round(R0,R1,R2,R3,13*8); | |
282 | decrypt_round(R2,R3,R0,R1,12*8); | |
283 | decrypt_round(R0,R1,R2,R3,11*8); | |
284 | decrypt_round(R2,R3,R0,R1,10*8); | |
285 | decrypt_round(R0,R1,R2,R3,9*8); | |
286 | decrypt_round(R2,R3,R0,R1,8*8); | |
287 | decrypt_round(R0,R1,R2,R3,7*8); | |
288 | decrypt_round(R2,R3,R0,R1,6*8); | |
289 | decrypt_round(R0,R1,R2,R3,5*8); | |
290 | decrypt_round(R2,R3,R0,R1,4*8); | |
291 | decrypt_round(R0,R1,R2,R3,3*8); | |
292 | decrypt_round(R2,R3,R0,R1,2*8); | |
293 | decrypt_round(R0,R1,R2,R3,1*8); | |
294 | decrypt_last_round(R2,R3,R0,R1,0); | |
295 | ||
/* %r10 was assembled by decrypt_last_round with R3 in its high half and R2 in its low half; it becomes the first 8 output bytes */
296 | input_whitening(%r10,%r11,a_offset) | |
297 | movq %r10, (%rsi) | |
298 | ||
/* pack the remaining two words, R1 in the high half and R0 in the low half, for the second 8 output bytes */
299 | shl $32, R1 | |
300 | xor R0, R1 | |
301 | ||
302 | input_whitening(R1,%r11,c_offset) | |
303 | movq R1, 8(%rsi) | |
304 | ||
305 | popq R1 |
a734b4a2 | 306 | movl $1,%eax |
f94909ce | 307 | RET |
6dcc5627 | 308 | SYM_FUNC_END(twofish_dec_blk) |