Commit | Line | Data |
---|---|---|
1a59d1b8 | 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
64b94cea JK |
2 | /* |
3 | * Blowfish Cipher Algorithm (x86_64) | |
4 | * | |
5 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | |
64b94cea JK |
6 | */ |
7 | ||
5186e395 JK |
8 | #include <linux/linkage.h> |
9 | ||
64b94cea JK |
10 | .file "blowfish-x86_64-asm.S" |
11 | .text | |
12 | ||
/*
 * Layout of the crypto context, as byte offsets from CTX.
 *
 * p starts at offset 0.  s0..s3 follow after (16 + 2) 4-byte words and are
 * spaced 256 4-byte words apart; the code below uses them as lookup tables
 * indexed by a single byte (scale factor 4).
 */
#define p	0
#define s0	((16 + 2 + (0 * 256)) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)
19 | ||
/* register macros */

/*
 * Context pointer.  The entry points copy %rdi into it because %rdi is
 * reused as the temporary RT0.
 */
#define CTX %r12

/* Pointer used by the block load/store macros. */
#define RIO %rsi

/*
 * Block registers, one per parallel block.  For each: the 64-bit register,
 * its 32-bit view, its low byte and its second byte.
 */
#define RX0 %rax
#define RX0d %eax
#define RX0bl %al
#define RX0bh %ah

#define RX1 %rbx
#define RX1d %ebx
#define RX1bl %bl
#define RX1bh %bh

#define RX2 %rcx
#define RX2d %ecx
#define RX2bl %cl
#define RX2bh %ch

#define RX3 %rdx
#define RX3d %edx
#define RX3bl %dl
#define RX3bh %dh

/*
 * Temporaries, 64-bit and 32-bit views.
 * NOTE: RT1 is %rsi, the same register as RIO, so any macro that writes
 * RT1 destroys RIO.
 */
#define RT0 %rdi
#define RT0d %edi

#define RT1 %rsi
#define RT1d %esi

#define RT2 %r8
#define RT2d %r8d

#define RT3 %r9
#define RT3d %r9d

/* Round key register used by the 4-way "preloaded" round key macros. */
#define RKEY %r10
64b94cea JK |
55 | |
56 | /*********************************************************************** | |
57 | * 1-way blowfish | |
58 | ***********************************************************************/ | |
/*
 * F(): one round step on the block held in RX0.
 *
 * The four bytes of RX0's low dword, most significant first, index s0, s1,
 * s2 and s3.  The lookups are combined in 32 bits as
 *	((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * The rotates add up to a 32-bit rotation, so the two dwords of RX0 trade
 * places; the combined value is then XORed into the new low dword.  RT0's
 * upper half is zero after the 32-bit operations, so the high dword of RX0
 * is left as is by the final xorq.
 *
 * Clobbers RT0, RT1 and RT2.  RT1 is %rsi, i.e. RIO is destroyed.
 */
#define F() \
	rorq $16, RX0;			/* expose bits 31..16 as %ah/%al */ \
	movzbl RX0bh, RT0d;		/* a */ \
	movzbl RX0bl, RT1d;		/* b */ \
	rolq $16, RX0;			/* undo the rotate */ \
	movl s0(CTX,RT0,4), RT0d; \
	addl s1(CTX,RT1,4), RT0d; \
	movzbl RX0bh, RT1d;		/* c */ \
	movzbl RX0bl, RT2d;		/* d */ \
	rolq $32, RX0;			/* swap the two dwords */ \
	xorl s2(CTX,RT1,4), RT0d; \
	addl s3(CTX,RT2,4), RT0d; \
	xorq RT0, RX0;
64b94cea JK |
72 | |
/*
 * XOR the 8 bytes found at p + 4*n in the context into RX0, i.e. two
 * consecutive 4-byte entries at once (entry n into the low dword, entry
 * n + 1 into the high dword).
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX), RX0;

/*
 * Encryption round pair: mix in the key pair starting at entry n, then run
 * F() twice.
 */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();
64b94cea JK |
81 | |
/*
 * Decryption counterpart of add_roundkey_enc(): load the 8 bytes at
 * p + 4*(n - 1), swap the two dwords (so entry n lands in the low dword and
 * entry n - 1 in the high dword) and XOR the result into RX0.
 *
 * The argument is parenthesized, matching preload_roundkey_dec(), so that
 * an expression argument cannot be split by operator precedence.
 *
 * Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX), RT0; \
	rorq $32, RT0; \
	xorq RT0, RX0;
86 | ||
/*
 * Decryption round pair: mix in the (dword-swapped) key pair ending at
 * entry n, then run F() twice.
 *
 * The last line deliberately has no trailing backslash: a continuation
 * there would splice whatever line follows into this macro.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
64b94cea JK |
92 | |
/*
 * Load 8 bytes from (RIO) into RX0.  After the dword swap and the byte
 * swap, the first four memory bytes form the low dword of RX0 and the last
 * four the high dword, each read as a big-endian 32-bit value.
 */
#define read_block() \
	movq (RIO), RX0; \
	rorq $32, RX0; \
	bswapq RX0;

/* Store RX0 to (RIO) as a 64-bit big-endian value.  Modifies RX0. */
#define write_block() \
	bswapq RX0; \
	movq RX0, (RIO);

/*
 * Same byte order as write_block(), but XOR RX0 into the 8 bytes already
 * at (RIO) instead of overwriting them.  Modifies RX0.
 */
#define xor_block() \
	bswapq RX0; \
	xorq RX0, (RIO);
105 | ||
ENTRY(__blowfish_enc_blk)
	/*
	 * Encrypt one 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * No stack is used: the caller's %r12 is parked in %r11 while %r12
	 * serves as CTX, and dst is parked in %r10 because F() overwrites
	 * %rsi.  The 1-way macros never touch %rcx, so the flag is still in
	 * %cl at the end.
	 */
	movq %r12, %r11;

	movq %rdi, CTX;
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* CTX is no longer needed: give %r12 back to the caller */
	movq %r11, %r12;

	movq %r10, RIO;
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	ret;

.L__enc_xor:
	xor_block();
	ret;
ENDPROC(__blowfish_enc_blk)
64b94cea | 143 | |
ENTRY(blowfish_dec_blk)
	/*
	 * Decrypt one 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * No stack is used: the caller's %r12 is parked in %r11 while %r12
	 * serves as CTX, and dst is parked in %r10 because F() overwrites
	 * %rsi.
	 */
	movq %r12, %r11;

	movq %rdi, CTX;
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;
	write_block();

	/* restore the caller's %r12 */
	movq %r11, %r12;

	ret;
ENDPROC(blowfish_dec_blk)
64b94cea JK |
175 | |
176 | /********************************************************************** | |
177 | 4-way blowfish, four blocks parallel | |
178 | **********************************************************************/ | |
e827bb09 JK |
179 | |
/*
 * F4(x): F() variant for the 4-way code, operating on block register x.
 *
 * Slower when used alone/1-way, but faster when used parallel/4-way
 * (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * All four index bytes of x's low dword are extracted before any table is
 * read.  The two 16-bit rotates add up to a 32-bit rotation, so the dwords
 * of x trade places, and ((s0[a] + s1[b]) ^ s2[c]) + s3[d] is XORed into
 * the new low dword.
 *
 * Clobbers RT0, RT1, RT2 and RT3.  RT1 is %rsi, i.e. RIO is destroyed.
 */
#define F4(x) \
	movzbl x ## bh, RT1d;		/* c */ \
	movzbl x ## bl, RT3d;		/* d */ \
	rorq $16, x; \
	movzbl x ## bh, RT0d;		/* a */ \
	movzbl x ## bl, RT2d;		/* b */ \
	rorq $16, x;			/* dwords of x are now swapped */ \
	movl s0(CTX,RT0,4), RT0d; \
	addl s1(CTX,RT2,4), RT0d; \
	xorl s2(CTX,RT1,4), RT0d; \
	addl s3(CTX,RT3,4), RT0d; \
	xorq RT0, x;
195 | ||
/* XOR the round key currently held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq RKEY, RX0; \
	xorq RKEY, RX1; \
	xorq RKEY, RX2; \
	xorq RKEY, RX3;
201 | ||
/* Load the 8 bytes at p + 4*n (two consecutive 4-byte entries) into RKEY. */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX), RKEY;

/*
 * Apply the key already sitting in RKEY to all four blocks, then fetch the
 * key pair for the next step (entry n + 2) into RKEY.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);
208 | ||
/*
 * 4-way encryption round pair: apply the preloaded key (and preload the
 * next one), then run F4() on every block register, twice.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
64b94cea JK |
221 | |
/*
 * Load the 8 bytes at p + 4*(n - 1) into RKEY and swap its two dwords, so
 * entry n ends up in the low dword and entry n - 1 in the high dword.
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX), RKEY; \
	rorq $32, RKEY;

/*
 * Apply the key already sitting in RKEY to all four blocks, then fetch the
 * key pair for the next step (entry n - 2) into RKEY.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);
229 | ||
/*
 * 4-way decryption round pair: apply the preloaded key (and preload the
 * next one), then run F4() on every block register, twice.
 */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);
64b94cea JK |
242 | |
/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3, giving each
 * the same dword swap + byte swap treatment as read_block().
 *
 * RX3 is loaded last.
 */
#define read_block4() \
	movq (RIO), RX0; \
	rorq $32, RX0; \
	bswapq RX0; \
	\
	movq 8(RIO), RX1; \
	rorq $32, RX1; \
	bswapq RX1; \
	\
	movq 16(RIO), RX2; \
	rorq $32, RX2; \
	bswapq RX2; \
	\
	movq 24(RIO), RX3; \
	rorq $32, RX3; \
	bswapq RX3;
259 | ||
/*
 * Byte-swap RX0..RX3 and store them as four consecutive 8-byte blocks at
 * (RIO).  Modifies RX0..RX3.
 */
#define write_block4() \
	bswapq RX0; \
	movq RX0, (RIO); \
	\
	bswapq RX1; \
	movq RX1, 8(RIO); \
	\
	bswapq RX2; \
	movq RX2, 16(RIO); \
	\
	bswapq RX3; \
	movq RX3, 24(RIO);
272 | ||
/*
 * Byte-swap RX0..RX3 and XOR them into the four consecutive 8-byte blocks
 * already present at (RIO).  Modifies RX0..RX3.
 */
#define xor_block4() \
	bswapq RX0; \
	xorq RX0, (RIO); \
	\
	bswapq RX1; \
	xorq RX1, 8(RIO); \
	\
	bswapq RX2; \
	xorq RX2, 16(RIO); \
	\
	bswapq RX3; \
	xorq RX3, 24(RIO);
285 | ||
ENTRY(__blowfish_enc_blk_4way)
	/*
	 * Encrypt four consecutive 8-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * %r12 (CTX) and %rbx (RX1) are saved on the stack.  The xor flag is
	 * pushed as well because %rcx doubles as RX2.  dst is kept in %r11
	 * because F4() overwrites %rsi.
	 */
	pushq %r12;
	pushq %rbx;
	pushq %rcx;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/*
	 * Pop the saved xor flag into %r12: CTX is not used past this point
	 * and the caller's %r12 is still on the stack.
	 */
	popq %r12;
	movq %r11, RIO;

	test %r12b, %r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	ret;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	ret;
ENDPROC(__blowfish_enc_blk_4way)
64b94cea | 334 | |
ENTRY(blowfish_dec_blk_4way)
	/*
	 * Decrypt four consecutive 8-byte blocks.
	 *
	 * input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %r12 (CTX) and %rbx (RX1) are saved on the stack; dst is kept in
	 * %r11 because F4() overwrites %rsi.
	 */
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	ret;
ENDPROC(blowfish_dec_blk_4way)