Commit | Line | Data |
---|---|---|
af1a8899 | 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
49502766 | 2 | #ifndef _ASM_X86_XOR_H |
e8f6e3f8 JB |
3 | #define _ASM_X86_XOR_H |
4 | ||
5 | /* | |
6 | * Optimized RAID-5 checksumming functions for SSE. | |
e8f6e3f8 JB |
7 | */ |
8 | ||
9 | /* | |
10 | * Cache avoiding checksumming functions utilizing KNI instructions | |
11 | * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) | |
12 | */ | |
13 | ||
14 | /* | |
15 | * Based on | |
16 | * High-speed RAID5 checksumming functions utilizing SSE instructions. | |
17 | * Copyright (C) 1998 Ingo Molnar. | |
18 | */ | |
19 | ||
20 | /* | |
21 | * x86-64 changes / gcc fixes from Andi Kleen. | |
22 | * Copyright 2002 Andi Kleen, SuSE Labs. | |
23 | * | |
24 | * This hasn't been optimized for the hammer yet, but there are likely | |
25 | * no advantages to be gotten from x86-64 here anyways. | |
26 | */ | |
27 | ||
#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

/* Byte offset of 16-byte chunk x within the current 256-byte line. */
#define OFFS(x)		"16*("#x")"
/* Prefetch offset: chunk x of the *next* 256-byte line (256 bytes ahead). */
#define PF_OFFS(x)	"256+16*("#x")"
/* PF0..PF4: non-temporal prefetch from source p1..p5 (avoids cache pollution). */
#define PF0(x)		" prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
/* Load 16-byte chunk x of p1 into %xmm<y>. */
#define LD(x, y)	" movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
/* Store %xmm<y> back to chunk x of p1 (p1 doubles as the destination). */
#define ST(x, y)	" movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
#define PF1(x)		" prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
#define PF2(x)		" prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
#define PF3(x)		" prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
#define PF4(x)		" prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
/* XO1..XO4: XOR chunk x of source p2..p5 into %xmm<y>. */
#define XO1(x, y)	" xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
#define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
#define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
#define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
/* Placeholder so BLK64 can skip the prefetch slot (used for the store pass). */
#define NOP(x)

/*
 * One 64-byte sub-block for the pf64 variants: a single prefetch (or
 * NOP) followed by the given op applied to four consecutive 16-byte
 * chunks held in xmm0..xmm3.
 */
#define BLK64(pf, op, i)		\
		pf(i)			\
		op(i, 0)		\
		op(i + 1, 1)		\
		op(i + 2, 2)		\
		op(i + 3, 3)
e8f6e3f8 JB |
58 | |
/*
 * p1[] ^= p2[]; 'bytes' bytes, processed as 256-byte lines with SSE.
 * Each BLOCK covers 64 bytes in xmm0-xmm3 and interleaves non-temporal
 * prefetches of the next line of both sources with the loads/xors/stores.
 *
 * NOTE(review): assumes bytes is a non-zero multiple of 256; lines == 0
 * would wrap around via the dec/jnz loop — callers must guarantee this.
 */
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* 256-byte lines to process */

	kernel_fpu_begin();	/* we clobber xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		LD(i, 0)		\
		LD(i + 1, 1)		\
		PF1(i)			\
		PF1(i + 2)		\
		LD(i + 2, 2)		\
		LD(i + 3, 3)		\
		PF0(i + 4)		\
		PF0(i + 6)		\
		XO1(i, 0)		\
		XO1(i + 1, 1)		\
		XO1(i + 2, 2)		\
		XO1(i + 3, 3)		\
		ST(i, 0)		\
		ST(i + 1, 1)		\
		ST(i + 2, 2)		\
		ST(i + 3, 3)		\


	/* prime the prefetch of p1 before entering the loop */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
109 | ||
f317820c JB |
/*
 * Same operation as xor_sse_2() (p1[] ^= p2[]) but with a simpler
 * prefetch schedule: via BLK64, one prefetchnta per source per 64-byte
 * sub-block instead of the hand-interleaved pattern.
 *
 * NOTE(review): same contract — bytes must be a non-zero multiple of 256.
 */
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;	/* 256-byte lines to process */

	kernel_fpu_begin();	/* we clobber xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
143 | ||
e8f6e3f8 JB |
/*
 * p1[] ^= p2[] ^ p3[]; 'bytes' bytes, processed as 256-byte lines with
 * SSE, 64 bytes per BLOCK in xmm0-xmm3, with non-temporal prefetches of
 * the next line of all three streams interleaved into the schedule.
 *
 * NOTE(review): assumes bytes is a non-zero multiple of 256 (see
 * xor_sse_2).
 */
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;	/* 256-byte lines to process */

	kernel_fpu_begin();	/* we clobber xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		PF1(i)			\
		PF1(i + 2)		\
		LD(i, 0)		\
		LD(i + 1, 1)		\
		LD(i + 2, 2)		\
		LD(i + 3, 3)		\
		PF2(i)			\
		PF2(i + 2)		\
		PF0(i + 4)		\
		PF0(i + 6)		\
		XO1(i, 0)		\
		XO1(i + 1, 1)		\
		XO1(i + 2, 2)		\
		XO1(i + 3, 3)		\
		XO2(i, 0)		\
		XO2(i + 1, 1)		\
		XO2(i + 2, 2)		\
		XO2(i + 3, 3)		\
		ST(i, 0)		\
		ST(i + 1, 1)		\
		ST(i + 2, 2)		\
		ST(i + 3, 3)		\


	/* prime the prefetch of p1 before entering the loop */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
202 | ||
f317820c JB |
/*
 * Same operation as xor_sse_3() (p1[] ^= p2[] ^ p3[]) using the simpler
 * BLK64 schedule: one prefetchnta per source per 64-byte sub-block.
 *
 * NOTE(review): same contract — bytes must be a non-zero multiple of 256.
 */
static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;	/* 256-byte lines to process */

	kernel_fpu_begin();	/* we clobber xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
239 | ||
e8f6e3f8 JB |
/*
 * p1[] ^= p2[] ^ p3[] ^ p4[]; 'bytes' bytes, processed as 256-byte
 * lines with SSE, 64 bytes per BLOCK in xmm0-xmm3, prefetching the next
 * line of all four streams non-temporally.
 *
 * NOTE(review): assumes bytes is a non-zero multiple of 256 (see
 * xor_sse_2).
 */
static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;	/* 256-byte lines to process */

	kernel_fpu_begin();	/* we clobber xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		PF1(i)			\
		PF1(i + 2)		\
		LD(i, 0)		\
		LD(i + 1, 1)		\
		LD(i + 2, 2)		\
		LD(i + 3, 3)		\
		PF2(i)			\
		PF2(i + 2)		\
		XO1(i, 0)		\
		XO1(i + 1, 1)		\
		XO1(i + 2, 2)		\
		XO1(i + 3, 3)		\
		PF3(i)			\
		PF3(i + 2)		\
		PF0(i + 4)		\
		PF0(i + 6)		\
		XO2(i, 0)		\
		XO2(i + 1, 1)		\
		XO2(i + 2, 2)		\
		XO2(i + 3, 3)		\
		XO3(i, 0)		\
		XO3(i + 1, 1)		\
		XO3(i + 2, 2)		\
		XO3(i + 3, 3)		\
		ST(i, 0)		\
		ST(i + 1, 1)		\
		ST(i + 2, 2)		\
		ST(i + 3, 3)		\


	/* prime the prefetch of p1 before entering the loop */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
305 | ||
f317820c JB |
/*
 * Same operation as xor_sse_4() (p1[] ^= p2[] ^ p3[] ^ p4[]) using the
 * simpler BLK64 schedule: one prefetchnta per source per 64-byte
 * sub-block.
 *
 * NOTE(review): same contract — bytes must be a non-zero multiple of 256.
 */
static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;	/* 256-byte lines to process */

	kernel_fpu_begin();	/* we clobber xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
344 | ||
e8f6e3f8 JB |
/*
 * p1[] ^= p2[] ^ p3[] ^ p4[] ^ p5[]; 'bytes' bytes, processed as
 * 256-byte lines with SSE, 64 bytes per BLOCK in xmm0-xmm3, prefetching
 * the next line of all five streams non-temporally.
 *
 * NOTE(review): assumes bytes is a non-zero multiple of 256 (see
 * xor_sse_2).
 */
static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;	/* 256-byte lines to process */

	kernel_fpu_begin();	/* we clobber xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		PF1(i)			\
		PF1(i + 2)		\
		LD(i, 0)		\
		LD(i + 1, 1)		\
		LD(i + 2, 2)		\
		LD(i + 3, 3)		\
		PF2(i)			\
		PF2(i + 2)		\
		XO1(i, 0)		\
		XO1(i + 1, 1)		\
		XO1(i + 2, 2)		\
		XO1(i + 3, 3)		\
		PF3(i)			\
		PF3(i + 2)		\
		XO2(i, 0)		\
		XO2(i + 1, 1)		\
		XO2(i + 2, 2)		\
		XO2(i + 3, 3)		\
		PF4(i)			\
		PF4(i + 2)		\
		PF0(i + 4)		\
		PF0(i + 6)		\
		XO3(i, 0)		\
		XO3(i + 1, 1)		\
		XO3(i + 2, 2)		\
		XO3(i + 3, 3)		\
		XO4(i, 0)		\
		XO4(i + 1, 1)		\
		XO4(i + 2, 2)		\
		XO4(i + 3, 3)		\
		ST(i, 0)		\
		ST(i + 1, 1)		\
		ST(i + 2, 2)		\
		ST(i + 3, 3)		\


	/* prime the prefetch of p1 before entering the loop */
	PF0(0)
	PF0(2)

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" add %[inc], %[p5] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
417 | ||
f317820c JB |
/*
 * Same operation as xor_sse_5() (p1[] ^= p2[] ^ p3[] ^ p4[] ^ p5[])
 * using the simpler BLK64 schedule: one prefetchnta per source per
 * 64-byte sub-block.
 *
 * NOTE(review): same contract — bytes must be a non-zero multiple of 256.
 */
static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;	/* 256-byte lines to process */

	kernel_fpu_begin();	/* we clobber xmm0-xmm3 */

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
		BLK64(PF0, LD, i)	\
		BLK64(PF1, XO1, i)	\
		BLK64(PF2, XO2, i)	\
		BLK64(PF3, XO3, i)	\
		BLK64(PF4, XO4, i)	\
		BLK64(NOP, ST, i)	\

	" .align 32 ;\n"
	" 1: ;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	" add %[inc], %[p1] ;\n"
	" add %[inc], %[p2] ;\n"
	" add %[inc], %[p3] ;\n"
	" add %[inc], %[p4] ;\n"
	" add %[inc], %[p5] ;\n"
	" dec %[cnt] ;\n"
	" jnz 1b ;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}
458 | ||
/*
 * Template bundling the pf64 (BLK64-scheduled) SSE variants under the
 * name "prefetch64-sse".
 */
static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};
466 | ||
e8f6e3f8 JB |
/* Drop the asm helper macros so they don't leak into other headers. */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

/* Pull in the remaining, bitness-specific xor templates. */
#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

/*
 * NOTE(review): AVX_SELECT is defined elsewhere (presumably it prefers
 * an AVX template over FASTEST when AVX is usable — verify against its
 * definition).
 */
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)
487 | ||
e8f6e3f8 | 488 | #endif /* _ASM_X86_XOR_H */ |