1 /* SPDX-License-Identifier: GPL-2.0-or-later */
6 * Optimized RAID-5 checksumming functions for SSE.
10 * Cache avoiding checksumming functions utilizing KNI instructions
11 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
16 * High-speed RAID5 checksumming functions utilizing SSE instructions.
17 * Copyright (C) 1998 Ingo Molnar.
21 * x86-64 changes / gcc fixes from Andi Kleen.
22 * Copyright 2002 Andi Kleen, SuSE Labs.
24 * This hasn't been optimized for the hammer yet, but there are likely
25 * no advantages to be gotten from x86-64 here anyways.
28 #include <asm/fpu/api.h>
31 /* reduce register pressure */
/*
 * Constraint used for the [inc] asm operand (the 256-byte block size).
 * "i" forces an immediate; "re" allows a register or sign-extended
 * 32-bit immediate.
 * NOTE(review): two unconditional definitions are visible here; in the
 * complete file they are presumably selected by an #if/#else (e.g.
 * 32-bit vs 64-bit build) that is elided from this excerpt -- confirm.
 */
32 # define XOR_CONSTANT_CONSTRAINT "i"
34 # define XOR_CONSTANT_CONSTRAINT "re"
/* Byte offset (emitted as an asm string) of 16-byte chunk x within the
 * current 256-byte block. */
37 #define OFFS(x) "16*("#x")"
/* Same chunk offset one 256-byte block ahead -- the prefetch distance. */
38 #define PF_OFFS(x) "256+16*("#x")"
/* Non-temporal (cache-avoiding) prefetch from the result buffer p1. */
39 #define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
/* 16-byte aligned SSE load/store between p1 and register %xmm<y>. */
40 #define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
41 #define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
/* Non-temporal prefetches from the source buffers p2..p5. */
42 #define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
43 #define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
44 #define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
45 #define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
/* XOR (packed xorps) chunk x of source buffer p2..p5 into %xmm<y>. */
46 #define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
47 #define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
48 #define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
49 #define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
/*
 * Emit prefetch + operation over one 64-byte unit (four 16-byte chunks).
 * NOTE(review): the macro body continues past this excerpt.
 */
52 #define BLK64(pf, op, i) \
/*
 * xor_sse_2() - SSE XOR checksum over two buffers.
 * XORs p2 into p1 (p1 is the only non-const, read/write buffer; p2 is
 * read-only).  'bytes' is expected to be a multiple of 256: the loop
 * count is bytes >> 8, one 256-byte block per iteration.
 * NOTE(review): the asm body between the declaration and the pointer
 * advance below is elided in this excerpt.
 */
60 xor_sse_2(unsigned long bytes, unsigned long * __restrict p1,
61 const unsigned long * __restrict p2)
/* Number of 256-byte blocks to process. */
63 unsigned long lines = bytes >> 8;
/* Advance both pointers past the block just processed ([inc] == 256). */
99 " add %[inc], %[p1] ;\n"
100 " add %[inc], %[p2] ;\n"
/* Loop counter and both pointers are read-modify-write ("+r") operands. */
103 : [cnt] "+r" (lines),
104 [p1] "+r" (p1), [p2] "+r" (p2)
/* Block size; constraint chosen above to reduce register pressure. */
105 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
/*
 * xor_sse_2_pf64() - same contract as xor_sse_2() (p1 ^= p2 over 'bytes'
 * bytes), but with the alternative prefetch scheduling of the
 * "prefetch64-sse" template -- presumably a 64-byte prefetch stride per
 * the template name; confirm against the full asm body, which is elided
 * in this excerpt.
 */
112 xor_sse_2_pf64(unsigned long bytes, unsigned long * __restrict p1,
113 const unsigned long * __restrict p2)
/* One 256-byte block per loop iteration. */
115 unsigned long lines = bytes >> 8;
/* Step both pointers to the next 256-byte block ([inc] == 256). */
134 " add %[inc], %[p1] ;\n"
135 " add %[inc], %[p2] ;\n"
/* Counter and pointers are read/write; [inc] is a constant input. */
138 : [cnt] "+r" (lines),
139 [p1] "+r" (p1), [p2] "+r" (p2)
140 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
/*
 * xor_sse_3() - SSE XOR checksum over three buffers: p2 and p3 are XORed
 * into p1 (the single writable buffer).  Processes 256 bytes per loop
 * iteration (lines = bytes >> 8).
 * NOTE(review): asm body elided in this excerpt.
 */
147 xor_sse_3(unsigned long bytes, unsigned long * __restrict p1,
148 const unsigned long * __restrict p2,
149 const unsigned long * __restrict p3)
/* Number of 256-byte blocks. */
151 unsigned long lines = bytes >> 8;
/* Advance all three pointers past the processed block ([inc] == 256). */
193 " add %[inc], %[p1] ;\n"
194 " add %[inc], %[p2] ;\n"
195 " add %[inc], %[p3] ;\n"
/* Read-modify-write operands: loop counter plus the three pointers. */
198 : [cnt] "+r" (lines),
199 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
200 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
/*
 * xor_sse_3_pf64() - three-buffer variant used by the "prefetch64-sse"
 * template; same contract as xor_sse_3() (p1 ^= p2 ^ p3), differing only
 * in prefetch scheduling.
 * NOTE(review): asm body elided in this excerpt.
 */
207 xor_sse_3_pf64(unsigned long bytes, unsigned long * __restrict p1,
208 const unsigned long * __restrict p2,
209 const unsigned long * __restrict p3)
/* One 256-byte block per iteration. */
211 unsigned long lines = bytes >> 8;
/* Move to the next 256-byte block ([inc] == 256). */
231 " add %[inc], %[p1] ;\n"
232 " add %[inc], %[p2] ;\n"
233 " add %[inc], %[p3] ;\n"
/* Counter and pointers are read/write; [inc] is a constant input. */
236 : [cnt] "+r" (lines),
237 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
238 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
/*
 * xor_sse_4() - SSE XOR checksum over four buffers: p2..p4 are XORed into
 * p1 (the single writable buffer).  256 bytes per loop iteration.
 * NOTE(review): asm body elided in this excerpt.
 */
245 xor_sse_4(unsigned long bytes, unsigned long * __restrict p1,
246 const unsigned long * __restrict p2,
247 const unsigned long * __restrict p3,
248 const unsigned long * __restrict p4)
/* Number of 256-byte blocks. */
250 unsigned long lines = bytes >> 8;
/* Advance all four pointers past the processed block ([inc] == 256). */
298 " add %[inc], %[p1] ;\n"
299 " add %[inc], %[p2] ;\n"
300 " add %[inc], %[p3] ;\n"
301 " add %[inc], %[p4] ;\n"
/* Read-modify-write operands: loop counter plus the four pointers. */
304 : [cnt] "+r" (lines), [p1] "+r" (p1),
305 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
306 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
/*
 * xor_sse_4_pf64() - four-buffer variant used by the "prefetch64-sse"
 * template; same contract as xor_sse_4() (p1 ^= p2 ^ p3 ^ p4), differing
 * only in prefetch scheduling.
 * NOTE(review): asm body elided in this excerpt.
 */
313 xor_sse_4_pf64(unsigned long bytes, unsigned long * __restrict p1,
314 const unsigned long * __restrict p2,
315 const unsigned long * __restrict p3,
316 const unsigned long * __restrict p4)
/* One 256-byte block per iteration. */
318 unsigned long lines = bytes >> 8;
/* Move to the next 256-byte block ([inc] == 256). */
339 " add %[inc], %[p1] ;\n"
340 " add %[inc], %[p2] ;\n"
341 " add %[inc], %[p3] ;\n"
342 " add %[inc], %[p4] ;\n"
/* Counter and pointers are read/write; [inc] is a constant input. */
345 : [cnt] "+r" (lines), [p1] "+r" (p1),
346 [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
347 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
/*
 * xor_sse_5() - SSE XOR checksum over five buffers: p2..p5 are XORed into
 * p1 (the single writable buffer).  256 bytes per loop iteration.
 * NOTE(review): asm body elided in this excerpt.
 */
354 xor_sse_5(unsigned long bytes, unsigned long * __restrict p1,
355 const unsigned long * __restrict p2,
356 const unsigned long * __restrict p3,
357 const unsigned long * __restrict p4,
358 const unsigned long * __restrict p5)
/* Number of 256-byte blocks. */
360 unsigned long lines = bytes >> 8;
/* Advance all five pointers past the processed block ([inc] == 256). */
414 " add %[inc], %[p1] ;\n"
415 " add %[inc], %[p2] ;\n"
416 " add %[inc], %[p3] ;\n"
417 " add %[inc], %[p4] ;\n"
418 " add %[inc], %[p5] ;\n"
/* Read-modify-write operands: loop counter plus the five pointers. */
421 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
422 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
423 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
/*
 * xor_sse_5_pf64() - five-buffer variant used by the "prefetch64-sse"
 * template; same contract as xor_sse_5() (p1 ^= p2 ^ p3 ^ p4 ^ p5),
 * differing only in prefetch scheduling.
 * NOTE(review): asm body elided in this excerpt.
 */
430 xor_sse_5_pf64(unsigned long bytes, unsigned long * __restrict p1,
431 const unsigned long * __restrict p2,
432 const unsigned long * __restrict p3,
433 const unsigned long * __restrict p4,
434 const unsigned long * __restrict p5)
/* One 256-byte block per iteration. */
436 unsigned long lines = bytes >> 8;
/* Move to the next 256-byte block ([inc] == 256). */
458 " add %[inc], %[p1] ;\n"
459 " add %[inc], %[p2] ;\n"
460 " add %[inc], %[p3] ;\n"
461 " add %[inc], %[p4] ;\n"
462 " add %[inc], %[p5] ;\n"
/* Counter and pointers are read/write; [inc] is a constant input. */
465 : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
466 [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
467 : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
/*
 * XOR block template wiring the 2..5-buffer *_pf64 routines together
 * under the name "prefetch64-sse", for the kernel's xor-template
 * benchmarking/selection machinery.
 * NOTE(review): the initializer's closing brace lies outside this excerpt.
 */
473 static struct xor_block_template xor_block_sse_pf64 = {
474 .name = "prefetch64-sse",
475 .do_2 = xor_sse_2_pf64,
476 .do_3 = xor_sse_3_pf64,
477 .do_4 = xor_sse_4_pf64,
478 .do_5 = xor_sse_5_pf64,
491 #undef XOR_CONSTANT_CONSTRAINT
494 # include <asm/xor_32.h>
496 # include <asm/xor_64.h>
499 #define XOR_SELECT_TEMPLATE(FASTEST) \
502 #endif /* _ASM_X86_XOR_H */