/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 */

/*
 * Cache-avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the Hammer yet, but there is likely
 * no advantage to be gained from x86-64 here anyway.
 */

#include <asm/fpu/api.h>

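/*
 * The 256-byte loop increment is fed to the asm blocks through the
 * [inc] operand below; its constraint differs per arch: on 32-bit an
 * immediate ("i") keeps it out of the scarce general-purpose registers,
 * while on 64-bit "re" lets the compiler pick either a register or a
 * sign-extendable immediate.
 */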
#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif

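/*
 * Building blocks for the inline asm below (the x/y arguments are
 * stringized into the assembler text):
 *
 *   OFFS(x)    - byte offset of the x'th 16-byte chunk within a 256-byte line
 *   PF_OFFS(x) - the same offset, one 256-byte line ahead
 *   PF0..PF4   - prefetchnta ahead of the streams p1..p5
 *   LD/ST      - movaps a 16-byte chunk of p1 into/out of %xmm<y>
 *   XO1..XO4   - xorps a 16-byte chunk of p2..p5 into %xmm<y>
 *   NOP        - placeholder so BLK64 can omit the prefetch step
 */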
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"	movaps "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"	movaps %%xmm"#y", "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define XO1(x, y)	"	xorps "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define NOP(x)

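/*
 * BLK64 covers one 64-byte block: a single prefetch followed by the same
 * operation applied to four consecutive 16-byte chunks held in
 * %xmm0..%xmm3.  E.g. BLK64(PF0, LD, 0) prefetches 256 bytes ahead of p1
 * and loads chunks 0..3 of the current line.
 */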
#define BLK64(pf, op, i)		\
	pf(i)				\
	op(i, 0)			\
	op(i + 1, 1)			\
	op(i + 2, 2)			\
	op(i + 3, 3)

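/*
 * Each xor_sse_N() routine XORs N buffers together, accumulating the
 * result into p1.  One loop iteration (bytes >> 8 of them) handles a
 * 256-byte line in 64-byte blocks using %xmm0..%xmm3, with non-temporal
 * prefetches issued one line ahead of every stream; the SSE register
 * state is bracketed by kernel_fpu_begin()/kernel_fpu_end().
 */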
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

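/*
 * The *_pf64 variants do the same work but group the asm via BLK64 so
 * that exactly one prefetchnta is issued per 64-byte block of each
 * stream (hence the "prefetch64-sse" template name, matching a 64-byte
 * cache line), whereas the functions above prefetch every 32 bytes.
 */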
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(NOP, ST, i)		\

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

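/*
 * The three-, four- and five-source versions below follow the same
 * scheme, adding one XOn stage (and its prefetch stream) per extra
 * source buffer while still accumulating into p1 through %xmm0..%xmm3.
 */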
static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(PF2, XO2, i)		\
	BLK64(NOP, ST, i)		\

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(PF2, XO2, i)		\
	BLK64(PF3, XO3, i)		\
	BLK64(NOP, ST, i)		\

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1),
	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)		\
	PF1(i)			\
	PF1(i + 2)		\
	LD(i, 0)		\
	LD(i + 1, 1)		\
	LD(i + 2, 2)		\
	LD(i + 3, 3)		\
	PF2(i)			\
	PF2(i + 2)		\
	XO1(i, 0)		\
	XO1(i + 1, 1)		\
	XO1(i + 2, 2)		\
	XO1(i + 3, 3)		\
	PF3(i)			\
	PF3(i + 2)		\
	XO2(i, 0)		\
	XO2(i + 1, 1)		\
	XO2(i + 2, 2)		\
	XO2(i + 3, 3)		\
	PF4(i)			\
	PF4(i + 2)		\
	PF0(i + 4)		\
	PF0(i + 6)		\
	XO3(i, 0)		\
	XO3(i + 1, 1)		\
	XO3(i + 2, 2)		\
	XO3(i + 3, 3)		\
	XO4(i, 0)		\
	XO4(i + 1, 1)		\
	XO4(i + 2, 2)		\
	XO4(i + 3, 3)		\
	ST(i, 0)		\
	ST(i + 1, 1)		\
	ST(i + 2, 2)		\
	ST(i + 3, 3)		\


	PF0(0)
	PF0(2)

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned long lines = bytes >> 8;

	kernel_fpu_begin();

	asm volatile(
#undef BLOCK
#define BLOCK(i)			\
	BLK64(PF0, LD, i)		\
	BLK64(PF1, XO1, i)		\
	BLK64(PF2, XO2, i)		\
	BLK64(PF3, XO3, i)		\
	BLK64(PF4, XO4, i)		\
	BLK64(NOP, ST, i)		\

	"	.align 32	;\n"
	" 1:			;\n"

	BLOCK(0)
	BLOCK(4)
	BLOCK(8)
	BLOCK(12)

	"	add %[inc], %[p1]	;\n"
	"	add %[inc], %[p2]	;\n"
	"	add %[inc], %[p3]	;\n"
	"	add %[inc], %[p4]	;\n"
	"	add %[inc], %[p5]	;\n"
	"	dec %[cnt]		;\n"
	"	jnz 1b			;\n"
	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
	: "memory");

	kernel_fpu_end();
}

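/*
 * Template exporting the prefetch-once-per-64-bytes variants.  The
 * generic XOR code benchmarks the registered templates at boot and
 * normally picks the fastest one.
 */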
static struct xor_block_template xor_block_sse_pf64 = {
	.name = "prefetch64-sse",
	.do_2 = xor_sse_2_pf64,
	.do_3 = xor_sse_3_pf64,
	.do_4 = xor_sse_4_pf64,
	.do_5 = xor_sse_5_pf64,
};

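/*
 * Rough usage sketch (not part of this header; variable names are
 * illustrative): callers do not invoke these routines directly but go
 * through xor_blocks() from crypto/xor.c, which dispatches to the
 * do_2..do_5 methods of the selected template, e.g.
 *
 *	void *srcs[1] = { src_buf };
 *
 *	xor_blocks(1, PAGE_SIZE, dst_buf, srcs);
 *
 * would land in the selected template's do_2 routine (such as
 * xor_sse_2_pf64) with p1 = dst_buf and p2 = src_buf.
 */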
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

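/*
 * asm/xor_32.h and asm/xor_64.h add the remaining per-arch templates
 * and define XOR_TRY_TEMPLATES.  AVX_SELECT() comes from asm/xor_avx.h
 * (pulled in by both headers) and substitutes the AVX template for the
 * benchmarked winner when the CPU supports it, so XOR_SELECT_TEMPLATE
 * lets AVX take precedence over the SSE variants in this file.
 */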
#define XOR_SELECT_TEMPLATE(FASTEST) \
	AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */