/* -*- linux-c -*- ------------------------------------------------------- *
 *
 *   Copyright (C) 2012 Intel Corporation
 *   Author: Yuanhan Liu <yuanhan.liu@linux.intel.com>
 *
 *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
 *
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
 *   Boston MA 02111-1307, USA; either version 2 of the License, or
 *   (at your option) any later version; incorporated herein by reference.
 *
 * ----------------------------------------------------------------------- */

/*
 * AVX2 implementation of RAID-6 syndrome functions
 *
 */

#ifdef CONFIG_AS_AVX2

#include <linux/raid/pq.h>
#include "x86.h"

static const struct raid6_avx2_constants {
	u64 x1d[4];
} raid6_avx2_constants __aligned(32) = {
	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
};
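
/*
 * Background for the asm below: P is the plain byte-wise XOR of all data
 * blocks, and Q is the Reed-Solomon syndrome Q = sum over z of g^z * D_z
 * in GF(2^8) with generator g = 2.  Multiplying a byte by 2 in this field
 * is a left shift plus a conditional XOR with 0x1d (the low bits of the
 * field polynomial 0x11d) whenever the shifted-out bit was set.  A rough
 * scalar sketch of what each vpcmpgtb/vpaddb/vpand/vpxor group does
 * (gf2_mul2 is a hypothetical helper, shown for illustration only):
 *
 *	static inline u8 gf2_mul2(u8 v)
 *	{
 *		u8 fix = (v & 0x80) ? 0x1d : 0;	 // vpcmpgtb + vpand
 *		return (u8)(v << 1) ^ fix;	 // vpaddb + vpxor
 *	}
 *
 * vpcmpgtb against an all-zero register yields 0xff exactly for the bytes
 * whose top bit is set (they compare as negative), so ANDing the result
 * with ymm0 selects 0x1d where the reduction is needed, for all 32 bytes
 * of a ymm register at once.
 */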

static int raid6_have_avx2(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX);
}

/*
 * Plain AVX2 implementation
 */
static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* Zero temp */

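	/*
	 * Each outer iteration handles one 32-byte slice of every disk and
	 * walks the data disks from highest (z0) down to 0, Horner style:
	 * ymm2 accumulates P by plain XOR, while ymm4 is multiplied by 2 in
	 * GF(2^8) and XORed with the next disk's slice to build Q.  The
	 * aligned vmovdqa/vmovntdq accesses assume bytes is a multiple of 32
	 * and the blocks are 32-byte aligned (whole pages in practice).
	 */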
	for (d = 0; d < bytes; d += 32) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
		asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */
		asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d]));
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm6,%ymm2,%ymm2");
			asm volatile("vpxor %ymm6,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d]));
		}
		asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5");
		asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
		asm volatile("vpand %ymm0,%ymm5,%ymm5");
		asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		asm volatile("vpxor %ymm6,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm4,%ymm4");

		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

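/*
 * Partial-stripe update: fold the P/Q contribution of data disks
 * start..stop into the existing P and Q blocks.  Disks above stop are
 * skipped entirely ("right side" optimization); for disks below start
 * only the multiply-by-2 step is applied to the running Q, since their
 * data is not part of this update ("left side" optimization).  The
 * result is XORed into p[] and q[] rather than overwriting them.  The
 * unrolled variants further down follow the same scheme over wider
 * slices.
 */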
static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 32) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x1 = {
	raid6_avx21_gen_syndrome,
	raid6_avx21_xor_syndrome,
	raid6_have_avx2,
	"avx2x1",
	1			/* Has cache hints */
};
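
/*
 * The positional initializer above follows struct raid6_calls in
 * <linux/raid/pq.h>: gen_syndrome, xor_syndrome, valid, name, and a
 * final flag (documented as "Has cache hints").  At initialization,
 * lib/raid6/algos.c benchmarks every entry whose valid() callback
 * returns true and selects the fastest gen_syndrome implementation.
 */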

/*
 * Unrolled-by-2 AVX2 implementation
 */
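/*
 * Same algorithm as avx2x1, but each loop iteration processes two
 * independent 32-byte lanes (64 bytes per disk) in separate register
 * pairs, giving the core two GF(2^8) dependency chains to overlap.
 */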
static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */

	/* We uniformly assume a single prefetch covers at least 32 bytes */
	for (d = 0; d < bytes; d += 64) {
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */
		asm volatile("vmovdqa %ymm2,%ymm4");	/* Q[0] */
		asm volatile("vmovdqa %ymm3,%ymm6");	/* Q[1] */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 64) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
		}
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		/* Don't use movntdq for r/w memory area < cache line */
		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x2 = {
	raid6_avx22_gen_syndrome,
	raid6_avx22_xor_syndrome,
	raid6_have_avx2,
	"avx2x2",
	1			/* Has cache hints */
};

#ifdef CONFIG_X86_64

/*
 * Unrolled-by-4 AVX2 implementation
 */
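/*
 * Four independent 32-byte lanes per iteration (128 bytes per disk).
 * This variant needs ymm8-ymm15 in addition to ymm0-ymm7, and those
 * upper registers only exist in 64-bit mode, hence the CONFIG_X86_64
 * guard around it.
 */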
static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
	asm volatile("vpxor %ymm1,%ymm1,%ymm1");	/* Zero temp */
	asm volatile("vpxor %ymm2,%ymm2,%ymm2");	/* P[0] */
	asm volatile("vpxor %ymm3,%ymm3,%ymm3");	/* P[1] */
	asm volatile("vpxor %ymm4,%ymm4,%ymm4");	/* Q[0] */
	asm volatile("vpxor %ymm6,%ymm6,%ymm6");	/* Q[1] */
	asm volatile("vpxor %ymm10,%ymm10,%ymm10");	/* P[2] */
	asm volatile("vpxor %ymm11,%ymm11,%ymm11");	/* P[3] */
	asm volatile("vpxor %ymm12,%ymm12,%ymm12");	/* Q[2] */
	asm volatile("vpxor %ymm14,%ymm14,%ymm14");	/* Q[3] */

	for (d = 0; d < bytes; d += 128) {
		for (z = z0; z >= 0; z--) {
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64]));
			asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96]));
			asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vpxor %ymm2,%ymm2,%ymm2");
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vpxor %ymm3,%ymm3,%ymm3");
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vpxor %ymm10,%ymm10,%ymm10");
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %ymm11,%ymm11,%ymm11");
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vpxor %ymm4,%ymm4,%ymm4");
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vpxor %ymm6,%ymm6,%ymm6");
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vpxor %ymm12,%ymm12,%ymm12");
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
		asm volatile("vpxor %ymm14,%ymm14,%ymm14");
	}

	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
				     size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));

	for (d = 0 ; d < bytes ; d += 128) {
		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
		asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
		asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
		asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
		asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
		asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
		asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
		asm volatile("vpxor %ymm6,%ymm3,%ymm3");
		asm volatile("vpxor %ymm12,%ymm10,%ymm10");
		asm volatile("vpxor %ymm14,%ymm11,%ymm11");
		/* P/Q data pages */
		for (z = z0-1 ; z >= start ; z--) {
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
			asm volatile("vmovdqa %0,%%ymm7"
				     :: "m" (dptr[z][d+32]));
			asm volatile("vmovdqa %0,%%ymm13"
				     :: "m" (dptr[z][d+64]));
			asm volatile("vmovdqa %0,%%ymm15"
				     :: "m" (dptr[z][d+96]));
			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile("vpxor %ymm7,%ymm3,%ymm3");
			asm volatile("vpxor %ymm13,%ymm10,%ymm10");
			asm volatile("vpxor %ymm15,%ymm11,%ymm11");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("prefetchnta %0" :: "m" (q[d]));
		asm volatile("prefetchnta %0" :: "m" (q[d+64]));
		/* P/Q left side optimization */
		for (z = start-1 ; z >= 0 ; z--) {
			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
			asm volatile("vpxor %ymm7,%ymm7,%ymm7");
			asm volatile("vpxor %ymm13,%ymm13,%ymm13");
			asm volatile("vpxor %ymm15,%ymm15,%ymm15");
			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
			asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
			asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
			asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
			asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
			asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
			asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
			asm volatile("vpand %ymm0,%ymm5,%ymm5");
			asm volatile("vpand %ymm0,%ymm7,%ymm7");
			asm volatile("vpand %ymm0,%ymm13,%ymm13");
			asm volatile("vpand %ymm0,%ymm15,%ymm15");
			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
			asm volatile("vpxor %ymm7,%ymm6,%ymm6");
			asm volatile("vpxor %ymm13,%ymm12,%ymm12");
			asm volatile("vpxor %ymm15,%ymm14,%ymm14");
		}
		asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
		asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
		asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
		asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
		asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
		asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
		asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
		asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
		asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
		asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
		asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
	}
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}

const struct raid6_calls raid6_avx2x4 = {
	raid6_avx24_gen_syndrome,
	raid6_avx24_xor_syndrome,
	raid6_have_avx2,
	"avx2x4",
	1			/* Has cache hints */
};
#endif

#endif /* CONFIG_AS_AVX2 */