[linux-2.6-block.git] / arch / x86 / include / asm / xor_avx.h

/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/fpu/api.h>

/*
 * BLOCK4(i) expands to four consecutive BLOCK(offset, reg) invocations at
 * byte offsets 32 * i, 32 * (i + 1), 32 * (i + 2) and 32 * (i + 3), paired
 * with register numbers 0..3.  BLOCK itself is (re)defined by each user
 * before the expansion.
 *
 * The argument is parenthesized so that an expression argument is scaled as
 * a whole rather than being split by operator precedence.
 */
#define BLOCK4(i) \
		BLOCK(32 * (i), 0) \
		BLOCK(32 * ((i) + 1), 1) \
		BLOCK(32 * ((i) + 2), 2) \
		BLOCK(32 * ((i) + 3), 3)

/*
 * BLOCK16() expands BLOCK4 at i = 0, 4, 8 and 12, i.e. sixteen BLOCK
 * invocations covering byte offsets 0..480 in 32-byte steps (512 bytes in
 * total), with the register number cycling through 0..3.
 */
#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

/*
 * xor_avx_2 - XOR buffer p1 into buffer p0 using AVX ymm registers.
 * @bytes: length of the buffers in bytes; only whole 512-byte units are
 *         processed (bytes >> 9 iterations), any remainder is left untouched
 * @p0:    destination buffer, also the first source; overwritten in place
 * @p1:    second source buffer
 *
 * Each loop iteration handles 512 bytes as sixteen 32-byte chunks, cycling
 * through ymm0..ymm3.  All register use is bracketed by kernel_fpu_begin()
 * and kernel_fpu_end().
 *
 * NOTE(review): vmovdqa is the aligned move form, so p0 and p1 are
 * presumably expected to be 32-byte aligned - confirm against callers.
 */
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	/* Number of 512-byte units to process. */
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
/*
 * BLOCK(i, reg): load the chunk at byte offset i of p1 into ymm<reg>,
 * XOR it with the chunk at the same offset of p0, store the result to p0.
 * i is a byte offset, hence the division by the element size when indexing.
 */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm"  #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		/* Advance both buffers by the 512 bytes just processed. */
		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

/*
 * xor_avx_3 - XOR buffers p1 and p2 into buffer p0 using AVX ymm registers.
 * @bytes: length of the buffers in bytes; only whole 512-byte units are
 *         processed (bytes >> 9 iterations), any remainder is left untouched
 * @p0:    destination buffer, also a source; overwritten in place
 * @p1:    second source buffer
 * @p2:    third source buffer
 *
 * Each loop iteration handles 512 bytes as sixteen 32-byte chunks, cycling
 * through ymm0..ymm3.  All register use is bracketed by kernel_fpu_begin()
 * and kernel_fpu_end().
 *
 * NOTE(review): vmovdqa is the aligned move form, so the buffers are
 * presumably expected to be 32-byte aligned - confirm against callers.
 */
static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	/* Number of 512-byte units to process. */
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
/*
 * BLOCK(i, reg): load the chunk at byte offset i of p2 into ymm<reg>,
 * XOR in the matching chunks of p1 and then p0, store the result to p0.
 */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		/* Advance every buffer by the 512 bytes just processed. */
		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

/*
 * xor_avx_5 - XOR buffers p1, p2, p3 and p4 into buffer p0 using AVX ymm
 * registers.
 * @bytes: length of the buffers in bytes; only whole 512-byte units are
 *         processed (bytes >> 9 iterations), any remainder is left untouched
 * @p0:    destination buffer, also a source; overwritten in place
 * @p1:    second source buffer
 * @p2:    third source buffer
 * @p3:    fourth source buffer
 * @p4:    fifth source buffer
 *
 * Each loop iteration handles 512 bytes as sixteen 32-byte chunks, cycling
 * through ymm0..ymm3.  All register use is bracketed by kernel_fpu_begin()
 * and kernel_fpu_end().
 *
 * NOTE(review): vmovdqa is the aligned move form, so the buffers are
 * presumably expected to be 32-byte aligned - confirm against callers.
 */
static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	/* Number of 512-byte units to process. */
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
/*
 * BLOCK(i, reg): load the chunk at byte offset i of p4 into ymm<reg>,
 * XOR in the matching chunks of p3, p2, p1 and then p0, store the result
 * to p0.
 */
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		/* Advance every buffer by the 512 bytes just processed. */
		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

/*
 * Template named "avx" that binds the 2-, 3-, 4- and 5-buffer entry points
 * to the xor_avx_N() routines defined in this header.
 */
static struct xor_block_template xor_block_avx = {
	.name	= "avx",
	.do_2	= xor_avx_2,
	.do_3	= xor_avx_3,
	.do_4	= xor_avx_4,
	.do_5	= xor_avx_5,
};

/*
 * AVX_XOR_SPEED: pass the AVX template to xor_speed(), but only when
 * boot_cpu_has() reports both X86_FEATURE_AVX and X86_FEATURE_OSXSAVE.
 * The checks run in that order and the second is skipped if the first fails.
 * Expands to a single statement; the caller supplies the trailing ';'.
 */
#define AVX_XOR_SPEED \
do { \
	if (!boot_cpu_has(X86_FEATURE_AVX)) \
		break; \
	if (!boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		break; \
	xor_speed(&xor_block_avx); \
} while (0)

/*
 * AVX_SELECT(FASTEST) evaluates to &xor_block_avx when boot_cpu_has()
 * reports both X86_FEATURE_AVX and X86_FEATURE_OSXSAVE, and to FASTEST
 * otherwise.
 *
 * FASTEST is parenthesized (as in the definition used when CONFIG_AS_AVX is
 * not set) so that any expression argument is taken as a whole in the
 * conditional's else-branch.
 */
#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? \
		&xor_block_avx : (FASTEST))

#else

/*
 * Fallbacks used when CONFIG_AS_AVX is not defined.
 *
 * AVX_XOR_SPEED is a no-op.  It is spelled do { } while (0) rather than {}
 * so that "AVX_XOR_SPEED;" is exactly one statement and stays valid in an
 * unbraced if/else, matching the form of the AVX-enabled definition.
 */
#define AVX_XOR_SPEED do { } while (0)

/* Without AVX the caller's candidate is passed through unchanged. */
#define AVX_SELECT(FASTEST) (FASTEST)

#endif
#endif
Commit	Line	Data
b886d83c	1	/* SPDX-License-Identifier: GPL-2.0-only */
ea4d26ae JK	2	#ifndef _ASM_X86_XOR_AVX_H
	3	#define _ASM_X86_XOR_AVX_H
	4
	5	/*
	6	* Optimized RAID-5 checksumming functions for AVX
	7	*
	8	* Copyright (C) 2012 Intel Corporation
	9	* Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
	10	*
	11	* Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
ea4d26ae JK	12	*/
	13
	14	#ifdef CONFIG_AS_AVX
	15
	16	#include <linux/compiler.h>
df6b35f4	17	#include <asm/fpu/api.h>
ea4d26ae	18
ea4d26ae JK	19	#define BLOCK4(i) \
	20	BLOCK(32 * i, 0) \
	21	BLOCK(32 * (i + 1), 1) \
	22	BLOCK(32 * (i + 2), 2) \
	23	BLOCK(32 * (i + 3), 3)
	24
	25	#define BLOCK16() \
	26	BLOCK4(0) \
	27	BLOCK4(4) \
	28	BLOCK4(8) \
	29	BLOCK4(12)
	30
	31	static void xor_avx_2(unsigned long bytes, unsigned long p0, unsigned long p1)
	32	{
841e3604	33	unsigned long lines = bytes >> 9;
ea4d26ae	34
841e3604	35	kernel_fpu_begin();
ea4d26ae JK	36
	37	while (lines--) {
	38	#undef BLOCK
	39	#define BLOCK(i, reg) \
	40	do { \
	41	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	42	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	43	"m" (p0[i / sizeof(*p0)])); \
	44	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
	45	"=m" (p0[i / sizeof(*p0)])); \
	46	} while (0);
	47
	48	BLOCK16()
	49
	50	p0 = (unsigned long *)((uintptr_t)p0 + 512);
	51	p1 = (unsigned long *)((uintptr_t)p1 + 512);
	52	}
	53
841e3604	54	kernel_fpu_end();
ea4d26ae JK	55	}
	56
	57	static void xor_avx_3(unsigned long bytes, unsigned long p0, unsigned long p1,
	58	unsigned long *p2)
	59	{
841e3604	60	unsigned long lines = bytes >> 9;
ea4d26ae	61
841e3604	62	kernel_fpu_begin();
ea4d26ae JK	63
	64	while (lines--) {
	65	#undef BLOCK
	66	#define BLOCK(i, reg) \
	67	do { \
	68	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	69	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	70	"m" (p1[i / sizeof(*p1)])); \
	71	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	72	"m" (p0[i / sizeof(*p0)])); \
	73	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
	74	"=m" (p0[i / sizeof(*p0)])); \
	75	} while (0);
	76
	77	BLOCK16()
	78
	79	p0 = (unsigned long *)((uintptr_t)p0 + 512);
	80	p1 = (unsigned long *)((uintptr_t)p1 + 512);
	81	p2 = (unsigned long *)((uintptr_t)p2 + 512);
	82	}
	83
841e3604	84	kernel_fpu_end();
ea4d26ae JK	85	}
	86
	87	static void xor_avx_4(unsigned long bytes, unsigned long p0, unsigned long p1,
	88	unsigned long p2, unsigned long p3)
	89	{
841e3604	90	unsigned long lines = bytes >> 9;
ea4d26ae	91
841e3604	92	kernel_fpu_begin();
ea4d26ae JK	93
	94	while (lines--) {
	95	#undef BLOCK
	96	#define BLOCK(i, reg) \
	97	do { \
	98	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	99	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	100	"m" (p2[i / sizeof(*p2)])); \
	101	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	102	"m" (p1[i / sizeof(*p1)])); \
	103	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	104	"m" (p0[i / sizeof(*p0)])); \
	105	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
	106	"=m" (p0[i / sizeof(*p0)])); \
	107	} while (0);
	108
	109	BLOCK16();
	110
	111	p0 = (unsigned long *)((uintptr_t)p0 + 512);
	112	p1 = (unsigned long *)((uintptr_t)p1 + 512);
	113	p2 = (unsigned long *)((uintptr_t)p2 + 512);
	114	p3 = (unsigned long *)((uintptr_t)p3 + 512);
	115	}
	116
841e3604	117	kernel_fpu_end();
ea4d26ae JK	118	}
	119
	120	static void xor_avx_5(unsigned long bytes, unsigned long p0, unsigned long p1,
	121	unsigned long p2, unsigned long p3, unsigned long *p4)
	122	{
841e3604	123	unsigned long lines = bytes >> 9;
ea4d26ae	124
841e3604	125	kernel_fpu_begin();
ea4d26ae JK	126
	127	while (lines--) {
	128	#undef BLOCK
	129	#define BLOCK(i, reg) \
	130	do { \
	131	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	132	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	133	"m" (p3[i / sizeof(*p3)])); \
	134	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	135	"m" (p2[i / sizeof(*p2)])); \
	136	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	137	"m" (p1[i / sizeof(*p1)])); \
	138	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
	139	"m" (p0[i / sizeof(*p0)])); \
	140	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
	141	"=m" (p0[i / sizeof(*p0)])); \
	142	} while (0);
	143
	144	BLOCK16()
	145
	146	p0 = (unsigned long *)((uintptr_t)p0 + 512);
	147	p1 = (unsigned long *)((uintptr_t)p1 + 512);
	148	p2 = (unsigned long *)((uintptr_t)p2 + 512);
	149	p3 = (unsigned long *)((uintptr_t)p3 + 512);
	150	p4 = (unsigned long *)((uintptr_t)p4 + 512);
	151	}
	152
841e3604	153	kernel_fpu_end();
ea4d26ae JK	154	}
	155
	156	static struct xor_block_template xor_block_avx = {
	157	.name = "avx",
	158	.do_2 = xor_avx_2,
	159	.do_3 = xor_avx_3,
	160	.do_4 = xor_avx_4,
	161	.do_5 = xor_avx_5,
	162	};
	163
	164	#define AVX_XOR_SPEED \
	165	do { \
da154e82	166	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
ea4d26ae JK	167	xor_speed(&xor_block_avx); \
	168	} while (0)
	169
	170	#define AVX_SELECT(FASTEST) \
da154e82	171	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)
ea4d26ae JK	172
	173	#else
	174
	175	#define AVX_XOR_SPEED {}
	176
	177	#define AVX_SELECT(FASTEST) (FASTEST)
	178
	179	#endif
	180	#endif