/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */

dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

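/*
 * Rough sketch of the strategy used below, in C-like pseudocode. This is
 * only an illustration of the flow of this file, not a literal translation
 * of the instruction sequence:
 *
 *	void *memset(void *buf, int c, size_t n)
 *	{
 *		u64 v = 0x0101010101010101UL * (u8)c;	// broadcast the byte
 *
 *		if (n <= 15) {
 *			store 8/4/2/1 bytes as selected by bits 3..0 of n;
 *			return buf;
 *		}
 *		if buf is not 16-byte aligned, store 16 bytes at buf and
 *		advance buf/shrink n to the next 16-byte boundary;
 *		if (v == 0 && n >= 128 && "dc zva" is usable)
 *			zero whole blocks with "dc zva", then handle the tail;
 *		else
 *			run a 64-byte stp loop, then a 16..63 byte tail that
 *			may overlap earlier stores;
 *		return buf;
 *	}
 */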
ENTRY(memset)
	mov	dst, dstin	/* Preserve return value. */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
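	/*
	 * A_l now holds the fill byte replicated across all 8 byte lanes,
	 * e.g. c = 0xAB gives A_l = 0xABABABABABABABAB. Writes to the W
	 * view (A_lw) zero the upper 32 bits of x7, so the final orr with
	 * "lsl #32" copies the low word into the high word.
	 */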

	cmp	count, #15
	b.hi	.Lover16_proc
	/*
	 * count is at most 15 here, so bits 3..0 of count each select one
	 * 8/4/2/1-byte store; the stores may be unaligned.
	 */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* Is the start address 16-byte aligned? */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
/*
 * count is at least 16, so we can use an stp to store the first 16 bytes,
 * then advance dst past the misaligned part. This leaves dst on a 16-byte
 * boundary for the rest of the fill.
 */
	stp	A_l, A_l, [dst]		/* Possibly unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2
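	/*
	 * Worked example (hypothetical values): dst = 0x1003 gives
	 * tmp2 = (-0x1003) & 15 = 13; the stp above fills 0x1003..0x1012,
	 * then dst becomes 0x1010 and count shrinks by 13, so up to 15
	 * bytes may be written twice but never outside the buffer.
	 */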

.Laligned:
	cbz	A_l, .Lzero_mem

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
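	/*
	 * At most 63 bytes remain here. count & 0x30 selects how many full
	 * 16-byte blocks are left: 0x30, 0x20 and 0x10 fall through to
	 * three, two or one of the stp instructions below, and any final
	 * 1..15 bytes are finished at label 3:.
	 */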
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
/*
 * Fewer than 16 bytes remain, so write the final 16 bytes with an stp that
 * ends exactly at the end of the buffer. Some bytes are written twice and
 * the access may be unaligned.
 */
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16	/* Pre-bias. */
	sub	count, count, #64
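	/*
	 * dst is pre-biased by -16 so the four stp instructions can use
	 * offsets 16/32/48/64; the last one uses pre-index writeback to
	 * advance dst by 64 per iteration. count is pre-biased by -64 so
	 * the loop keeps going while at least 64 more bytes are needed.
	 */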
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret

	/*
	 * For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.
	 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short	/* Fall through only when count >= 128. */

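	/*
	 * DCZID_EL0 layout (per the ARMv8 architecture): bits [3:0] hold
	 * log2 of the DC ZVA block size in 4-byte words, and bit 4 (DZP)
	 * set means DC ZVA is prohibited. Hence zva_len = 4 << BS bytes
	 * below, and the tbnz bails out when ZVA must not be used.
	 */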
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is at least 64: using ZVA is not worthwhile when
	 * the block size is smaller than 64 bytes.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	2f			/* Already aligned. */
	/* Not aligned; check that enough remains after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length left after alignment is at least 64
	 * bytes and at least one ZVA block, so the 64-byte alignment loop
	 * below can safely overrun and at least one whole block remains.
	 */
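	/*
	 * The cmp/ccmp pair below implements "tmp1 >= 64 && tmp1 >= zva_len":
	 * if the first comparison yields ge, ccmp compares tmp1 with
	 * zva_len_x; otherwise it sets NZCV to 0b1000 (N set), so the
	 * following b.lt is taken and we fall back to .Lnot_short.
	 */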
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x
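	/*
	 * count is pre-biased by one block so the loop below keeps running
	 * while a full block is still left. Each "dc zva" zeroes a whole
	 * zva_len-byte block at dst (dst is block-aligned here); the final
	 * ands recovers the sub-block residue for .Ltail_maybe_long.
	 */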
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
ENDPROC(memset)