[linux-2.6-block.git] / arch / alpha / lib / ev6-memcpy.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-memcpy.S
 * 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Reasonably optimized memcpy() routine for the Alpha 21264
 *
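 *	- quadword loads and stores (ldq/stq) are always 8-byte aligned;
 *	  a misaligned source is read with ldq_u and merged with extql/extqh
 *	- leading and trailing bytes are moved one at a time with ldbu/stb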
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 *
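 * Temp usage notes:
 *	$1,$2		- scratch
 *	$3-$6		- data in flight (unrolled loop and misaligned merge)
 *	$4		- running dest pointer in the misaligned path
 *	$7		- wh64 hint address in the unrolled loop
 *	$16		- reused as a data temp in the misaligned quad loop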
 */
#include <asm/export.h>
	.set noreorder
	.set noat

	.align	4
	.globl memcpy
	.ent memcpy
memcpy:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Register roles, as used by the code below:
	 *	$16	- dest pointer on entry (advanced in the co-aligned
	 *		  path; reused as a data temp in $mis_quad)
	 *	$17	- src pointer, advanced as bytes are consumed
	 *	$18	- remaining byte count, tested as a signed value
	 *	$0	- return value: the dest pointer as passed in
	 *	$26	- return address
	 * A count <= 0 copies nothing and returns immediately.
	 */

	mov	$16, $0			# E : copy dest to return
	ble	$18, $nomoredata	# U : done with the copy?
	xor	$16, $17, $1		# E : are source and dest alignments the same?
	and	$1, 7, $1		# E : are they the same mod 8?

	bne	$1, $misaligned		# U : Nope - gotta do this the slow way
	/* source and dest are same mod 8 address */
	and	$16, 7, $1		# E : Are both 0mod8?
	beq	$1, $both_0mod8		# U : Yes
	nop				# E :

	/*
	 * source and dest are same misalignment.  move a byte at a time
	 * until a 0mod8 alignment for both is reached.
	 * count > 0 holds at the top of every iteration: it was checked
	 * on entry and is re-checked before each branch back.
	 */

$head_align:
	ldbu	$1, 0($17)		# L : grab a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	stb	$1, 0($16)		# L :
	addq	$16, 1, $16		# E : dest++
	and	$16, 7, $1		# E : Are we at 0mod8 yet?
	ble	$18, $nomoredata	# U : done with the copy?
	bne	$1, $head_align		# U :

	/*
	 * Both pointers are 8-byte aligned from here on.  With 127 bytes
	 * or fewer left the unrolled loop is skipped entirely; otherwise
	 * single quadwords are copied until dest is 64-byte aligned, so
	 * that each wh64 hint below names a whole 64-byte block of dest.
	 */
$both_0mod8:
	cmple	$18, 127, $1		# E : Can we unroll the loop?
	bne	$1, $no_unroll		# U :
	and	$16, 63, $1		# E : get mod64 alignment
	beq	$1, $do_unroll		# U : no single quads to fiddle

$single_head_quad:
	ldq	$1, 0($17)		# L : get 8 bytes
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store
	addq	$16, 8, $16		# E : dest += 8
	and	$16, 63, $1		# E : get mod64 alignment
	bne	$1, $single_head_quad	# U : still not fully aligned

	/*
	 * $7 starts one 64-byte block beyond the current dest.  The count
	 * is tested again because $single_head_quad may have consumed up
	 * to 56 bytes since the test at $both_0mod8.
	 */
$do_unroll:
	addq	$16, 64, $7		# E : Initial (+1 trip) wh64 address
	cmple	$18, 127, $1		# E : Can we go through the unrolled loop?
	bne	$1, $tail_quads		# U : Nope
	nop				# E :

	/*
	 * Each trip moves 64 bytes as two 32-byte halves ($6,$4,$5,$3).
	 * The wh64 at the top of a trip uses the address left in $7 by the
	 * previous trip (or by $do_unroll).  Within the trip $7 is advanced
	 * by 64 for the next one, unless fewer than 192 bytes remained at
	 * the start of this trip: then $7 is replaced by the fallback
	 * ($16 + 64, taken before dest is advanced), which is the block the
	 * next trip itself stores, so the hint never names memory beyond
	 * what will actually be written.
	 */
$unroll_body:
	wh64	($7)			# L1 : memory subsystem hint: 64 bytes at
					# ($7) are about to be over-written
	ldq	$6, 0($17)		# L0 : bytes 0..7
	nop				# E :
	nop				# E :

	ldq	$4, 8($17)		# L : bytes 8..15
	ldq	$5, 16($17)		# L : bytes 16..23
	addq	$7, 64, $7		# E : Update next wh64 address
	nop				# E :

	ldq	$3, 24($17)		# L : bytes 24..31
	addq	$16, 64, $1		# E : fallback value for wh64
	nop				# E :
	nop				# E :

	addq	$17, 32, $17		# E : src += 32 bytes
	stq	$6, 0($16)		# L : bytes 0..7
	nop				# E :
	nop				# E :

	stq	$4, 8($16)		# L : bytes 8..15
	stq	$5, 16($16)		# L : bytes 16..23
	subq	$18, 192, $2		# E : At least two more trips to go?
	nop				# E :

	stq	$3, 24($16)		# L : bytes 24..31
	addq	$16, 32, $16		# E : dest += 32 bytes
	nop				# E :
	nop				# E :

	ldq	$6, 0($17)		# L : bytes 32..39 (second half)
	ldq	$4, 8($17)		# L : bytes 40..47
	cmovlt	$2, $1, $7		# E : Latency 2, extra map slot - Use
					# fallback wh64 address if < 2 more trips
	nop				# E :

	ldq	$5, 16($17)		# L : bytes 48..55
	ldq	$3, 24($17)		# L : bytes 56..63
	addq	$16, 32, $16		# E : dest += 32
	subq	$18, 64, $18		# E : count -= 64

	addq	$17, 32, $17		# E : src += 32
	stq	$6, -32($16)		# L : bytes 32..39 (dest already advanced)
	stq	$4, -24($16)		# L : bytes 40..47
	cmple	$18, 63, $1		# E : At least one more trip?

	stq	$5, -16($16)		# L : bytes 48..55
	stq	$3, -8($16)		# L : bytes 56..63
	nop				# E :
	beq	$1, $unroll_body	# U : count >= 64, go around again

	/*
	 * Quadword tail for the co-aligned path.  The count is biased by
	 * -8 so that its sign alone says whether a whole quadword remains;
	 * the bias is removed again at $less_than_8.
	 */
$tail_quads:
$no_unroll:
	.align 4
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $less_than_8	# U : Nope
	nop				# E :
	nop				# E :

$move_a_quad:
	ldq	$1, 0($17)		# L : fetch 8
	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	nop				# E :

	stq	$1, 0($16)		# L : store 8
	addq	$16, 8, $16		# E : dest += 8
	bge	$18, $move_a_quad	# U : another whole quad remains
	nop				# E :

$less_than_8:
	.align 4
	addq	$18, 8, $18		# E : add back for trailing bytes
	ble	$18, $nomoredata	# U : All-done
	nop				# E :
	nop				# E :

	/* Trailing bytes: 1..7 bytes left, moved one at a time */
$tail_bytes:
	subq	$18, 1, $18		# E : count--
	ldbu	$1, 0($17)		# L : fetch a byte
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($16)		# L : store a byte
	addq	$16, 1, $16		# E : dest++
	bgt	$18, $tail_bytes	# U : more to be done?
	nop				# E :

	/* branching to exit takes 3 extra cycles, so replicate exit here */
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	/*
	 * src and dest differ mod 8.  $4 becomes the running dest pointer
	 * ($0 keeps the return value; $16 is reused as a temp further
	 * down).  Bytes are copied singly until dest is 8-byte aligned.
	 */
$misaligned:
	mov	$0, $4			# E : dest temp
	and	$0, 7, $1		# E : dest alignment mod8
	beq	$1, $dest_0mod8		# U : dest is already 8-byte aligned
	nop

$aligndest:
	ble	$18, $nomoredata	# U : ran out of bytes before aligning
	ldbu	$1, 0($17)		# L : fetch a byte
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++

	stb	$1, 0($4)		# L : store it
	addq	$4, 1, $4		# E : dest++
	and	$4, 7, $1		# E : dest 0mod8 yet?
	bne	$1, $aligndest		# U : go until we are aligned.

	/*
	 * Source has unknown alignment, but dest is known to be 0mod8.
	 * The count is biased by -8 as in the co-aligned tail.  $3 is
	 * seeded with the aligned quadword that contains the first source
	 * byte (ldq_u ignores the low three address bits).
	 */
$dest_0mod8:
	subq	$18, 8, $18		# E : At least a quad left?
	blt	$18, $misalign_tail	# U : Nope
	ldq_u	$3, 0($17)		# L : seed (rotating load) of 8 bytes
	nop				# E :

	/*
	 * Per iteration: $3 holds the aligned quadword containing the
	 * current source byte and $16 receives the aligned quadword
	 * containing src + 8.  extql/extqh shift the two pieces into
	 * place using the low bits of $17, bis merges them, and the
	 * result goes out as one aligned store.  $16 then becomes the
	 * next iteration's $3.
	 */
$mis_quad:
	ldq_u	$16, 8($17)		# L : Fetch next 8
	extql	$3, $17, $3		# U : masking
	extqh	$16, $17, $1		# U : masking
	bis	$3, $1, $1		# E : merged bytes to store

	subq	$18, 8, $18		# E : count -= 8
	addq	$17, 8, $17		# E : src += 8
	stq	$1, 0($4)		# L : store 8 (aligned)
	mov	$16, $3			# E : "rotate" source data

	addq	$4, 8, $4		# E : dest += 8
	bge	$18, $mis_quad		# U : More quads to move
	nop
	nop

$misalign_tail:
	addq	$18, 8, $18		# E : account for tail stuff
	ble	$18, $nomoredata	# U : nothing left over
	nop
	nop

	/* 1..7 bytes left on the misaligned path, moved one at a time */
$misalign_byte:
	ldbu	$1, 0($17)		# L : fetch 1
	subq	$18, 1, $18		# E : count--
	addq	$17, 1, $17		# E : src++
	nop				# E :

	stb	$1, 0($4)		# L : store
	addq	$4, 1, $4		# E : dest++
	bgt	$18, $misalign_byte	# U : more to go?
	nop


	/* Common exit: $0 still holds the dest pointer passed in */
$nomoredata:
	ret	$31, ($26), 1		# L0 :
	nop				# E :
	nop				# E :
	nop				# E :

	.end memcpy
	/* Export memcpy; EXPORT_SYMBOL is expected to come from <asm/export.h>. */
	EXPORT_SYMBOL(memcpy)

/*
 * For backwards module compatibility: __memcpy is a global symbol
 * assigned the same value as memcpy, so both names reach this routine.
 */
__memcpy = memcpy
.globl __memcpy
Commit	Line	Data
b2441318	1	/* SPDX-License-Identifier: GPL-2.0 */
1da177e4 LT	2	/*
	3	* arch/alpha/lib/ev6-memcpy.S
	4	* 21264 version by Rick Gorton <rick.gorton@alpha-processor.com>
	5	*
	6	* Reasonably optimized memcpy() routine for the Alpha 21264
	7	*
	8	* - memory accessed as aligned quadwords only
	9	* - uses bcmpge to compare 8 bytes in parallel
	10	*
	11	* Much of the information about 21264 scheduling/coding comes from:
	12	* Compiler Writer's Guide for the Alpha 21264
	13	* abbreviated as 'CWG' in other comments here
	14	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
	15	* Scheduling notation:
	16	* E - either cluster
	17	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
	18	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
	19	*
	20	* Temp usage notes:
	21	* $1,$2, - scratch
	22	*/
00fc0e0d	23	#include <asm/export.h>
1da177e4 LT	24	.set noreorder
	25	.set noat
	26
	27	.align 4
	28	.globl memcpy
	29	.ent memcpy
	30	memcpy:
	31	.frame $30,0,$26,0
	32	.prologue 0
	33
	34	mov $16, $0 # E : copy dest to return
	35	ble $18, $nomoredata # U : done with the copy?
	36	xor $16, $17, $1 # E : are source and dest alignments the same?
	37	and $1, 7, $1 # E : are they the same mod 8?
	38
	39	bne $1, $misaligned # U : Nope - gotta do this the slow way
	40	/* source and dest are same mod 8 address */
	41	and $16, 7, $1 # E : Are both 0mod8?
	42	beq $1, $both_0mod8 # U : Yes
	43	nop # E :
	44
	45	/*
	46	* source and dest are same misalignment. move a byte at a time
	47	* until a 0mod8 alignment for both is reached.
	48	* At least one byte more to move
	49	*/
	50
	51	$head_align:
	52	ldbu $1, 0($17) # L : grab a byte
	53	subq $18, 1, $18 # E : count--
	54	addq $17, 1, $17 # E : src++
	55	stb $1, 0($16) # L :
	56	addq $16, 1, $16 # E : dest++
	57	and $16, 7, $1 # E : Are we at 0mod8 yet?
	58	ble $18, $nomoredata # U : done with the copy?
	59	bne $1, $head_align # U :
	60
	61	$both_0mod8:
	62	cmple $18, 127, $1 # E : Can we unroll the loop?
	63	bne $1, $no_unroll # U :
	64	and $16, 63, $1 # E : get mod64 alignment
	65	beq $1, $do_unroll # U : no single quads to fiddle
	66
	67	$single_head_quad:
	68	ldq $1, 0($17) # L : get 8 bytes
	69	subq $18, 8, $18 # E : count -= 8
	70	addq $17, 8, $17 # E : src += 8
	71	nop # E :
	72
	73	stq $1, 0($16) # L : store
	74	addq $16, 8, $16 # E : dest += 8
	75	and $16, 63, $1 # E : get mod64 alignment
	76	bne $1, $single_head_quad # U : still not fully aligned
	77
	78	$do_unroll:
	79	addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
	80	cmple $18, 127, $1 # E : Can we go through the unrolled loop?
	81	bne $1, $tail_quads # U : Nope
	82	nop # E :
	83
	84	$unroll_body:
	85	wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
	86	# ($7) are about to be over-written
	87	ldq $6, 0($17) # L0 : bytes 0..7
88	nop # E :
89	nop # E :
90
91	ldq $4, 8($17) # L : bytes 8..15
92	ldq $5, 16($17) # L : bytes 16..23
93	addq $7, 64, $7 # E : Update next wh64 address
94	nop # E :
95
96	ldq $3, 24($17) # L : bytes 24..31
97	addq $16, 64, $1 # E : fallback value for wh64
98	nop # E :
99	nop # E :
100
101	addq $17, 32, $17 # E : src += 32 bytes
102	stq $6, 0($16) # L : bytes 0..7
103	nop # E :
104	nop # E :
105
106	stq $4, 8($16) # L : bytes 8..15
107	stq $5, 16($16) # L : bytes 16..23
108	subq $18, 192, $2 # E : At least two more trips to go?
109	nop # E :
110
111	stq $3, 24($16) # L : bytes 24..31
112	addq $16, 32, $16 # E : dest += 32 bytes
113	nop # E :
114	nop # E :
115
116	ldq $6, 0($17) # L : bytes 0..7
117	ldq $4, 8($17) # L : bytes 8..15
118	cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
119	# fallback wh64 address if < 2 more trips
120	nop # E :
121
122	ldq $5, 16($17) # L : bytes 16..23
123	ldq $3, 24($17) # L : bytes 24..31
124	addq $16, 32, $16 # E : dest += 32
125	subq $18, 64, $18 # E : count -= 64
126
127	addq $17, 32, $17 # E : src += 32
128	stq $6, -32($16) # L : bytes 0..7
129	stq $4, -24($16) # L : bytes 8..15
130	cmple $18, 63, $1 # E : At least one more trip?
131
132	stq $5, -16($16) # L : bytes 16..23
133	stq $3, -8($16) # L : bytes 24..31
134	nop # E :
135	beq $1, $unroll_body
136
137	$tail_quads:
138	$no_unroll:
139	.align 4
140	subq $18, 8, $18 # E : At least a quad left?
141	blt $18, $less_than_8 # U : Nope
142	nop # E :
143	nop # E :
144
145	$move_a_quad:
146	ldq $1, 0($17) # L : fetch 8
147	subq $18, 8, $18 # E : count -= 8
148	addq $17, 8, $17 # E : src += 8
149	nop # E :
150
151	stq $1, 0($16) # L : store 8
152	addq $16, 8, $16 # E : dest += 8
153	bge $18, $move_a_quad # U :
154	nop # E :
155
156	$less_than_8:
157	.align 4
158	addq $18, 8, $18 # E : add back for trailing bytes
159	ble $18, $nomoredata # U : All-done
160	nop # E :
161	nop # E :
162
163	/* Trailing bytes */
164	$tail_bytes:
165	subq $18, 1, $18 # E : count--
166	ldbu $1, 0($17) # L : fetch a byte
167	addq $17, 1, $17 # E : src++
168	nop # E :
169
170	stb $1, 0($16) # L : store a byte
171	addq $16, 1, $16 # E : dest++
172	bgt $18, $tail_bytes # U : more to be done?
173	nop # E :
174
175	/* branching to exit takes 3 extra cycles, so replicate exit here */
176	ret $31, ($26), 1 # L0 :
177	nop # E :
178	nop # E :
179	nop # E :
180
181	$misaligned:
182	mov $0, $4 # E : dest temp
183	and $0, 7, $1 # E : dest alignment mod8
184	beq $1, $dest_0mod8 # U : life doesnt totally suck
185	nop
186
187	$aligndest:
188	ble $18, $nomoredata # U :
189	ldbu $1, 0($17) # L : fetch a byte
190	subq $18, 1, $18 # E : count--
191	addq $17, 1, $17 # E : src++
192
193	stb $1, 0($4) # L : store it
194	addq $4, 1, $4 # E : dest++
195	and $4, 7, $1 # E : dest 0mod8 yet?
196	bne $1, $aligndest # U : go until we are aligned.
197
198	/* Source has unknown alignment, but dest is known to be 0mod8 */
199	$dest_0mod8:
200	subq $18, 8, $18 # E : At least a quad left?
201	blt $18, $misalign_tail # U : Nope
202	ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
203	nop # E :
204
205	$mis_quad:
206	ldq_u $16, 8($17) # L : Fetch next 8
207	extql $3, $17, $3 # U : masking
208	extqh $16, $17, $1 # U : masking
209	bis $3, $1, $1 # E : merged bytes to store
210
211	subq $18, 8, $18 # E : count -= 8
212	addq $17, 8, $17 # E : src += 8
213	stq $1, 0($4) # L : store 8 (aligned)
214	mov $16, $3 # E : "rotate" source data
215
216	addq $4, 8, $4 # E : dest += 8
217	bge $18, $mis_quad # U : More quads to move
218	nop
219	nop
220
221	$misalign_tail:
222	addq $18, 8, $18 # E : account for tail stuff
223	ble $18, $nomoredata # U :
224	nop
225	nop
226
227	$misalign_byte:
228	ldbu $1, 0($17) # L : fetch 1
229	subq $18, 1, $18 # E : count--
230	addq $17, 1, $17 # E : src++
231	nop # E :
232
233	stb $1, 0($4) # L : store
234	addq $4, 1, $4 # E : dest++
235	bgt $18, $misalign_byte # U : more to go?
236	nop
237
238
239	$nomoredata:
240	ret $31, ($26), 1 # L0 :
241	nop # E :
242	nop # E :
243	nop # E :
244
245	.end memcpy
00fc0e0d	246	EXPORT_SYMBOL(memcpy)
1da177e4 LT	247
	248	/* For backwards module compatibility. */
	249	__memcpy = memcpy
	250	.globl __memcpy