[linux-2.6-block.git] / arch / arc / lib / strcmp.S

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
 */

/* This is optimized primarily for the ARC700.
   It would be possible to speed up the loops by one cycle per word
   (respectively one cycle per byte) by forcing double source 1 alignment,
   unrolling by a factor of two, and speculatively loading the second
   word / byte of source 1; however, that would increase the overhead for
   loop setup / finish, and strcmp might often terminate early.  */

#include <linux/linkage.h>

/*
 * strcmp: compare two NUL-terminated strings byte by byte (unsigned).
 *
 * In:	r0, r1	addresses of the two strings; both are read with
 *		post-incrementing loads.
 * Out:	r0	0 if the strings are equal, a negative value if the first
 *		differing byte of the r0 string is the smaller one, a
 *		positive value otherwise.  The word paths return 1 or a
 *		value with bit 31 set; the byte path returns the byte
 *		difference.
 * Scratch: r2, r3, r4, r5, r12 and the flags.  Returns through blink.
 *
 * When both addresses are 4-byte aligned the strings are compared one
 * word at a time; otherwise the byte loop at .Lcharloop is used for the
 * whole comparison.
 */
ENTRY_CFI(strcmp)
	or	r2,r0,r1	; merge the low address bits of both pointers
	bmsk_s	r2,r2,1		; keep only bits 1..0
	brne	r2,0,.Lcharloop	; some pointer is not 4-byte aligned: go bytewise
	mov_s	r12,0x01010101	; r12 = 0x01 in every byte
	ror	r5,r12		; r5 = 0x80808080 (r12 rotated right by one bit)
.Lwordloop:
	ld.ab	r2,[r0,4]	; r2 = next word of the first string, r0 += 4
	ld.ab	r3,[r1,4]	; r3 = next word of the second string, r1 += 4
	nop_s			; NOTE(review): presumably scheduling padding after the loads - confirm
	/* r4 = (r2 - 0x01010101) & ~r2 & 0x80808080, which is nonzero
	   exactly when r2 contains a zero byte.  */
	sub	r4,r2,r12
	bic	r4,r4,r2
	and	r4,r4,r5
	brne	r4,0,.Lfound0	; terminator somewhere in this word of string 1
	breq	r2,r3,.Lwordloop	; no terminator and words equal: keep going
	/* Here the words differ and r2 holds no zero byte.  */
#ifdef	__LITTLE_ENDIAN__
	/* The first byte in memory is the least significant one, so reduce
	   both words to the lowest differing byte before comparing.  */
	xor	r0,r2,r3	; bits in which the two words differ
	sub_s	r1,r0,1
	bic_s	r0,r0,r1	; r0 & ~(r0 - 1): least significant difference bit
	sub	r1,r5,r0
	xor	r0,r5,r1	; bits from that bit up to bit 7 of the same byte
	and_s	r2,r2,r0	; drop everything outside that byte ...
	and_s	r3,r3,r0	; ... in both words
#endif /* LITTLE ENDIAN */
	/* On big endian the first byte in memory is the most significant
	   one, so the unmasked words are compared directly.  */
	cmp_s	r2,r3		; unsigned compare; flags are consumed in the delay slot
	mov_s	r0,1		; result for r2 > r3
	j_s.d	[blink]		; return; the next instruction executes in the delay slot
	bset.lo	r0,r0,31	; r2 < r3 (unsigned): set bit 31 to make r0 negative

	.balign	4
#ifdef __LITTLE_ENDIAN__
	/* A zero byte was detected in r2 (indicator bits in r4).  Compare
	   only up to the first difference or the first terminator,
	   whichever sits in the less significant position.  */
.Lfound0:
	xor	r0,r2,r3	; bits in which the two words differ
	or	r0,r0,r4	; or in zero indicator
	sub_s	r1,r0,1
	bic_s	r0,r0,r1	; least significant difference / terminator bit
	sub	r1,r5,r0
	xor	r0,r5,r1	; bits from that bit up to bit 7 of the same byte
	and_s	r2,r2,r0	; drop everything outside that byte ...
	and_s	r3,r3,r0	; ... in both words
	sub.f	r0,r2,r3	; r0 = 0 when the strings are equal; sets flags
	mov.hi	r0,1		; r2 > r3 (unsigned): return 1
	j_s.d	[blink]		; return; the next instruction executes in the delay slot
	bset.lo	r0,r0,31	; r2 < r3 (unsigned): set bit 31 to make r0 negative
#else /* BIG ENDIAN */
	/* The zero-detection above can mis-detect 0x01 bytes as zeroes
	   because of carry-propagation from a less significant zero byte.
	   We can compensate for this by checking that bit0 is zero.
	   This compensation is not necessary in the step where we
	   get a low estimate for r2, because in any affected bytes
	   we already have 0x00 or 0x01, which will remain unchanged
	   when bit 7 is cleared.  */
	.balign	4
.Lfound0:
	lsr	r0,r4,8		; zero indicator bits moved down by one byte
	lsr_s	r1,r2		; r1 = r2 >> 1: bit 0 of each byte lands on bit 7 of the byte below
	bic_s	r2,r2,r0	; get low estimate for r2 and get ...
	bic_s	r0,r0,r1	; <this is the adjusted mask for zeros>
	or_s	r3,r3,r0	; ... high estimate r3 so that r2 > r3 will ...
	cmp_s	r3,r2		; ... be independent of trailing garbage
	or_s	r2,r2,r0	; likewise for r3 > r2
	bic_s	r3,r3,r0
	rlc	r0,0		; r0 := r2 > r3 ? 1 : 0
	cmp_s	r2,r3		; flags for the r3 > r2 test, consumed in the delay slot
	j_s.d	[blink]		; return; the next instruction executes in the delay slot
	bset.lo	r0,r0,31	; r2 < r3 (unsigned): set bit 31 to make r0 negative
#endif /* ENDIAN */

	/* Byte-at-a-time comparison, used when some pointer is not
	   4-byte aligned.  */
	.balign	4
.Lcharloop:
	ldb.ab	r2,[r0,1]	; r2 = next byte of the first string, r0 += 1
	ldb.ab	r3,[r1,1]	; r3 = next byte of the second string, r1 += 1
	nop_s			; NOTE(review): presumably scheduling padding after the loads - confirm
	breq	r2,0,.Lcmpend	; first string ended
	breq	r2,r3,.Lcharloop	; bytes equal and not NUL: keep going
.Lcmpend:
	j_s.d	[blink]		; return; the next instruction executes in the delay slot
	sub	r0,r2,r3	; result = difference of the last bytes read
END_CFI(strcmp)
Commit	Line	Data
d2912cb1	1	/* SPDX-License-Identifier: GPL-2.0-only */
5210d1e6 VG	2	/*
5210d1e6 VG	3	* Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com)
5210d1e6 VG	4	*/
	5
	6	/* This is optimized primarily for the ARC700.
	7	It would be possible to speed up the loops by one cycle / word
	8	respective one cycle / byte by forcing double source 1 alignment, unrolling
	9	by a factor of two, and speculatively loading the second word / byte of
	10	source 1; however, that would increase the overhead for loop setup / finish,
	11	and strcmp might often terminate early. */
	12
ec7ac6af	13	#include <linux/linkage.h>
5210d1e6	14
86effd0d	15	ENTRY_CFI(strcmp)
5210d1e6 VG	16	or r2,r0,r1
	17	bmsk_s r2,r2,1
	18	brne r2,0,.Lcharloop
	19	mov_s r12,0x01010101
	20	ror r5,r12
	21	.Lwordloop:
	22	ld.ab r2,[r0,4]
	23	ld.ab r3,[r1,4]
	24	nop_s
	25	sub r4,r2,r12
	26	bic r4,r4,r2
	27	and r4,r4,r5
	28	brne r4,0,.Lfound0
	29	breq r2,r3,.Lwordloop
	30	#ifdef __LITTLE_ENDIAN__
	31	xor r0,r2,r3 ; mask for difference
	32	sub_s r1,r0,1
	33	bic_s r0,r0,r1 ; mask for least significant difference bit
	34	sub r1,r5,r0
	35	xor r0,r5,r1 ; mask for least significant difference byte
	36	and_s r2,r2,r0
	37	and_s r3,r3,r0
	38	#endif /* LITTLE ENDIAN */
	39	cmp_s r2,r3
	40	mov_s r0,1
	41	j_s.d [blink]
	42	bset.lo r0,r0,31
	43
	44	.balign 4
	45	#ifdef __LITTLE_ENDIAN__
	46	.Lfound0:
	47	xor r0,r2,r3 ; mask for difference
	48	or r0,r0,r4 ; or in zero indicator
	49	sub_s r1,r0,1
	50	bic_s r0,r0,r1 ; mask for least significant difference bit
	51	sub r1,r5,r0
	52	xor r0,r5,r1 ; mask for least significant difference byte
	53	and_s r2,r2,r0
	54	and_s r3,r3,r0
	55	sub.f r0,r2,r3
	56	mov.hi r0,1
	57	j_s.d [blink]
	58	bset.lo r0,r0,31
	59	#else /* BIG ENDIAN */
	60	/* The zero-detection above can mis-detect 0x01 bytes as zeroes
	61	because of carry-propagation from a less significant zero byte.
	62	We can compensate for this by checking that bit0 is zero.
	63	This compensation is not necessary in the step where we
	64	get a low estimate for r2, because in any affected bytes
	65	we already have 0x00 or 0x01, which will remain unchanged
	66	when bit 7 is cleared. */
	67	.balign 4
	68	.Lfound0:
	69	lsr r0,r4,8
	70	lsr_s r1,r2
	71	bic_s r2,r2,r0 ; get low estimate for r2 and get ...
	72	bic_s r0,r0,r1 ; <this is the adjusted mask for zeros>
	73	or_s r3,r3,r0 ; ... high estimate r3 so that r2 > r3 will ...
	74	cmp_s r3,r2 ; ... be independent of trailing garbage
	75	or_s r2,r2,r0 ; likewise for r3 > r2
	76	bic_s r3,r3,r0
	77	rlc r0,0 ; r0 := r2 > r3 ? 1 : 0
	78	cmp_s r2,r3
	79	j_s.d [blink]
80	bset.lo r0,r0,31
81	#endif /* ENDIAN */
82
83	.balign 4
84	.Lcharloop:
85	ldb.ab r2,[r0,1]
86	ldb.ab r3,[r1,1]
87	nop_s
88	breq r2,0,.Lcmpend
89	breq r2,r3,.Lcharloop
90	.Lcmpend:
91	j_s.d [blink]
92	sub r0,r2,r3
86effd0d	93	END_CFI(strcmp)