/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
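/*
 * Rough C-equivalent of the value this routine produces (an illustrative
 * sketch only, using kernel-style u16/u32/u64 types; the assembly below is
 * the actual implementation):
 *
 *	u16 ip_fast_csum_sketch(const void *iph, unsigned int ihl)
 *	{
 *		const u32 *p = iph;
 *		u64 sum = 0;
 *		unsigned int i;
 *
 *		for (i = 0; i < ihl; i++)	// ihl is in 32-bit words, >= 5
 *			sum += p[i];
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// fold 64 -> 32
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold 32 -> 16
 *		sum = (sum & 0xffff) + (sum >> 16);	// add carry back in
 *		return (u16)~sum;
 *	}
 */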
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Compute checksum of TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 */
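/*
 * Rough C-equivalent (illustrative sketch only; the pseudo-header word is
 * assembled big-endian, matching the rlwimi below which forms
 * (proto << 16) | len in r5):
 *
 *	u16 csum_tcpudp_magic_sketch(u32 saddr, u32 daddr, u32 len,
 *				     u8 proto, u32 sum)
 *	{
 *		u64 s = (u64)saddr + daddr + ((u32)proto << 16)
 *			+ (len & 0xffff) + sum;
 *
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 32
 *		s = (s & 0xffff) + (s >> 16);		// fold 32 -> 16
 *		s = (s & 0xffff) + (s >> 16);		// add carry back in
 *		return (u16)~s;
 *	}
 */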
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
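/*
 * Rough C-equivalent of the partial checksum (illustrative sketch only;
 * the returned 32-bit value may differ bit-for-bit from the assembly, but
 * it is equivalent as a 1's complement sum):
 *
 *	u32 csum_partial_sketch(const void *buff, int len, u32 sum)
 *	{
 *		const u8 *p = buff;
 *		u64 s = sum;
 *
 *		while (len > 1) {		// big-endian 16-bit words
 *			s += (p[0] << 8) | p[1];
 *			p += 2;
 *			len -= 2;
 *		}
 *		if (len)			// trailing byte, zero padded
 *			s += p[0] << 8;
 *		while (s >> 32)			// fold carries back in
 *			s = (s & 0xffffffff) + (s >> 32);
 *		return (u32)s;
 *	}
 */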
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
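	/*
	 * The counter below is set to len/64 - 1 because the last 64-byte
	 * block is summed by the unrolled exit limb after the bdnz; the
	 * len >= 128 check just below guarantees the counter is at least 1.
	 */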
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
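	/*
	 * Each iteration below issues eight dependent addes (one per
	 * doubleword of the 64-byte block); at 2 cycles each that is the
	 * 16-cycle floor mentioned above.
	 */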
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

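/*
 * The macros below tag the load or store that follows them with an
 * __ex_table entry, so a fault on that access branches to the matching
 * error handler. "source"/"dest" are only used inside the unrolled loop,
 * where r14-r16 have been saved on the stack, so their handlers restore
 * those registers and pop the frame first; the "srcnr"/"dstnr" variants
 * are used outside that region and go straight to the error return.
 */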
	.macro srcnr
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error_nr
	.previous
	.endm

	.macro source
150:
	.section __ex_table,"a"
	.align 3
	.llong 150b,.Lsrc_error
	.previous
	.endm

	.macro dstnr
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error_nr
	.previous
	.endm

	.macro dest
250:
	.section __ex_table,"a"
	.align 3
	.llong 250b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
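/*
 * Illustrative caller-side sketch (names and types are an example, not a
 * specific kernel call site):
 *
 *	int src_err = 0, dst_err = 0;
 *	u32 csum;
 *
 *	csum = csum_partial_copy_generic(src, dst, len, sum,
 *					 &src_err, &dst_err);
 *	if (src_err || dst_err) {
 *		// -EFAULT was stored through the failing side's pointer;
 *		// the caller must recover, e.g. zero the destination or
 *		// propagate the fault.
 *	}
 */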
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr