powerpc: Fix 64bit memcpy() regression
[linux-2.6-block.git] / arch / powerpc / lib / memcpy_64.S
index 7173ba98f427d293fc6f66727a4448be2da40ff2..e178922b2c2129e808a427464ce8fccc34643309 100644 (file)
        .align  7
 _GLOBAL(memcpy)
        std     r3,48(r1)       /* save destination pointer for return value */
-       mtcrf   0x01,r5
+       PPC_MTOCRF      0x01,r5
        cmpldi  cr1,r5,16
        neg     r6,r3           # LS 3 bits = # bytes to 8-byte dest bdry
        andi.   r6,r6,7
        dcbt    0,r4
        blt     cr1,.Lshort_copy
+/* Below we want to nop out the bne if we're on a CPU that has the
+   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
+   cleared.
+   At the time of writing the only CPU that has this combination of bits
+   set is Power6. */
+BEGIN_FTR_SECTION
+       nop
+FTR_SECTION_ELSE
        bne     .Ldst_unaligned
+ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
+                    CPU_FTR_UNALIGNED_LD_STD)
 .Ldst_aligned:
-       andi.   r0,r4,7
        addi    r3,r3,-16
+BEGIN_FTR_SECTION
+       andi.   r0,r4,7
        bne     .Lsrc_unaligned
+END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
        srdi    r7,r5,4
        ld      r9,0(r4)
        addi    r4,r4,-8
@@ -41,18 +53,19 @@ _GLOBAL(memcpy)
 3:     std     r8,8(r3)
        beq     3f
        addi    r3,r3,16
-       ld      r9,8(r4)
 .Ldo_tail:
        bf      cr7*4+1,1f
-       rotldi  r9,r9,32
+       lwz     r9,8(r4)
+       addi    r4,r4,4
        stw     r9,0(r3)
        addi    r3,r3,4
 1:     bf      cr7*4+2,2f
-       rotldi  r9,r9,16
+       lhz     r9,8(r4)
+       addi    r4,r4,2
        sth     r9,0(r3)
        addi    r3,r3,2
 2:     bf      cr7*4+3,3f
-       rotldi  r9,r9,8
+       lbz     r9,8(r4)
        stb     r9,0(r3)
 3:     ld      r3,48(r1)       /* return dest pointer */
        blr
@@ -121,17 +134,30 @@ _GLOBAL(memcpy)
        cmpwi   cr1,r5,8
        addi    r3,r3,32
        sld     r9,r9,r10
-       ble     cr1,.Ldo_tail
+       ble     cr1,6f
        ld      r0,8(r4)
        srd     r7,r0,r11
        or      r9,r7,r9
-       b       .Ldo_tail
+6:
+       bf      cr7*4+1,1f
+       rotldi  r9,r9,32
+       stw     r9,0(r3)
+       addi    r3,r3,4
+1:     bf      cr7*4+2,2f
+       rotldi  r9,r9,16
+       sth     r9,0(r3)
+       addi    r3,r3,2
+2:     bf      cr7*4+3,3f
+       rotldi  r9,r9,8
+       stb     r9,0(r3)
+3:     ld      r3,48(r1)       /* return dest pointer */
+       blr
 
 .Ldst_unaligned:
-       mtcrf   0x01,r6         # put #bytes to 8B bdry into cr7
+       PPC_MTOCRF      0x01,r6         # put #bytes to 8B bdry into cr7
        subf    r5,r6,r5
        li      r7,0
-       cmpldi  r1,r5,16
+       cmpldi  cr1,r5,16
        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        stb     r0,0(r3)
@@ -143,7 +169,7 @@ _GLOBAL(memcpy)
 2:     bf      cr7*4+1,3f
        lwzx    r0,r7,r4
        stwx    r0,r7,r3
-3:     mtcrf   0x01,r5
+3:     PPC_MTOCRF      0x01,r5
        add     r4,r6,r4
        add     r3,r6,r3
        b       .Ldst_aligned