/* arch/powerpc/lib/memcpy_power7.S */
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE   0
#endif
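/*
 * SELFTEST_CASE is normally left at 0 here; the user-space copyloops
 * selftests are expected to override it (via test_feature below) so that
 * both the VMX and non-VMX paths can be exercised.
 */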

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
#endif
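/*
 * The LVS/VPERM wrappers above hide the endian difference in the unaligned
 * vector copy: big-endian uses lvsl and vperm(prev,next), little-endian
 * uses lvsr with the vperm inputs swapped, producing the same realignment
 * of two aligned quadword loads.
 */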

_GLOBAL(memcpy_power7)
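        /*
         * r3 = dest, r4 = src, r5 = len. The destination pointer is saved
         * below the stack pointer so it can be reloaded into r3 as the
         * return value. Copies under 16 bytes go straight to .Lshort_copy;
         * with Altivec, copies larger than 4096 bytes take the VMX path;
         * everything else falls through to .Lnonvmx_copy.
         */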
        cmpldi  r5,16
        cmpldi  cr1,r5,4096
        std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        blt     .Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
        bgt     cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
        neg     r6,r4
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-3)

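        /*
         * r6 = (-src) & 7 is the number of bytes needed to reach 8 byte
         * source alignment; its low bits are now in cr7, so the bf tests
         * below copy 1, 2 and 4 bytes as required (lowest bit selects the
         * byte copy).
         */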
        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      sub     r5,r5,r6
        cmpldi  r5,128
        blt     5f

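        /*
         * At least 128 bytes remain: create a stack frame and save LR and
         * the non-volatile GPRs for the unrolled cacheline loop below.
         */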
        mflr    r0
        stdu    r1,-STACKFRAMESIZE(r1)
        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)
        std     r17,STK_REG(R17)(r1)
        std     r18,STK_REG(R18)(r1)
        std     r19,STK_REG(R19)(r1)
        std     r20,STK_REG(R20)(r1)
        std     r21,STK_REG(R21)(r1)
        std     r22,STK_REG(R22)(r1)
        std     r0,STACKFRAMESIZE+16(r1)

        srdi    r6,r5,7
        mtctr   r6

        /* Now do cacheline (128B) sized loads and stores. */
        .align  5
4:
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        ld      r14,64(r4)
        ld      r15,72(r4)
        ld      r16,80(r4)
        ld      r17,88(r4)
        ld      r18,96(r4)
        ld      r19,104(r4)
        ld      r20,112(r4)
        ld      r21,120(r4)
        addi    r4,r4,128
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        std     r14,64(r3)
        std     r15,72(r3)
        std     r16,80(r3)
        std     r17,88(r3)
        std     r18,96(r3)
        std     r19,104(r3)
        std     r20,112(r3)
        std     r21,120(r3)
        addi    r3,r3,128
        bdnz    4b

        clrldi  r5,r5,(64-7)

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)
        ld      r17,STK_REG(R17)(r1)
        ld      r18,STK_REG(R18)(r1)
        ld      r19,STK_REG(R19)(r1)
        ld      r20,STK_REG(R20)(r1)
        ld      r21,STK_REG(R21)(r1)
        ld      r22,STK_REG(R22)(r1)
        addi    r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi    r6,r5,4
        mtocrf  0x01,r6
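        /*
         * r6 is the remaining length (now < 128) divided by 16; its bits,
         * copied into cr7, select the 64, 32 and 16 byte copies below.
         */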

6:      bf      cr7*4+1,7f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        ld      r9,32(r4)
        ld      r10,40(r4)
        ld      r11,48(r4)
        ld      r12,56(r4)
        addi    r4,r4,64
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        std     r9,32(r3)
        std     r10,40(r3)
        std     r11,48(r3)
        std     r12,56(r3)
        addi    r3,r3,64

        /* Up to 63B to go */
7:      bf      cr7*4+2,8f
        ld      r0,0(r4)
        ld      r6,8(r4)
        ld      r7,16(r4)
        ld      r8,24(r4)
        addi    r4,r4,32
        std     r0,0(r3)
        std     r6,8(r3)
        std     r7,16(r3)
        std     r8,24(r3)
        addi    r3,r3,32

        /* Up to 31B to go */
8:      bf      cr7*4+3,9f
        ld      r0,0(r4)
        ld      r6,8(r4)
        addi    r4,r4,16
        std     r0,0(r3)
        std     r6,8(r3)
        addi    r3,r3,16

9:      clrldi  r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        blr

.Lunwind_stack_nonvmx_copy:
        addi    r1,r1,STACKFRAMESIZE
        b       .Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
        mflr    r0
        std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        std     r0,16(r1)
        stdu    r1,-STACKFRAMESIZE(r1)
        bl      enter_vmx_ops
        cmpwi   cr1,r3,0
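        /*
         * enter_vmx_ops returns 0 when VMX must not be used here (e.g. in
         * interrupt context); remember the result in cr1 and test it once
         * the prefetch streams below have been started.
         */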
        ld      r0,STACKFRAMESIZE+16(r1)
        ld      r3,STK_REG(R31)(r1)
        ld      r4,STK_REG(R30)(r1)
        ld      r5,STK_REG(R29)(r1)
        mtlr    r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
        clrrdi  r6,r4,7
        clrrdi  r9,r3,7
        ori     r9,r9,1         /* stream=1 */

        srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
        cmpldi  r7,0x3FF
        ble     1f
        li      r7,0x3FF
1:      lis     r0,0x0E00       /* depth=7 */
        sldi    r7,r7,7
        or      r7,r7,r0
        ori     r10,r7,1        /* stream=1 */

        lis     r8,0x8000       /* GO=1 */
        clrldi  r8,r8,32

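        /*
         * The dcbt/dcbtst forms below program hardware data streams:
         * TH=0b01000 supplies a stream's start address, TH=0b01010 its
         * length/depth attributes, and the final dcbt with GO set (after
         * the eieio) starts the nascent streams.
         */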
        dcbt    0,r6,0b01000
        dcbt    0,r7,0b01010
        dcbtst  0,r9,0b01000
        dcbtst  0,r10,0b01010
        eieio
        dcbt    0,r8,0b01010    /* GO */

        beq     cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
        xor     r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne     .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48
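        /*
         * r9/r10/r11 (and r12/r14/r15/r16 set up later) hold the constant
         * byte offsets used by the indexed lvx/stvx instructions below.
         */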

        bf      cr7*4+3,5f
        lvx     v1,0,r4
        addi    r4,r4,16
        stvx    v1,0,r3
        addi    r3,r3,16

5:      bf      cr7*4+2,6f
        lvx     v1,0,r4
        lvx     v0,r4,r9
        addi    r4,r4,32
        stvx    v1,0,r3
        stvx    v0,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     v3,0,r4
        lvx     v2,r4,r9
        lvx     v1,r4,r10
        lvx     v0,r4,r11
        addi    r4,r4,64
        stvx    v3,0,r3
        stvx    v2,r3,r9
        stvx    v1,r3,r10
        stvx    v0,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
        lvx     v7,0,r4
        lvx     v6,r4,r9
        lvx     v5,r4,r10
        lvx     v4,r4,r11
        lvx     v3,r4,r12
        lvx     v2,r4,r14
        lvx     v1,r4,r15
        lvx     v0,r4,r16
        addi    r4,r4,128
        stvx    v7,0,r3
        stvx    v6,r3,r9
        stvx    v5,r3,r10
        stvx    v4,r3,r11
        stvx    v3,r3,r12
        stvx    v2,r3,r14
        stvx    v1,r3,r15
        stvx    v0,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     v3,0,r4
        lvx     v2,r4,r9
        lvx     v1,r4,r10
        lvx     v0,r4,r11
        addi    r4,r4,64
        stvx    v3,0,r3
        stvx    v2,r3,r9
        stvx    v1,r3,r10
        stvx    v0,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     v1,0,r4
        lvx     v0,r4,r9
        addi    r4,r4,32
        stvx    v1,0,r3
        stvx    v0,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     v1,0,r4
        addi    r4,r4,16
        stvx    v1,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        ld      r0,0(r4)
        addi    r4,r4,8
        std     r0,0(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
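        /*
         * r3 was reloaded with the original destination so memcpy's return
         * value is correct; exit_vmx_ops returns straight to our caller.
         */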
        b       exit_vmx_ops            /* tail call optimise */

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg     r6,r3
        mtocrf  0x01,r6
        clrldi  r6,r6,(64-4)

        bf      cr7*4+3,1f
        lbz     r0,0(r4)
        addi    r4,r4,1
        stb     r0,0(r3)
        addi    r3,r3,1

1:      bf      cr7*4+2,2f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

2:      bf      cr7*4+1,3f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

3:      bf      cr7*4+0,4f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r7,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r7,4(r3)
        addi    r3,r3,8

4:      sub     r5,r5,r6

        /* Get the destination 128B aligned */
        neg     r6,r3
        srdi    r7,r6,4
        mtocrf  0x01,r7
        clrldi  r6,r6,(64-7)

        li      r9,16
        li      r10,32
        li      r11,48

        LVS(v16,0,r4)           /* Setup permute control vector */
        lvx     v0,0,r4
        addi    r4,r4,16
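        /*
         * v16 holds the permute control from LVS; v0 always carries the
         * most recently loaded aligned quadword, and each VPERM below
         * combines it with the next load to reconstruct the unaligned
         * source data. r4 therefore runs 16 bytes ahead of what has been
         * copied, which is unwound at label 11.
         */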

        bf      cr7*4+3,5f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
        stvx    v8,0,r3
        addi    r3,r3,16
        vor     v0,v1,v1

5:      bf      cr7*4+2,6f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
        stvx    v8,0,r3
        stvx    v9,r3,r9
        addi    r3,r3,32

6:      bf      cr7*4+1,7f
        lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
        lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
        lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
        lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        addi    r3,r3,64

7:      sub     r5,r5,r6
        srdi    r6,r5,7

        std     r14,STK_REG(R14)(r1)
        std     r15,STK_REG(R15)(r1)
        std     r16,STK_REG(R16)(r1)

        li      r12,64
        li      r14,80
        li      r15,96
        li      r16,112

        mtctr   r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align  5
8:
        lvx     v7,0,r4
        VPERM(v8,v0,v7,v16)
        lvx     v6,r4,r9
        VPERM(v9,v7,v6,v16)
        lvx     v5,r4,r10
        VPERM(v10,v6,v5,v16)
        lvx     v4,r4,r11
        VPERM(v11,v5,v4,v16)
        lvx     v3,r4,r12
        VPERM(v12,v4,v3,v16)
        lvx     v2,r4,r14
        VPERM(v13,v3,v2,v16)
        lvx     v1,r4,r15
        VPERM(v14,v2,v1,v16)
        lvx     v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi    r4,r4,128
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        stvx    v12,r3,r12
        stvx    v13,r3,r14
        stvx    v14,r3,r15
        stvx    v15,r3,r16
        addi    r3,r3,128
        bdnz    8b

        ld      r14,STK_REG(R14)(r1)
        ld      r15,STK_REG(R15)(r1)
        ld      r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi  r5,r5,(64-7)
        srdi    r6,r5,4
        mtocrf  0x01,r6

        bf      cr7*4+1,9f
        lvx     v3,0,r4
        VPERM(v8,v0,v3,v16)
        lvx     v2,r4,r9
        VPERM(v9,v3,v2,v16)
        lvx     v1,r4,r10
        VPERM(v10,v2,v1,v16)
        lvx     v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi    r4,r4,64
        stvx    v8,0,r3
        stvx    v9,r3,r9
        stvx    v10,r3,r10
        stvx    v11,r3,r11
        addi    r3,r3,64

9:      bf      cr7*4+2,10f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        lvx     v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi    r4,r4,32
        stvx    v8,0,r3
        stvx    v9,r3,r9
        addi    r3,r3,32

10:     bf      cr7*4+3,11f
        lvx     v1,0,r4
        VPERM(v8,v0,v1,v16)
        addi    r4,r4,16
        stvx    v8,0,r3
        addi    r3,r3,16

        /* Up to 15B to go */
11:     clrldi  r5,r5,(64-4)
        addi    r4,r4,-16       /* Unwind the +16 load offset */
        mtocrf  0x01,r5
        bf      cr7*4+0,12f
        lwz     r0,0(r4)        /* Less chance of a reject with word ops */
        lwz     r6,4(r4)
        addi    r4,r4,8
        stw     r0,0(r3)
        stw     r6,4(r3)
        addi    r3,r3,8

12:     bf      cr7*4+1,13f
        lwz     r0,0(r4)
        addi    r4,r4,4
        stw     r0,0(r3)
        addi    r3,r3,4

13:     bf      cr7*4+2,14f
        lhz     r0,0(r4)
        addi    r4,r4,2
        sth     r0,0(r3)
        addi    r3,r3,2

14:     bf      cr7*4+3,15f
        lbz     r0,0(r4)
        stb     r0,0(r3)

15:     addi    r1,r1,STACKFRAMESIZE
        ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        b       exit_vmx_ops            /* tail call optimise */
#endif /* CONFIG_ALTIVEC */