powerpc: add vcmpequd/vcmpequb ppc instruction macro
arch/powerpc/lib/memcpy_power7.S (linux-block.git)
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

_GLOBAL(memcpy_power7)

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
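/*
 * LVS/VPERM hide the endian difference in the unaligned VMX path:
 * big endian uses lvsl and passes the two source vectors to vperm in
 * natural order, little endian uses lvsr and swaps the vperm inputs so
 * the merged 16 bytes come out in the correct order either way.
 */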

#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)
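	/*
	 * mtocrf copied the low bits of -(src) into cr7, so each bf
	 * below tests one alignment bit: cr7*4+3 selects a 1 byte copy,
	 * +2 a 2 byte copy and +1 a 4 byte copy.  r6 keeps the 0-7 byte
	 * count so it can be subtracted from the length at 3:.
	 */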

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6
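	/*
	 * CTR holds the number of whole 128B cachelines to move; the
	 * remaining 0-127 bytes are handled after the loop at 5:.
	 */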

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6
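	/*
	 * cr7 now holds bits 6:4 of the remaining length, so cr7*4+1
	 * selects a 64B block, +2 a 32B block and +3 a 16B block; the
	 * final 0-15 bytes fall through to .Lshort_copy.
	 */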

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
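	/*
	 * cr7 holds the low four bits of the remaining length:
	 * cr7*4+0 selects an 8 byte copy, +1 4 bytes, +2 2 bytes and
	 * +3 the final byte.
	 */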
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_copy
	cmpwi	cr1,r3,0
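	/*
	 * enter_vmx_copy returns 0 when VMX can't be used (for example
	 * from interrupt context).  The result is kept in cr1 across
	 * the prefetch stream setup below and tested at the
	 * beq cr1,.Lunwind_stack_nonvmx_copy fallback.
	 */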
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32
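	/*
	 * r6/r9 hold the cacheline aligned source and destination
	 * addresses (stream IDs 0 and 1), r7/r10 the per-stream length
	 * and depth encoding.  The dcbt/dcbtst pairs below describe the
	 * two streams, eieio orders that setup, and the final dcbt with
	 * r8 = 0x80000000 (GO) starts them.
	 */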

	dcbt	0,r6,0b01000
	dcbt	0,r7,0b01010
	dcbtst	0,r9,0b01000
	dcbtst	0,r10,0b01010
	eieio
	dcbt	0,r8,0b01010	/* GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48
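	/*
	 * cr7 was loaded from (-dst) >> 4, so the bf tests below copy
	 * 16B, 32B and 64B chunks as needed to reach 128B alignment.
	 * r9/r10/r11 are 16/32/48 byte offsets for the indexed vector
	 * loads and stores.
	 */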

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112
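	/*
	 * Offsets 16,32,...,112 live in r9-r12 and r14-r16; together
	 * with offset 0 each iteration of the loop below moves a full
	 * 128B cacheline using eight lvx/stvx pairs.
	 */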

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

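	/*
	 * Pop the frame, reload the original destination pointer as the
	 * return value and tail call exit_vmx_copy to restore the
	 * previous vector state.
	 */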
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_copy		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4
	addi	r4,r4,16
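	/*
	 * v16 is the permute control derived from the source
	 * misalignment, and v0 always holds the previously loaded
	 * source vector.  Each step below loads the next aligned 16B
	 * and uses VPERM to merge the (previous, next) pair into one
	 * aligned quadword for the destination.  Because the source
	 * runs one vector ahead, r4 is advanced 16 bytes here and wound
	 * back at label 11.
	 */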

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */