/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi; avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	 wr	%g0, FPRS_FEF, %fprs;	\
	999:
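/* The annulled branch in FPU_ENTER executes the %fprs write in its
 * delay slot only when FPRS_FEF is not already set, so the costly
 * ASR write is skipped whenever the FPU is already enabled.
 */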

#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

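/* If neither fault handler wrapper is defined we are building the
 * plain in-kernel memcpy rather than a copy to/from userspace
 * variant.
 */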
#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
	wr		%g0, 0x80, %asi
#endif
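	/* Trap if any bits at or above bit 31 are set in the
	 * length; such a huge value is taken to be a caller bug.
	 */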
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%XCC, 5
	PREAMBLE
	mov		%o0, %o3
	brz,pn		%o2, .Lexit
	 cmp		%o2, 3
	ble,pn		%icc, .Ltiny
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall
	 or		%o0, %o1, %g2
	cmp		%o2, 128
	bl,pn		%icc, .Lmedium
	 nop

.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 51f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))

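	/* Issue strong read prefetches for the next 512 bytes of
	 * source before entering the copy loops.
	 */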
51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	/* Check whether we can use the straight fully aligned
	 * loop, or whether we need the alignaddr/faligndata variant.
	 */
	andcc		%o1, 0x7, %o5
	bne,pn		%icc, .Llarge_src_unaligned
	 sub		%g0, %o0, %g1

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and		%g1, 0x3f, %g1
	brz,pt		%g1, .Llarge_aligned
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
	add		%o1, 8, %o1
	subcc		%g1, 8, %g1
	add		%o0, 8, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g2, %o0 - 0x08))

.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2

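	/* Main loop: move 64 bytes per iteration, interleaving the
	 * loads with block-initializing stores so that a store to a
	 * fresh cache line does not have to fetch its old contents.
	 */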
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
	EX_ST(STORE_INIT(%g1, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
	EX_ST(STORE_INIT(%o5, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0))
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

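	/* STORE_INIT uses a weakly ordered ASI, so order the block
	 * stores against later loads and stores before continuing.
	 */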
	membar		#StoreLoad | #StoreStore

	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov		EX_RETVAL(%o3), %o0

.Llarge_src_unaligned:
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail)
#else
	VISEntryHalf
#endif
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2
	alignaddr	%o1, %g0, %g1
	add		%o1, %o4, %o1
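	/* %g1 is now the doubleword-aligned source pointer and
	 * alignaddr has set up the GSR alignment field; each
	 * faligndata below extracts one destination doubleword
	 * from a pair of adjacent aligned doublewords.
	 */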
	EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0))
1:	EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2))
	subcc		%o4, 0x40, %o4
	EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4))
	EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6))
	EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8))
	EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10))
	EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12))
	EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14))
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0))
	faligndata	%f2, %f4, %f18
	add		%g1, 0x40, %g1
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE(std, %f16, %o0 + 0x00))
	EX_ST_FP(STORE(std, %f18, %o0 + 0x08))
	EX_ST_FP(STORE(std, %f20, %o0 + 0x10))
	EX_ST_FP(STORE(std, %f22, %o0 + 0x18))
	EX_ST_FP(STORE(std, %f24, %o0 + 0x20))
	EX_ST_FP(STORE(std, %f26, %o0 + 0x28))
	EX_ST_FP(STORE(std, %f28, %o0 + 0x30))
	EX_ST_FP(STORE(std, %f30, %o0 + 0x38))
	add		%o0, 0x40, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_unaligned

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail:
	or		%o0, %o1, %g2
#endif
.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, .Lmedium_unaligned
	 nop
.Lmedium_noprefetch:
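	/* Copy 32 bytes per iteration while at least 32 remain,
	 * then finish with 8-byte, 4-byte, and byte-sized tails.
	 */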
	andncc		%o2, 0x20 - 1, %o5
	be,pn		%icc, 2f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	EX_ST(STORE(stx, %g1, %o0 + 0x00))
	EX_ST(STORE(stx, %g2, %o0 + 0x08))
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
	EX_ST(STORE(stx, %o4, %o0 + 0x18))
	bne,pt		%icc, 1b
	 add		%o0, 0x20, %o0
2:	andcc		%o2, 0x18, %o5
	be,pt		%icc, 3f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:	brz,pt		%o2, .Lexit
	 cmp		%o2, 0x04
	bl,pn		%icc, .Ltiny
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%icc, .Ltiny
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	ba,a,pt		%icc, .Lexit
.Lmedium_unaligned:
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 2f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
2:
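	/* Source is not 8-byte aligned: read aligned doublewords
	 * and reassemble each output word from two neighbors with
	 * shift and OR.  %g1 is the left shift count in bits, %g2
	 * the complementary right shift (64 - %g1).
	 */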
	and		%o1, 0x7, %g1
	brz,pn		%g1, .Lmedium_noprefetch
	 sll		%g1, 3, %g1
	mov		64, %g2
	sub		%g2, %g1, %g2
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
	sllx		%o4, %g1, %o4
	andn		%o2, 0x08 - 1, %o5
	sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
	add		%o1, 0x08, %o1
	subcc		%o5, 0x08, %o5
	srlx		%g3, %g2, GLOBAL_SPARE
	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 sllx		%g3, %g1, %o4
	srl		%g1, 3, %g1
	add		%o1, %g1, %o1
	brz,pn		%o2, .Lexit
	 nop
	ba,pt		%icc, .Lsmall_unaligned

.Ltiny:
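	/* 1 to 3 bytes, fully unrolled byte copy. */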
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
	ba,pt		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x02))

.Lsmall:
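	/* 4 to 19 bytes: copy whole words when src and dest share
	 * 4-byte alignment, otherwise fall to the byte loop.
	 */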
	andcc		%g2, 0x3, %g0
	bne,pn		%icc, .Lsmall_unaligned
	 andn		%o2, 0x4 - 1, %o5
	sub		%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
	add		%o1, 0x04, %o1
	subcc		%o5, 0x04, %o5
	add		%o0, 0x04, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
	brz,pt		%o2, .Lexit
	 nop
	ba,a,pt		%icc, .Ltiny

.Lsmall_unaligned:
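	/* Byte-at-a-time copy for short or mutually misaligned
	 * buffers.
	 */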
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
	add		%o1, 1, %o1
	add		%o0, 1, %o0
	subcc		%o2, 1, %o2
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
	ba,a,pt		%icc, .Lexit
	.size		FUNC_NAME, .-FUNC_NAME