Commit | Line | Data |
---|---|---|
ae2c6ca6 DM |
1 | /* NG4memcpy.S: Niagara-4 optimized memcpy. |
2 | * | |
3 | * Copyright (C) 2012 David S. Miller (davem@davemloft.net) | |
4 | */ | |
5 | ||
6 | #ifdef __KERNEL__ | |
7 | #include <asm/visasm.h> | |
8 | #include <asm/asi.h> | |
9 | #define GLOBAL_SPARE %g7 | |
10 | #else | |
11 | #define ASI_BLK_INIT_QUAD_LDD_P 0xe2 | |
12 | #define FPRS_FEF 0x04 | |
13 | ||
14 | /* On T4 it is very expensive to access ASRs like %fprs and | |
15 | * %asi, avoiding a read or a write can save ~50 cycles. | |
16 | */ | |
17 | #define FPU_ENTER \ | |
18 | rd %fprs, %o5; /* %o5 = current %fprs; VISExitHalf later reads %o5 */ \ | |
19 | andcc %o5, FPRS_FEF, %g0; /* Z set when the FEF bit is currently clear */ \ | |
20 | be,a,pn %icc, 999f; /* annulled: the delay slot runs only when the branch is taken */ \ | |
21 | wr %g0, FPRS_FEF, %fprs; /* so %fprs is written only if FEF was clear */ \ | |
22 | 999: | |
23 | ||
24 | #ifdef MEMCPY_DEBUG | |
25 | #define VISEntryHalf FPU_ENTER; \ | |
26 | clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; | |
27 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | |
28 | #else | |
29 | #define VISEntryHalf FPU_ENTER | |
30 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | |
31 | #endif | |
32 | ||
33 | #define GLOBAL_SPARE %g5 | |
34 | #endif | |
35 | ||
36 | #ifndef STORE_ASI | |
37 | #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA | |
38 | #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P | |
39 | #else | |
40 | #define STORE_ASI 0x80 /* ASI_P */ | |
41 | #endif | |
42 | #endif | |
43 | ||
f4da3628 DM |
44 | #if !defined(EX_LD) && !defined(EX_ST) |
45 | #define NON_USER_COPY | |
46 | #endif | |
47 | ||
ae2c6ca6 DM |
48 | #ifndef EX_LD |
49 | #define EX_LD(x) x | |
50 | #endif | |
a7c5724b RG |
51 | #ifndef EX_LD_FP |
52 | #define EX_LD_FP(x) x | |
53 | #endif | |
ae2c6ca6 DM |
54 | |
55 | #ifndef EX_ST | |
56 | #define EX_ST(x) x | |
57 | #endif | |
a7c5724b RG |
58 | #ifndef EX_ST_FP |
59 | #define EX_ST_FP(x) x | |
60 | #endif | |
ae2c6ca6 DM |
61 | |
62 | #ifndef EX_RETVAL | |
63 | #define EX_RETVAL(x) x | |
64 | #endif | |
65 | ||
66 | #ifndef LOAD | |
67 | #define LOAD(type,addr,dest) type [addr], dest | |
68 | #endif | |
69 | ||
70 | #ifndef STORE | |
71 | #ifndef MEMCPY_DEBUG | |
72 | #define STORE(type,src,addr) type src, [addr] | |
73 | #else | |
74 | #define STORE(type,src,addr) type##a src, [addr] %asi | |
75 | #endif | |
76 | #endif | |
77 | ||
78 | #ifndef STORE_INIT | |
79 | #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI | |
80 | #endif | |
81 | ||
82 | #ifndef FUNC_NAME | |
83 | #define FUNC_NAME NG4memcpy | |
84 | #endif | |
85 | #ifndef PREAMBLE | |
86 | #define PREAMBLE | |
87 | #endif | |
88 | ||
89 | #ifndef XCC | |
90 | #define XCC xcc | |
91 | #endif | |
92 | ||
93 | .register %g2,#scratch | |
94 | .register %g3,#scratch | |
95 | ||
96 | .text | |
97 | .align 64 | |
98 | ||
99 | .globl FUNC_NAME | |
100 | .type FUNC_NAME,#function | |
101 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len; every path leaves through .Lexit, which returns EX_RETVAL(%o3) */ | |
102 | #ifdef MEMCPY_DEBUG | |
103 | wr %g0, 0x80, %asi /* the debug STORE() emits "%asi" stores; 0x80 is the value this file labels ASI_P */ | |
104 | #endif | |
105 | srlx %o2, 31, %g2 /* %g2 = len >> 31, non-zero iff len >= 2^31 */ | |
106 | cmp %g2, 0 | |
107 | tne %XCC, 5 /* raise software trap 5 for such a length; the %icc tests below rely on len < 2^31 */ | |
108 | PREAMBLE | |
109 | mov %o0, %o3 /* save the original dst; %o0 is advanced while copying */ | |
110 | brz,pn %o2, .Lexit /* len == 0: nothing to copy */ | |
111 | cmp %o2, 3 /* delay slot */ | |
112 | ble,pn %icc, .Ltiny /* 1 <= len <= 3 */ | |
113 | cmp %o2, 19 /* delay slot */ | |
114 | ble,pn %icc, .Lsmall /* 4 <= len <= 19 */ | |
115 | or %o0, %o1, %g2 /* delay slot: %g2 = dst OR src; its low bits are tested for alignment in .Lsmall and .Lmedium */ | |
116 | cmp %o2, 128 | |
117 | bl,pn %icc, .Lmedium /* 20 <= len < 128; otherwise fall through into .Llarge */ | |
118 | nop | |
119 | ||
120 | .Llarge:/* len >= 0x80 */ | |
121 | /* First get dest 8 byte aligned, one byte at a time. */ | |
122 | sub %g0, %o0, %g1 | |
123 | and %g1, 0x7, %g1 /* %g1 = (-dst) & 7 = bytes needed to reach 8-byte alignment */ | |
124 | brz,pt %g1, 51f /* already aligned: skip the byte loop */ | |
125 | sub %o2, %g1, %o2 /* delay slot: len -= %g1 (no change when %g1 == 0) */ | |
42a4172b | 126 |
ae2c6ca6 DM |
127 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) |
128 | add %o1, 1, %o1 | |
129 | subcc %g1, 1, %g1 | |
130 | add %o0, 1, %o0 | |
131 | bne,pt %icc, 1b | |
132 | EX_ST(STORE(stb, %g2, %o0 - 0x01)) /* delay slot: dst was already advanced, hence the -1 */ | |
133 | ||
134 | 51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong) /* prefetch src+0x40 .. src+0x200 in 0x40 steps */ | |
135 | LOAD(prefetch, %o1 + 0x080, #n_reads_strong) | |
136 | LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong) | |
137 | LOAD(prefetch, %o1 + 0x100, #n_reads_strong) | |
138 | LOAD(prefetch, %o1 + 0x140, #n_reads_strong) | |
139 | LOAD(prefetch, %o1 + 0x180, #n_reads_strong) | |
140 | LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong) | |
141 | LOAD(prefetch, %o1 + 0x200, #n_reads_strong) | |
142 | ||
143 | /* Check if we can use the straight fully aligned | |
144 | * loop, or we require the alignaddr/faligndata variant. | |
145 | */ | |
146 | andcc %o1, 0x7, %o5 /* non-zero when src is not 8-byte aligned */ | |
147 | bne,pn %icc, .Llarge_src_unaligned | |
148 | sub %g0, %o0, %g1 /* delay slot: %g1 = -dst, input to the 64-byte alignment computation below */ | |
149 | ||
150 | /* Legitimize the use of initializing stores by getting dest | |
151 | * to be 64-byte aligned. | |
152 | */ | |
153 | and %g1, 0x3f, %g1 /* bytes to the next 64-byte boundary; a multiple of 8 because dst is 8-byte aligned here */ | |
154 | brz,pt %g1, .Llarge_aligned | |
155 | sub %o2, %g1, %o2 /* delay slot: len -= %g1 */ | |
42a4172b | 156 |
ae2c6ca6 DM |
157 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2)) |
158 | add %o1, 8, %o1 | |
159 | subcc %g1, 8, %g1 | |
160 | add %o0, 8, %o0 | |
161 | bne,pt %icc, 1b | |
162 | EX_ST(STORE(stx, %g2, %o0 - 0x08)) /* delay slot: store to the pre-increment dst */ | |
163 | ||
164 | .Llarge_aligned: | |
165 | /* src 8-byte aligned, dest 64-byte aligned, and more than 0x40 bytes remain (len was >= 0x80 at .Llarge and at most 7 + 56 bytes were consumed aligning dest), so the loop below runs at least once */ | |
166 | andn %o2, 0x3f, %o4 /* %o4 = bytes to copy as whole 64-byte chunks */ | |
167 | sub %o2, %o4, %o2 /* %o2 = tail left for after the loop (< 64) */ | |
168 | ||
169 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) | |
170 | add %o1, 0x40, %o1 /* src advanced early; the remaining loads use negative offsets */ | |
171 | EX_LD(LOAD(ldx, %o1 - 0x38, %g2)) | |
172 | subcc %o4, 0x40, %o4 /* sets the loop condition; nothing before the bne below changes the condition codes */ | |
173 | EX_LD(LOAD(ldx, %o1 - 0x30, %g3)) | |
174 | EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE)) | |
175 | EX_LD(LOAD(ldx, %o1 - 0x20, %o5)) | |
176 | EX_ST(STORE_INIT(%g1, %o0)) /* STORE_INIT takes no offset, so dst is bumped by 8 after every store */ | |
177 | add %o0, 0x08, %o0 | |
178 | EX_ST(STORE_INIT(%g2, %o0)) | |
179 | add %o0, 0x08, %o0 | |
180 | EX_LD(LOAD(ldx, %o1 - 0x18, %g2)) /* %g2, %g3 and GLOBAL_SPARE are reloaded as soon as their first value is stored */ | |
181 | EX_ST(STORE_INIT(%g3, %o0)) | |
182 | add %o0, 0x08, %o0 | |
183 | EX_LD(LOAD(ldx, %o1 - 0x10, %g3)) | |
184 | EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) | |
185 | add %o0, 0x08, %o0 | |
186 | EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE)) | |
187 | EX_ST(STORE_INIT(%o5, %o0)) | |
188 | add %o0, 0x08, %o0 | |
189 | EX_ST(STORE_INIT(%g2, %o0)) | |
190 | add %o0, 0x08, %o0 | |
191 | EX_ST(STORE_INIT(%g3, %o0)) | |
192 | add %o0, 0x08, %o0 | |
193 | EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) | |
194 | add %o0, 0x08, %o0 | |
195 | bne,pt %icc, 1b | |
196 | LOAD(prefetch, %o1 + 0x200, #n_reads_strong) /* delay slot: keep prefetching 0x200 ahead of src */ | |
197 | ||
198 | membar #StoreLoad | #StoreStore /* order the STORE_INIT stores above before the loads and stores that follow */ | |
199 | ||
200 | brz,pn %o2, .Lexit /* no tail */ | |
201 | cmp %o2, 19 /* delay slot */ | |
202 | ble,pn %icc, .Lsmall_unaligned /* short tail: plain byte loop */ | |
203 | nop | |
204 | ba,a,pt %icc, .Lmedium_noprefetch /* tail of 20..63 bytes; src and dst are both 8-byte aligned here */ | |
205 | ||
206 | .Lexit: retl | |
207 | mov EX_RETVAL(%o3), %o0 /* delay slot: return value built from the dst saved at entry */ | |
208 | ||
209 | .Llarge_src_unaligned: /* dst 8-byte aligned, src not: realign 64-byte chunks through the FP registers */ | |
f4da3628 DM |
210 | #ifdef NON_USER_COPY |
211 | VISEntryHalfFast(.Lmedium_vis_entry_fail) /* NOTE(review): macro defined outside this file; presumably branches to the label when the FPU cannot be used - confirm */ | |
212 | #else | |
213 | VISEntryHalf | |
214 | #endif | |
ae2c6ca6 DM |
215 | andn %o2, 0x3f, %o4 /* %o4 = bytes to copy as whole 64-byte chunks */ |
216 | sub %o2, %o4, %o2 /* %o2 = tail (< 64) */ | |
ae2c6ca6 DM |
217 | alignaddr %o1, %g0, %g1 /* %g1 = src rounded down to 8 bytes; the byte offset is latched for faligndata */ |
218 | add %o1, %o4, %o1 /* %o1 now points at the tail's source; the loop walks %g1 instead */ | |
a7c5724b RG |
219 | EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0)) /* prime %f0 with the first aligned doubleword */ |
220 | 1: EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2)) | |
ae2c6ca6 | 221 | subcc %o4, 0x40, %o4 /* sets the loop condition; nothing before the bne below changes the condition codes */ |
a7c5724b RG |
222 | EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4)) |
223 | EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6)) | |
224 | EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8)) | |
225 | EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10)) | |
226 | EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12)) | |
227 | EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14)) | |
ae2c6ca6 | 228 | faligndata %f0, %f2, %f16 /* consume the old %f0 before it is reloaded on the next line */ |
a7c5724b | 229 | EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0)) /* first doubleword of the next chunk; also needed for %f30 below */ |
ae2c6ca6 DM |
230 | faligndata %f2, %f4, %f18 |
231 | add %g1, 0x40, %g1 | |
232 | faligndata %f4, %f6, %f20 | |
233 | faligndata %f6, %f8, %f22 | |
234 | faligndata %f8, %f10, %f24 | |
235 | faligndata %f10, %f12, %f26 | |
236 | faligndata %f12, %f14, %f28 | |
237 | faligndata %f14, %f0, %f30 | |
a7c5724b RG |
238 | EX_ST_FP(STORE(std, %f16, %o0 + 0x00)) |
239 | EX_ST_FP(STORE(std, %f18, %o0 + 0x08)) | |
240 | EX_ST_FP(STORE(std, %f20, %o0 + 0x10)) | |
241 | EX_ST_FP(STORE(std, %f22, %o0 + 0x18)) | |
242 | EX_ST_FP(STORE(std, %f24, %o0 + 0x20)) | |
243 | EX_ST_FP(STORE(std, %f26, %o0 + 0x28)) | |
244 | EX_ST_FP(STORE(std, %f28, %o0 + 0x30)) | |
245 | EX_ST_FP(STORE(std, %f30, %o0 + 0x38)) | |
ae2c6ca6 DM |
246 | add %o0, 0x40, %o0 |
247 | bne,pt %icc, 1b | |
248 | LOAD(prefetch, %g1 + 0x200, #n_reads_strong) /* delay slot: keep prefetching 0x200 ahead */ | |
44922150 DM |
249 | #ifdef NON_USER_COPY |
250 | VISExitHalfFast | |
251 | #else | |
ae2c6ca6 | 252 | VISExitHalf |
44922150 | 253 | #endif |
ae2c6ca6 DM |
254 | brz,pn %o2, .Lexit /* no tail */ |
255 | cmp %o2, 19 /* delay slot */ | |
256 | ble,pn %icc, .Lsmall_unaligned /* short tail: plain byte loop */ | |
257 | nop | |
258 | ba,a,pt %icc, .Lmedium_unaligned /* tail of 20..63 bytes with src still misaligned */ | |
259 | ||
f4da3628 DM |
260 | #ifdef NON_USER_COPY |
261 | .Lmedium_vis_entry_fail: /* label handed to VISEntryHalfFast in .Llarge_src_unaligned */ | |
262 | or %o0, %o1, %g2 /* rebuild %g2 = dst OR src; the .Llarge byte-alignment loop uses %g2 as a data register */ | |
263 | #endif | |
ae2c6ca6 DM |
264 | .Lmedium: |
265 | LOAD(prefetch, %o1 + 0x40, #n_reads_strong) | |
266 | andcc %g2, 0x7, %g0 /* are src and dst both 8-byte aligned? */ | |
267 | bne,pn %icc, .Lmedium_unaligned | |
268 | nop | |
269 | .Lmedium_noprefetch: /* src and dst 8-byte aligned: 32-byte chunks, then 8-byte words, then a sub-8-byte tail */ | |
270 | andncc %o2, 0x20 - 1, %o5 /* %o5 = bytes to copy as whole 32-byte chunks */ | |
271 | be,pn %icc, 2f /* fewer than 32 bytes */ | |
272 | sub %o2, %o5, %o2 /* delay slot: %o2 = remainder (< 32) */ | |
273 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) | |
274 | EX_LD(LOAD(ldx, %o1 + 0x08, %g2)) | |
275 | EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE)) | |
276 | EX_LD(LOAD(ldx, %o1 + 0x18, %o4)) | |
277 | add %o1, 0x20, %o1 | |
278 | subcc %o5, 0x20, %o5 /* loop condition; the stores below leave the condition codes alone */ | |
279 | EX_ST(STORE(stx, %g1, %o0 + 0x00)) | |
280 | EX_ST(STORE(stx, %g2, %o0 + 0x08)) | |
281 | EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10)) | |
282 | EX_ST(STORE(stx, %o4, %o0 + 0x18)) | |
283 | bne,pt %icc, 1b | |
284 | add %o0, 0x20, %o0 /* delay slot */ | |
285 | 2: andcc %o2, 0x18, %o5 /* %o5 = bytes to copy as whole 8-byte words (0, 8, 16 or 24) */ | |
286 | be,pt %icc, 3f | |
287 | sub %o2, %o5, %o2 /* delay slot: %o2 = remainder (< 8) */ | |
288 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) | |
289 | add %o1, 0x08, %o1 | |
290 | add %o0, 0x08, %o0 | |
291 | subcc %o5, 0x08, %o5 | |
292 | bne,pt %icc, 1b | |
293 | EX_ST(STORE(stx, %g1, %o0 - 0x08)) /* delay slot: store to the pre-increment dst */ | |
294 | 3: brz,pt %o2, .Lexit | |
295 | cmp %o2, 0x04 /* delay slot */ | |
296 | bl,pn %icc, .Ltiny /* 1..3 bytes left */ | |
297 | nop | |
298 | EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) /* 4..7 bytes left: copy one 32-bit word first */ | |
299 | add %o1, 0x04, %o1 | |
300 | add %o0, 0x04, %o0 | |
301 | subcc %o2, 0x04, %o2 | |
302 | bne,pn %icc, .Ltiny /* 1..3 bytes still left */ | |
303 | EX_ST(STORE(stw, %g1, %o0 - 0x04)) /* delay slot, not annulled: the word is stored whether or not the branch is taken */ | |
304 | ba,a,pt %icc, .Lexit | |
305 | .Lmedium_unaligned: /* at least 20 bytes; src and dst are not both 8-byte aligned */ | |
306 | /* First get dest 8 byte aligned, one byte at a time. */ | |
307 | sub %g0, %o0, %g1 | |
308 | and %g1, 0x7, %g1 /* %g1 = (-dst) & 7 = bytes needed to reach 8-byte alignment */ | |
309 | brz,pt %g1, 2f | |
310 | sub %o2, %g1, %o2 /* delay slot: len -= %g1 */ | |
42a4172b | 311 |
ae2c6ca6 DM |
312 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) |
313 | add %o1, 1, %o1 | |
314 | subcc %g1, 1, %g1 | |
315 | add %o0, 1, %o0 | |
316 | bne,pt %icc, 1b | |
317 | EX_ST(STORE(stb, %g2, %o0 - 0x01)) /* delay slot: store to the pre-increment dst */ | |
318 | 2: | |
319 | and %o1, 0x7, %g1 /* %g1 = src misalignment in bytes */ | |
320 | brz,pn %g1, .Lmedium_noprefetch /* src turned out aligned as well: use the aligned copy */ | |
321 | sll %g1, 3, %g1 /* delay slot: %g1 = misalignment in bits (left-shift count) */ | |
322 | mov 64, %g2 | |
323 | sub %g2, %g1, %g2 /* %g2 = 64 - %g1 (right-shift count) */ | |
324 | andn %o1, 0x7, %o1 /* round src down to an 8-byte boundary */ | |
325 | EX_LD(LOAD(ldx, %o1 + 0x00, %o4)) | |
326 | sllx %o4, %g1, %o4 /* %o4 = bytes of the first word that belong to the copy, moved to the top */ | |
327 | andn %o2, 0x08 - 1, %o5 /* %o5 = bytes to copy as whole 8-byte words */ | |
328 | sub %o2, %o5, %o2 /* %o2 = tail (< 8) */ | |
329 | 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3)) /* next aligned source word */ | |
330 | add %o1, 0x08, %o1 | |
331 | subcc %o5, 0x08, %o5 /* loop condition; no later instruction in the loop changes the condition codes */ | |
332 | srlx %g3, %g2, GLOBAL_SPARE /* leading bytes of the new word, moved to the bottom */ | |
333 | or GLOBAL_SPARE, %o4, GLOBAL_SPARE /* merge with the carried-over bytes of the previous word */ | |
334 | EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00)) | |
335 | add %o0, 0x08, %o0 | |
336 | bne,pt %icc, 1b | |
337 | sllx %g3, %g1, %o4 /* delay slot: carry the rest of this word into the next iteration */ | |
338 | srl %g1, 3, %g1 /* back from bits to bytes */ | |
339 | add %o1, %g1, %o1 /* restore the true, misaligned src pointer for the tail */ | |
340 | brz,pn %o2, .Lexit | |
341 | nop | |
342 | ba,a,pt %icc, .Lsmall_unaligned /* annulled so that the first instruction of the following .Ltiny is not executed as this branch's delay slot */ | |
343 | ||
344 | .Ltiny: /* copy the last 1..3 bytes; %o0 and %o1 are not advanced, fixed offsets are used instead */ | |
345 | EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) | |
346 | subcc %o2, 1, %o2 | |
347 | be,pn %icc, .Lexit /* that was the only byte */ | |
348 | EX_ST(STORE(stb, %g1, %o0 + 0x00)) /* delay slot, not annulled: the byte is stored either way */ | |
349 | EX_LD(LOAD(ldub, %o1 + 0x01, %g1)) | |
350 | subcc %o2, 1, %o2 | |
351 | be,pn %icc, .Lexit | |
352 | EX_ST(STORE(stb, %g1, %o0 + 0x01)) /* delay slot */ | |
353 | EX_LD(LOAD(ldub, %o1 + 0x02, %g1)) | |
354 | ba,pt %icc, .Lexit | |
355 | EX_ST(STORE(stb, %g1, %o0 + 0x02)) /* delay slot: third and last byte */ | |
356 | ||
357 | .Lsmall: /* 4 <= len <= 19; %g2 = dst OR src from the entry code */ | |
358 | andcc %g2, 0x3, %g0 /* are src and dst both 4-byte aligned? */ | |
359 | bne,pn %icc, .Lsmall_unaligned /* no: plain byte loop */ | |
360 | andn %o2, 0x4 - 1, %o5 /* delay slot: %o5 = bytes to copy as whole 32-bit words */ | |
361 | sub %o2, %o5, %o2 /* %o2 = tail (0..3) */ | |
362 | 1: | |
363 | EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) | |
364 | add %o1, 0x04, %o1 | |
365 | subcc %o5, 0x04, %o5 | |
366 | add %o0, 0x04, %o0 | |
367 | bne,pt %icc, 1b | |
368 | EX_ST(STORE(stw, %g1, %o0 - 0x04)) /* delay slot: store to the pre-increment dst */ | |
369 | brz,pt %o2, .Lexit | |
370 | nop | |
371 | ba,a,pt %icc, .Ltiny /* 1..3 bytes left */ | |
372 | ||
373 | .Lsmall_unaligned: /* byte-at-a-time copy of %o2 bytes; %o2 must be non-zero on entry because the count is tested only after the decrement */ | |
374 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) | |
375 | add %o1, 1, %o1 | |
376 | add %o0, 1, %o0 | |
377 | subcc %o2, 1, %o2 | |
378 | bne,pt %icc, 1b | |
379 | EX_ST(STORE(stb, %g1, %o0 - 0x01)) /* delay slot: store to the pre-increment dst */ | |
380 | ba,a,pt %icc, .Lexit | |
381 | .size FUNC_NAME, .-FUNC_NAME