/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE    %g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER                               \
        rd      %fprs, %o5;                     \
        andcc   %o5, FPRS_FEF, %g0;             \
        be,a,pn %icc, 999f;                     \
         wr     %g0, FPRS_FEF, %fprs;           \
        999:

#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
                     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE    %g5
#endif

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI       ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI       0x80            /* ASI_P */
#endif
#endif

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x)        x
#endif

#ifndef EX_ST
#define EX_ST(x)        x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)    x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)    type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)    type src, [addr]
#else
#define STORE(type,src,addr)    type##a src, [addr] %asi
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)    stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME       NG4memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

        .register       %g2,#scratch
        .register       %g3,#scratch

        .text
        .align          64

        .globl  FUNC_NAME
        .type   FUNC_NAME,#function
FUNC_NAME:      /* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
        wr              %g0, 0x80, %asi
#endif
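        /* Sanity check: trap (tne with software trap number 5) if the
         * length does not fit in 31 bits, i.e. any of bits 63:31 of %o2
         * are set.
         */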
        srlx            %o2, 31, %g2
        cmp             %g2, 0
        tne             %XCC, 5
        PREAMBLE
        mov             %o0, %o3
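        /* %o3 keeps the original dst for the return value.  Dispatch on
         * length: 0 -> done, 1-3 -> .Ltiny, 4-19 -> .Lsmall, 20-127 ->
         * .Lmedium, 128 and up -> the large-copy path below.
         */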
        brz,pn          %o2, .Lexit
        cmp             %o2, 3
        ble,pn          %icc, .Ltiny
        cmp             %o2, 19
        ble,pn          %icc, .Lsmall
        or              %o0, %o1, %g2
        cmp             %o2, 128
        bl,pn           %icc, .Lmedium
        nop

.Llarge:/* len >= 0x80 */
        /* First get dest 8 byte aligned. */
        sub             %g0, %o0, %g1
        and             %g1, 0x7, %g1
        brz,pt          %g1, 51f
        sub             %o2, %g1, %o2

1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
        add             %o1, 1, %o1
        subcc           %g1, 1, %g1
        add             %o0, 1, %o0
        bne,pt          %icc, 1b
        EX_ST(STORE(stb, %g2, %o0 - 0x01))

51:     LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
        LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

        /* Check if we can use the straight fully aligned
         * loop, or we require the alignaddr/faligndata variant.
         */
        andcc           %o1, 0x7, %o5
        bne,pn          %icc, .Llarge_src_unaligned
        sub             %g0, %o0, %g1

        /* Legitimize the use of initializing stores by getting dest
         * to be 64-byte aligned.
         */
        and             %g1, 0x3f, %g1
        brz,pt          %g1, .Llarge_aligned
        sub             %o2, %g1, %o2

1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
        add             %o1, 8, %o1
        subcc           %g1, 8, %g1
        add             %o0, 8, %o0
        bne,pt          %icc, 1b
        EX_ST(STORE(stx, %g2, %o0 - 0x08))

.Llarge_aligned:
        /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
        andn            %o2, 0x3f, %o4
        sub             %o2, %o4, %o2

1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
        add             %o1, 0x40, %o1
        EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
        subcc           %o4, 0x40, %o4
        EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
        EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
        EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
        EX_ST(STORE_INIT(%g1, %o0))
        add             %o0, 0x08, %o0
        EX_ST(STORE_INIT(%g2, %o0))
        add             %o0, 0x08, %o0
        EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
        EX_ST(STORE_INIT(%g3, %o0))
        add             %o0, 0x08, %o0
        EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
        EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
        add             %o0, 0x08, %o0
        EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
        EX_ST(STORE_INIT(%o5, %o0))
        add             %o0, 0x08, %o0
        EX_ST(STORE_INIT(%g2, %o0))
        add             %o0, 0x08, %o0
        EX_ST(STORE_INIT(%g3, %o0))
        add             %o0, 0x08, %o0
        EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
        add             %o0, 0x08, %o0
        bne,pt          %icc, 1b
        LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

        membar          #StoreLoad | #StoreStore

        brz,pn          %o2, .Lexit
        cmp             %o2, 19
        ble,pn          %icc, .Lsmall_unaligned
        nop
        ba,a,pt         %icc, .Lmedium_noprefetch

.Lexit: retl
        mov             EX_RETVAL(%o3), %o0

.Llarge_src_unaligned:
#ifdef NON_USER_COPY
        VISEntryHalfFast(.Lmedium_vis_entry_fail)
#else
        VISEntryHalf
#endif
        andn            %o2, 0x3f, %o4
        sub             %o2, %o4, %o2
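        /* alignaddr rounds the source pointer down to an 8-byte boundary
         * in %g1 and records the low three bits in %gsr; each faligndata
         * below then extracts one destination-aligned 8-byte word from a
         * pair of adjacent source doublewords held in FP registers.
         */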
        alignaddr       %o1, %g0, %g1
        add             %o1, %o4, %o1
        EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
1:      EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
        subcc           %o4, 0x40, %o4
        EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
        EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
        EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
        EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
        EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
        EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
        faligndata      %f0, %f2, %f16
        EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
        faligndata      %f2, %f4, %f18
        add             %g1, 0x40, %g1
        faligndata      %f4, %f6, %f20
        faligndata      %f6, %f8, %f22
        faligndata      %f8, %f10, %f24
        faligndata      %f10, %f12, %f26
        faligndata      %f12, %f14, %f28
        faligndata      %f14, %f0, %f30
        EX_ST(STORE(std, %f16, %o0 + 0x00))
        EX_ST(STORE(std, %f18, %o0 + 0x08))
        EX_ST(STORE(std, %f20, %o0 + 0x10))
        EX_ST(STORE(std, %f22, %o0 + 0x18))
        EX_ST(STORE(std, %f24, %o0 + 0x20))
        EX_ST(STORE(std, %f26, %o0 + 0x28))
        EX_ST(STORE(std, %f28, %o0 + 0x30))
        EX_ST(STORE(std, %f30, %o0 + 0x38))
        add             %o0, 0x40, %o0
        bne,pt          %icc, 1b
        LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
#ifdef NON_USER_COPY
        VISExitHalfFast
#else
        VISExitHalf
#endif
        brz,pn          %o2, .Lexit
        cmp             %o2, 19
        ble,pn          %icc, .Lsmall_unaligned
        nop
        ba,a,pt         %icc, .Lmedium_unaligned

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail:
        or              %o0, %o1, %g2
#endif
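        /* 20-127 byte copies, also reached with the leftovers of the
         * large-copy paths.  With src and dst mutually 8-byte aligned,
         * move 32 bytes per iteration, then 8-byte words, then at most
         * one 4-byte word, leaving any remainder to .Ltiny.
         */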
.Lmedium:
        LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
        andcc           %g2, 0x7, %g0
        bne,pn          %icc, .Lmedium_unaligned
        nop
.Lmedium_noprefetch:
        andncc          %o2, 0x20 - 1, %o5
        be,pn           %icc, 2f
        sub             %o2, %o5, %o2
1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
        EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
        EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
        EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
        add             %o1, 0x20, %o1
        subcc           %o5, 0x20, %o5
        EX_ST(STORE(stx, %g1, %o0 + 0x00))
        EX_ST(STORE(stx, %g2, %o0 + 0x08))
        EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
        EX_ST(STORE(stx, %o4, %o0 + 0x18))
        bne,pt          %icc, 1b
        add             %o0, 0x20, %o0
2:      andcc           %o2, 0x18, %o5
        be,pt           %icc, 3f
        sub             %o2, %o5, %o2
1:      EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
        add             %o1, 0x08, %o1
        add             %o0, 0x08, %o0
        subcc           %o5, 0x08, %o5
        bne,pt          %icc, 1b
        EX_ST(STORE(stx, %g1, %o0 - 0x08))
3:      brz,pt          %o2, .Lexit
        cmp             %o2, 0x04
        bl,pn           %icc, .Ltiny
        nop
        EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
        add             %o1, 0x04, %o1
        add             %o0, 0x04, %o0
        subcc           %o2, 0x04, %o2
        bne,pn          %icc, .Ltiny
        EX_ST(STORE(stw, %g1, %o0 - 0x04))
        ba,a,pt         %icc, .Lexit
.Lmedium_unaligned:
        /* First get dest 8 byte aligned. */
        sub             %g0, %o0, %g1
        and             %g1, 0x7, %g1
        brz,pt          %g1, 2f
        sub             %o2, %g1, %o2

1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
        add             %o1, 1, %o1
        subcc           %g1, 1, %g1
        add             %o0, 1, %o0
        bne,pt          %icc, 1b
        EX_ST(STORE(stb, %g2, %o0 - 0x01))
2:
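        /* Source is still misaligned: read 8-byte-aligned words from src
         * and merge each adjacent pair with sllx/srlx (shift counts %g1
         * and %g2 = 64 - %g1) to synthesize one aligned 8-byte store per
         * iteration.  The closing srl/add re-applies the sub-word offset
         * to %o1 so it points at the next unconsumed source byte.
         */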
        and             %o1, 0x7, %g1
        brz,pn          %g1, .Lmedium_noprefetch
        sll             %g1, 3, %g1
        mov             64, %g2
        sub             %g2, %g1, %g2
        andn            %o1, 0x7, %o1
        EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
        sllx            %o4, %g1, %o4
        andn            %o2, 0x08 - 1, %o5
        sub             %o2, %o5, %o2
1:      EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
        add             %o1, 0x08, %o1
        subcc           %o5, 0x08, %o5
        srlx            %g3, %g2, GLOBAL_SPARE
        or              GLOBAL_SPARE, %o4, GLOBAL_SPARE
        EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
        add             %o0, 0x08, %o0
        bne,pt          %icc, 1b
        sllx            %g3, %g1, %o4
        srl             %g1, 3, %g1
        add             %o1, %g1, %o1
        brz,pn          %o2, .Lexit
        nop
        ba,pt           %icc, .Lsmall_unaligned

.Ltiny:
        EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
        subcc           %o2, 1, %o2
        be,pn           %icc, .Lexit
        EX_ST(STORE(stb, %g1, %o0 + 0x00))
        EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
        subcc           %o2, 1, %o2
        be,pn           %icc, .Lexit
        EX_ST(STORE(stb, %g1, %o0 + 0x01))
        EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
        ba,pt           %icc, .Lexit
        EX_ST(STORE(stb, %g1, %o0 + 0x02))

.Lsmall:
        andcc           %g2, 0x3, %g0
        bne,pn          %icc, .Lsmall_unaligned
        andn            %o2, 0x4 - 1, %o5
        sub             %o2, %o5, %o2
1:
        EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
        add             %o1, 0x04, %o1
        subcc           %o5, 0x04, %o5
        add             %o0, 0x04, %o0
        bne,pt          %icc, 1b
        EX_ST(STORE(stw, %g1, %o0 - 0x04))
        brz,pt          %o2, .Lexit
        nop
        ba,a,pt         %icc, .Ltiny

.Lsmall_unaligned:
1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
        add             %o1, 1, %o1
        add             %o0, 1, %o0
        subcc           %o2, 1, %o2
        bne,pt          %icc, 1b
        EX_ST(STORE(stb, %g1, %o0 - 0x01))
        ba,a,pt         %icc, .Lexit
        .size           FUNC_NAME, .-FUNC_NAME