Commit | Line | Data |
---|---|---|
b2441318 | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
398d1083 DM |
2 | /* NGmemcpy.S: Niagara optimized memcpy. |
3 | * | |
25e5566e | 4 | * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net) |
398d1083 DM |
5 | */ |
6 | ||
/*
 * Build-time configuration.
 *
 * Every macro below except GLOBAL_SPARE, RESTORE_ASI and SAVE_AMOUNT is
 * only a default: a file that defines the macro before this point keeps
 * its own definition.
 */

#if defined(__KERNEL__)
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
/* Scratch global register used by the shift-and-mask block copy. */
#define GLOBAL_SPARE	%g7
/* Reload %asi from the byte stored at [%g6 + TI_CURRENT_DS]; TMP is clobbered. */
#define RESTORE_ASI(TMP)	\
	ldub	[%g6 + TI_CURRENT_DS], TMP;  \
	wr	TMP, 0x0, %asi;
#else
#define GLOBAL_SPARE	%g5
/* Outside the kernel %asi is simply set to ASI_PNF; TMP is unused. */
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

/* Amount subtracted from %sp by the "save" at function entry. */
#if defined(__sparc_v9__)
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

/* ASI written to %asi before the block copy loops run. */
#if !defined(STORE_ASI)
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

/*
 * EX_LD / EX_ST wrap each load / store together with the label of its
 * fault fixup stub.  The defaults emit only the instruction (x) and drop
 * the label (y).
 */
#if !defined(EX_LD)
#define EX_LD(x,y)	x
#endif

#if !defined(EX_ST)
#define EX_ST(x,y)	x
#endif

/*
 * LOAD(type, addr, dest): plain load.  With MEMCPY_DEBUG the alternate
 * space form (type##a) with immediate ASI 0x80 is emitted instead.
 */
#if !defined(LOAD)
#if defined(MEMCPY_DEBUG)
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#else
#define LOAD(type,addr,dest)	type [addr], dest
#endif
#endif

/*
 * LOAD_TWIN(addr_reg, dest0, dest1): ldda through
 * ASI_BLK_INIT_QUAD_LDD_P.  Only dest0 appears in the expansion; dest1
 * is accepted but not referenced by this default.
 */
#if !defined(LOAD_TWIN)
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

/* STORE(type, src, addr): plain store. */
#if !defined(STORE)
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * STORE_INIT(src, addr): 8-byte store through the current %asi.  With
 * SIMULATE_NIAGARA_ON_NON_NIAGARA an ordinary stx is emitted instead.
 */
#if !defined(STORE_INIT)
#if defined(SIMULATE_NIAGARA_ON_NON_NIAGARA)
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#else
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#endif
#endif

/* Symbol name given to the copy routine. */
#if !defined(FUNC_NAME)
#define FUNC_NAME	NGmemcpy
#endif

/* Optional code placed at function entry, ahead of the "save". */
#if !defined(PREAMBLE)
#define PREAMBLE
#endif

/* Condition-code set tested by the %XCC branches. */
#if !defined(XCC)
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x

/*
 * Fault fixup stubs.
 *
 * Each NG_ret_* label is named as the second argument of an EX_LD/EX_ST
 * wrapper in the copy routine.  A stub computes the expression spelled
 * out in its own name from the live registers of the copy routine,
 * leaves the result in %i0 and branches to __restore_asi (the final
 * add/mov sits in the branch delay slot).
 * NOTE(review): the result is presumably the number of bytes left
 * uncopied, as consumed by files that override EX_LD/EX_ST - confirm
 * against those files.
 *
 * This whole group is only emitted when EX_RETVAL has not been defined
 * before this point.
 */

/*
 * Common exit: set %asi to ASI_AIUS, then return to the caller of the
 * copy routine.  The "restore" must sit in the delay slot of "ret" so
 * that the register window opened by the copy routine is popped and
 * %i0 becomes the caller's %o0.  The "wr" therefore has to come before
 * "ret", not after it: an instruction placed in the delay slot pushes
 * "restore" out of the executed path.
 */
__restore_asi:
	wr	%g0, ASI_AIUS, %asi
	ret
	 restore

/*
 * %i0 = %i2 + %i4 + 1
 * %i5 holds nothing meaningful on the path that uses this label, so the
 * sum is built from %i4 as the label name states.
 */
ENTRY(NG_ret_i2_plus_i4_plus_1)
	add	%i4, 1, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)

/* %i0 = %i2 + %g1 */
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1)

/* %i0 = %i2 + %g1 - 8 */
ENTRY(NG_ret_i2_plus_g1_minus_8)
	sub	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_8)

/* %i0 = %i2 + %g1 - 16 */
ENTRY(NG_ret_i2_plus_g1_minus_16)
	sub	%g1, 16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_16)

/* %i0 = %i2 + %g1 - 24 */
ENTRY(NG_ret_i2_plus_g1_minus_24)
	sub	%g1, 24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_24)

/* %i0 = %i2 + %g1 - 32 */
ENTRY(NG_ret_i2_plus_g1_minus_32)
	sub	%g1, 32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_32)

/* %i0 = %i2 + %g1 - 40 */
ENTRY(NG_ret_i2_plus_g1_minus_40)
	sub	%g1, 40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_40)

/* %i0 = %i2 + %g1 - 48 */
ENTRY(NG_ret_i2_plus_g1_minus_48)
	sub	%g1, 48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_48)

/* %i0 = %i2 + %g1 - 56 */
ENTRY(NG_ret_i2_plus_g1_minus_56)
	sub	%g1, 56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_56)

/* %i0 = %i2 + %i4 */
ENTRY(NG_ret_i2_plus_i4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4)

/* %i0 = %i2 + %i4 - 8 */
ENTRY(NG_ret_i2_plus_i4_minus_8)
	sub	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_minus_8)

/* %i0 = %i2 + 8 */
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 8, %i0
ENDPROC(NG_ret_i2_plus_8)

/* %i0 = %i2 + 4 */
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 4, %i0
ENDPROC(NG_ret_i2_plus_4)

/* %i0 = %i2 + 1 */
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 1, %i0
ENDPROC(NG_ret_i2_plus_1)

/* %i0 = %i2 + %g1 + 1 */
ENTRY(NG_ret_i2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_plus_1)

/* %i0 = %i2 */
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 mov	%i2, %i0
ENDPROC(NG_ret_i2)

/* %i0 = (%i2 & 7) + %i4 */
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_and_7_plus_i4)
#endif
165 | ||
398d1083 DM |
166 | .align 64 |
167 | ||
168 | .globl FUNC_NAME | |
169 | .type FUNC_NAME,#function | |
25e5566e DM |
170 | FUNC_NAME: /* %i0=dst, %i1=src, %i2=len */ |
171 | PREAMBLE | |
172 | save %sp, -SAVE_AMOUNT, %sp | |
173 | srlx %i2, 31, %g2 | |
398d1083 DM |
174 | cmp %g2, 0 |
175 | tne %xcc, 5 | |
25e5566e DM |
176 | mov %i0, %o0 |
177 | cmp %i2, 0 | |
398d1083 | 178 | be,pn %XCC, 85f |
25e5566e DM |
179 | or %o0, %i1, %i3 |
180 | cmp %i2, 16 | |
398d1083 | 181 | blu,a,pn %XCC, 80f |
25e5566e | 182 | or %i3, %i2, %i3 |
398d1083 DM |
183 | |
184 | /* 2 blocks (128 bytes) is the minimum we can do the block | |
185 | * copy with. We need to ensure that we'll iterate at least | |
186 | * once in the block copy loop. At worst we'll need to align | |
187 | * the destination to a 64-byte boundary which can chew up | |
188 | * to (64 - 1) bytes from the length before we perform the | |
189 | * block copy loop. | |
190 | */ | |
25e5566e | 191 | cmp %i2, (2 * 64) |
398d1083 | 192 | blu,pt %XCC, 70f |
25e5566e | 193 | andcc %i3, 0x7, %g0 |
398d1083 DM |
194 | |
195 | /* %o0: dst | |
25e5566e DM |
196 | * %i1: src |
197 | * %i2: len (known to be >= 128) | |
398d1083 | 198 | * |
25e5566e | 199 | * The block copy loops will use %i4/%i5,%g2/%g3 as |
398d1083 DM |
200 | * temporaries while copying the data. |
201 | */ | |
202 | ||
25e5566e | 203 | LOAD(prefetch, %i1, #one_read) |
398d1083 DM |
204 | wr %g0, STORE_ASI, %asi |
205 | ||
206 | /* Align destination on 64-byte boundary. */ | |
25e5566e | 207 | andcc %o0, (64 - 1), %i4 |
398d1083 | 208 | be,pt %XCC, 2f |
25e5566e DM |
209 | sub %i4, 64, %i4 |
210 | sub %g0, %i4, %i4 ! bytes to align dst | |
211 | sub %i2, %i4, %i2 | |
212 | 1: subcc %i4, 1, %i4 | |
7ae3aaf5 DM |
213 | EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1) |
214 | EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1) | |
25e5566e | 215 | add %i1, 1, %i1 |
398d1083 DM |
216 | bne,pt %XCC, 1b |
217 | add %o0, 1, %o0 | |
218 | ||
219 | /* If the source is on a 16-byte boundary we can do | |
220 | * the direct block copy loop. If it is 8-byte aligned | |
221 | * we can do the 16-byte loads offset by -8 bytes and the | |
222 | * init stores offset by one register. | |
223 | * | |
224 | * If the source is not even 8-byte aligned, we need to do | |
225 | * shifting and masking (basically integer faligndata). | |
226 | * | |
227 | * The careful bit with init stores is that if we store | |
228 | * to any part of the cache line we have to store the whole | |
229 | * cacheline else we can end up with corrupt L2 cache line | |
230 | * contents. Since the loop works on 64-bytes of 64-byte | |
231 | * aligned store data at a time, this is easy to ensure. | |
232 | */ | |
233 | 2: | |
25e5566e DM |
234 | andcc %i1, (16 - 1), %i4 |
235 | andn %i2, (64 - 1), %g1 ! block copy loop iterator | |
398d1083 | 236 | be,pt %XCC, 50f |
25e5566e DM |
237 | sub %i2, %g1, %i2 ! final sub-block copy bytes |
238 | ||
239 | cmp %i4, 8 | |
240 | be,pt %XCC, 10f | |
241 | sub %i1, %i4, %i1 | |
398d1083 DM |
242 | |
243 | /* Neither 8-byte nor 16-byte aligned, shift and mask. */ | |
25e5566e DM |
244 | and %i4, 0x7, GLOBAL_SPARE |
245 | sll GLOBAL_SPARE, 3, GLOBAL_SPARE | |
246 | mov 64, %i5 | |
7ae3aaf5 | 247 | EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1) |
25e5566e DM |
248 | sub %i5, GLOBAL_SPARE, %i5 |
249 | mov 16, %o4 | |
250 | mov 32, %o5 | |
251 | mov 48, %o7 | |
252 | mov 64, %i3 | |
253 | ||
254 | bg,pn %XCC, 9f | |
255 | nop | |
398d1083 | 256 | |
25e5566e DM |
257 | #define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \ |
258 | sllx WORD1, POST_SHIFT, WORD1; \ | |
259 | srlx WORD2, PRE_SHIFT, TMP; \ | |
260 | sllx WORD2, POST_SHIFT, WORD2; \ | |
261 | or WORD1, TMP, WORD1; \ | |
262 | srlx WORD3, PRE_SHIFT, TMP; \ | |
263 | or WORD2, TMP, WORD2; | |
264 | ||
7ae3aaf5 | 265 | 8: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1) |
25e5566e DM |
266 | MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1) |
267 | LOAD(prefetch, %i1 + %i3, #one_read) | |
268 | ||
7ae3aaf5 DM |
269 | EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1) |
270 | EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8) | |
25e5566e | 271 | |
7ae3aaf5 | 272 | EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16) |
25e5566e DM |
273 | MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1) |
274 | ||
7ae3aaf5 DM |
275 | EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16) |
276 | EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24) | |
25e5566e | 277 | |
7ae3aaf5 | 278 | EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32) |
25e5566e DM |
279 | MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1) |
280 | ||
7ae3aaf5 DM |
281 | EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32) |
282 | EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40) | |
25e5566e | 283 | |
7ae3aaf5 | 284 | EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48) |
25e5566e DM |
285 | add %i1, 64, %i1 |
286 | MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1) | |
287 | ||
7ae3aaf5 DM |
288 | EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48) |
289 | EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56) | |
25e5566e DM |
290 | |
291 | subcc %g1, 64, %g1 | |
292 | bne,pt %XCC, 8b | |
398d1083 DM |
293 | add %o0, 64, %o0 |
294 | ||
25e5566e DM |
295 | ba,pt %XCC, 60f |
296 | add %i1, %i4, %i1 | |
297 | ||
7ae3aaf5 | 298 | 9: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1) |
25e5566e DM |
299 | MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1) |
300 | LOAD(prefetch, %i1 + %i3, #one_read) | |
301 | ||
7ae3aaf5 DM |
302 | EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1) |
303 | EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8) | |
25e5566e | 304 | |
7ae3aaf5 | 305 | EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16) |
25e5566e DM |
306 | MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1) |
307 | ||
7ae3aaf5 DM |
308 | EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16) |
309 | EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24) | |
25e5566e | 310 | |
7ae3aaf5 | 311 | EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32) |
25e5566e DM |
312 | MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1) |
313 | ||
7ae3aaf5 DM |
314 | EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32) |
315 | EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40) | |
25e5566e | 316 | |
7ae3aaf5 | 317 | EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48) |
25e5566e DM |
318 | add %i1, 64, %i1 |
319 | MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1) | |
320 | ||
7ae3aaf5 DM |
321 | EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48) |
322 | EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56) | |
25e5566e DM |
323 | |
324 | subcc %g1, 64, %g1 | |
325 | bne,pt %XCC, 9b | |
326 | add %o0, 64, %o0 | |
398d1083 | 327 | |
398d1083 | 328 | ba,pt %XCC, 60f |
25e5566e | 329 | add %i1, %i4, %i1 |
398d1083 DM |
330 | |
331 | 10: /* Destination is 64-byte aligned, source was only 8-byte | |
332 | * aligned but it has been subtracted by 8 and we perform | |
333 | * one twin load ahead, then add 8 back into source when | |
334 | * we finish the loop. | |
335 | */ | |
7ae3aaf5 | 336 | EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1) |
25e5566e DM |
337 | mov 16, %o7 |
338 | mov 32, %g2 | |
339 | mov 48, %g3 | |
340 | mov 64, %o1 | |
7ae3aaf5 | 341 | 1: EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1) |
25e5566e | 342 | LOAD(prefetch, %i1 + %o1, #one_read) |
7ae3aaf5 DM |
343 | EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1) ! initializes cache line |
344 | EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8) | |
345 | EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16) | |
346 | EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16) | |
347 | EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24) | |
348 | EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32) | |
349 | EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32) | |
350 | EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40) | |
351 | EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48) | |
25e5566e | 352 | add %i1, 64, %i1 |
7ae3aaf5 DM |
353 | EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48) |
354 | EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56) | |
398d1083 DM |
355 | subcc %g1, 64, %g1 |
356 | bne,pt %XCC, 1b | |
357 | add %o0, 64, %o0 | |
358 | ||
359 | ba,pt %XCC, 60f | |
25e5566e | 360 | add %i1, 0x8, %i1 |
398d1083 DM |
361 | |
362 | 50: /* Destination is 64-byte aligned, and source is 16-byte | |
363 | * aligned. | |
364 | */ | |
25e5566e DM |
365 | mov 16, %o7 |
366 | mov 32, %g2 | |
367 | mov 48, %g3 | |
368 | mov 64, %o1 | |
7ae3aaf5 DM |
369 | 1: EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1) |
370 | EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1) | |
25e5566e | 371 | LOAD(prefetch, %i1 + %o1, #one_read) |
7ae3aaf5 DM |
372 | EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1) ! initializes cache line |
373 | EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8) | |
374 | EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16) | |
375 | EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16) | |
376 | EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24) | |
377 | EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32) | |
25e5566e | 378 | add %i1, 64, %i1 |
7ae3aaf5 DM |
379 | EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32) |
380 | EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40) | |
381 | EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48) | |
382 | EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56) | |
398d1083 DM |
383 | subcc %g1, 64, %g1 |
384 | bne,pt %XCC, 1b | |
385 | add %o0, 64, %o0 | |
386 | /* fall through */ | |
387 | ||
388 | 60: | |
24d559ca DM |
389 | membar #Sync |
390 | ||
25e5566e | 391 | /* %i2 contains any final bytes still needed to be copied |
398d1083 DM |
392 | * over. If anything is left, we copy it one byte at a time. |
393 | */ | |
25e5566e DM |
394 | RESTORE_ASI(%i3) |
395 | brz,pt %i2, 85f | |
396 | sub %o0, %i1, %i3 | |
398d1083 | 397 | ba,a,pt %XCC, 90f |
0ae2d26f | 398 | nop |
398d1083 DM |
399 | |
400 | .align 64 | |
401 | 70: /* 16 < len <= 64 */ | |
402 | bne,pn %XCC, 75f | |
25e5566e | 403 | sub %o0, %i1, %i3 |
398d1083 DM |
404 | |
405 | 72: | |
25e5566e DM |
406 | andn %i2, 0xf, %i4 |
407 | and %i2, 0xf, %i2 | |
408 | 1: subcc %i4, 0x10, %i4 | |
7ae3aaf5 | 409 | EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4) |
25e5566e | 410 | add %i1, 0x08, %i1 |
7ae3aaf5 | 411 | EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4) |
25e5566e | 412 | sub %i1, 0x08, %i1 |
7ae3aaf5 | 413 | EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4) |
25e5566e | 414 | add %i1, 0x8, %i1 |
7ae3aaf5 | 415 | EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8) |
398d1083 | 416 | bgu,pt %XCC, 1b |
25e5566e DM |
417 | add %i1, 0x8, %i1 |
418 | 73: andcc %i2, 0x8, %g0 | |
398d1083 DM |
419 | be,pt %XCC, 1f |
420 | nop | |
25e5566e | 421 | sub %i2, 0x8, %i2 |
7ae3aaf5 DM |
422 | EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8) |
423 | EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8) | |
25e5566e DM |
424 | add %i1, 0x8, %i1 |
425 | 1: andcc %i2, 0x4, %g0 | |
398d1083 DM |
426 | be,pt %XCC, 1f |
427 | nop | |
25e5566e | 428 | sub %i2, 0x4, %i2 |
7ae3aaf5 DM |
429 | EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4) |
430 | EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4) | |
25e5566e DM |
431 | add %i1, 0x4, %i1 |
432 | 1: cmp %i2, 0 | |
398d1083 DM |
433 | be,pt %XCC, 85f |
434 | nop | |
435 | ba,pt %xcc, 90f | |
436 | nop | |
437 | ||
438 | 75: | |
439 | andcc %o0, 0x7, %g1 | |
440 | sub %g1, 0x8, %g1 | |
441 | be,pn %icc, 2f | |
442 | sub %g0, %g1, %g1 | |
25e5566e | 443 | sub %i2, %g1, %i2 |
398d1083 DM |
444 | |
445 | 1: subcc %g1, 1, %g1 | |
7ae3aaf5 DM |
446 | EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1) |
447 | EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1) | |
398d1083 | 448 | bgu,pt %icc, 1b |
25e5566e | 449 | add %i1, 1, %i1 |
398d1083 | 450 | |
25e5566e DM |
451 | 2: add %i1, %i3, %o0 |
452 | andcc %i1, 0x7, %g1 | |
398d1083 DM |
453 | bne,pt %icc, 8f |
454 | sll %g1, 3, %g1 | |
455 | ||
25e5566e | 456 | cmp %i2, 16 |
398d1083 DM |
457 | bgeu,pt %icc, 72b |
458 | nop | |
459 | ba,a,pt %xcc, 73b | |
460 | ||
25e5566e DM |
461 | 8: mov 64, %i3 |
462 | andn %i1, 0x7, %i1 | |
7ae3aaf5 | 463 | EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2) |
25e5566e DM |
464 | sub %i3, %g1, %i3 |
465 | andn %i2, 0x7, %i4 | |
398d1083 | 466 | sllx %g2, %g1, %g2 |
25e5566e | 467 | 1: add %i1, 0x8, %i1 |
7ae3aaf5 | 468 | EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4) |
25e5566e DM |
469 | subcc %i4, 0x8, %i4 |
470 | srlx %g3, %i3, %i5 | |
471 | or %i5, %g2, %i5 | |
7ae3aaf5 | 472 | EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4) |
398d1083 DM |
473 | add %o0, 0x8, %o0 |
474 | bgu,pt %icc, 1b | |
475 | sllx %g3, %g1, %g2 | |
476 | ||
477 | srl %g1, 3, %g1 | |
25e5566e | 478 | andcc %i2, 0x7, %i2 |
398d1083 | 479 | be,pn %icc, 85f |
25e5566e | 480 | add %i1, %g1, %i1 |
398d1083 | 481 | ba,pt %xcc, 90f |
25e5566e | 482 | sub %o0, %i1, %i3 |
398d1083 DM |
483 | |
484 | .align 64 | |
485 | 80: /* 0 < len <= 16 */ | |
25e5566e | 486 | andcc %i3, 0x3, %g0 |
398d1083 | 487 | bne,pn %XCC, 90f |
25e5566e | 488 | sub %o0, %i1, %i3 |
398d1083 DM |
489 | |
490 | 1: | |
25e5566e | 491 | subcc %i2, 4, %i2 |
7ae3aaf5 DM |
492 | EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4) |
493 | EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4) | |
398d1083 | 494 | bgu,pt %XCC, 1b |
25e5566e | 495 | add %i1, 4, %i1 |
398d1083 | 496 | |
25e5566e DM |
497 | 85: ret |
498 | restore EX_RETVAL(%i0), %g0, %o0 | |
398d1083 DM |
499 | |
500 | .align 32 | |
501 | 90: | |
25e5566e | 502 | subcc %i2, 1, %i2 |
7ae3aaf5 DM |
503 | EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1) |
504 | EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1) | |
398d1083 | 505 | bgu,pt %XCC, 90b |
25e5566e DM |
506 | add %i1, 1, %i1 |
507 | ret | |
508 | restore EX_RETVAL(%i0), %g0, %o0 | |
398d1083 DM |
509 | |
510 | .size FUNC_NAME, .-FUNC_NAME |