/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow:
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY		(SHORTCOPY=3)
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK	(SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {					More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:				src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {				src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid copying store cache
 *       lines from memory. But pre-store first element of each cache line
 *       ST_CHUNK lines in advance of the rest of that cache line. That
 *       gives time for replacement cache lines to be written back without
 *       excess STQ and Miss Buffer filling. Repeat until near the end,
 *       then finish up storing before going to finish_long.
 *     }
 *   } else {				src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 *   }
 *
 */
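
/*
 * The misaligned integer paths below (see .Lmedium_unaligned_cp) use a
 * classic big-endian shift-and-merge: read aligned doublewords around
 * the source and splice each output doubleword from two of them. As a
 * rough C sketch of that technique (illustrative only; assumes
 * 0 < shift < 64, since the fully aligned case branches elsewhere):
 *
 *	unsigned long *asrc = (unsigned long *)((unsigned long)src & ~7UL);
 *	unsigned int shift = ((unsigned long)src & 7UL) * 8;
 *	unsigned long prev = *asrc++ << shift;	// drop bytes before src
 *	while (len >= 8) {
 *		unsigned long next = *asrc++;
 *		*(unsigned long *)dst = prev | (next >> (64 - shift));
 *		prev = next << shift;
 *		dst += 8;			// dst is a char pointer
 *		len -= 8;
 *	}
 */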

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line.
 */
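/*
 * Per 64-byte line the resulting store pattern is, in outline:
 *	STORE_INIT_MRU	for doublewords 0..6	(line stays "most
 *						 recently used")
 *	STORE_INIT	for doubleword  7	(final store lets the
 *						 line age out normally)
 */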
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define BLOCK_SIZE	64
#define SHORTCOPY	3
#define SHORTCHECK	14
#define SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define SMALL_MAX	128
#define MED_UMAX	1024	/* max copy for medium un-aligned case */
#define MED_WMAX	1024	/* max copy for medium word-aligned case */
#define MED_MAX		1024	/* max copy for medium longword-aligned case */
#define ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE	24	/* distance for aligned prefetch loop */

	.register	%g2,#scratch

	.section	".text"
	.global		FUNC_NAME
	.type		FUNC_NAME, #function
	.align		16
FUNC_NAME:
	srlx	%o2, 31, %g2
	cmp	%g2, 0
	tne	%xcc, 5
	PREAMBLE
	mov	%o0, %g1	! save %o0
	brz,pn	%o2, .Lsmallx
	cmp	%o2, 3
	ble,pn	%icc, .Ltiny_cp
	cmp	%o2, 19
	ble,pn	%icc, .Lsmall_cp
	or	%o0, %o1, %g2
	cmp	%o2, SMALL_MAX
	bl,pn	%icc, .Lmedium_cp
	nop

.Lmedium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
	brz,pt	%o5, .Ldst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1	! %o1 gets the difference
7:				! dst aligning loop
	add	%o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
	subcc	%o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
	bgu,pt	%xcc, 7b
	add	%o0, 1, %o0	! advance dst
	add	%o1, %o0, %o1	! restore %o1
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	nop

.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set	MED_MAX, %o3
	cmp	%o2, %o3	! limit to store buffer size
	bgu,pn	%xcc, .Llarge_align8_copy
	nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
	nop
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
	subcc	%o2, 64, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
	add	%o1, 64, %o1		! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add	%o0, 64, %o0		! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
	sub	%o2, 32, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub	%o2, 16, %o2		! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%xcc, .Lsmallx		! exit if finished
	cmp	%o2, 8
	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	tst	%o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pn	%xcc, .Lmedw7
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
	retl
	mov	EX_RETVAL(%g1), %o0	! restore %o0

	.align 16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%xcc, .Lunalignsetup	! branch to skip if not word aligned
	nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination are in cache, for .Lmedium
 * to short data moves.
 */
	set	MED_WMAX, %o3
	cmp	%o2, %o3		! limit to store buffer size
	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin main loop
	nop

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%xcc, .Lmedw31		! skip big loop if fewer than 32 bytes
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc	%o2, 32, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or	%o4, %o5, %o5
	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%xcc, .Lsmallx		! exit if finished
	nop
	cmp	%o2, 16
	blt,pt	%xcc, .Lmedw15
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt	%xcc, .Lsmallx		! exit if finished
	cmp	%o2, 8
	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	tst	%o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	bz,pt	%xcc, .Lsmallx		! exit if finished
.Lmedw7:				! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.Lsmallleft3
	EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
	retl
	mov	EX_RETVAL(%g1), %o0

	.align 16
.Llarge_align8_copy:			! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .Laligned_to_64
	andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .Laligned_to_16
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .Laligned_to_32
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .Laligned_to_64
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add	%o1, 32, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 32, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
!
! Using block init store (BIS) instructions to avoid fetching cache
! lines from memory. Use ST_CHUNK stores to first element of each cache
! line (similar to prefetching) to avoid overfilling STQ or miss buffers.
! Gives existing cache lines time to be moved out of L1/L2/L3 cache.
! Initial stores using MRU version of BIS to keep cache line in
! cache until we are ready to store final element of cache line.
! Then store last element using the LRU version of BIS.
!
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
!
! We use STORE_MRU_ASI for the first seven stores to each cache line
! followed by STORE_ASI (mark as LRU) for the last store. That
! mixed approach reduces the probability that the cache line is removed
! before we finish setting it, while minimizing the effects on
! other cached values during a large memcpy.
!
! ST_CHUNK batches up the initial BIS operations for several cache
! lines so that multiple requests are not blocked by overflowing the
! store miss buffer. Then the matching stores for all those
! BIS operations are executed.
!

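!
! As a rough C sketch of that two-phase structure (illustrative only;
! the helper names are hypothetical stand-ins for the BIS stores
! issued below, with src/dst treated as unsigned long pointers):
!
!	while (blocks >= ST_CHUNK) {
!		for (i = 0; i < ST_CHUNK; i++)		/* phase 1: open lines */
!			bis_store_mru(&dst[i * 8], src[i * 8]);
!		for (i = 0; i < ST_CHUNK; i++) {	/* phase 2: fill lines */
!			for (j = 1; j < 7; j++)
!				bis_store_mru(&dst[i * 8 + j], src[i * 8 + j]);
!			bis_store_lru(&dst[i * 8 + 7], src[i * 8 + 7]);
!		}
!		src += ST_CHUNK * 8; dst += ST_CHUNK * 8;
!		blocks -= ST_CHUNK;
!	}
!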
	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%xcc, .Lalign_loop_fin
	mov	ST_CHUNK,%o3
.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu	%xcc,.Lalign_loop_start
	add	%o0, 56, %o0

	mov	ST_CHUNK,%o3
	sllx	%o3, 6, %o4		! ST_CHUNK*64
	sub	%o1, %o4, %o1		! reset %o1
	sub	%o0, %o4, %o0		! reset %o0

.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub	%o5, 64, %o5
	bgu	%xcc,.Lalign_loop_rest
	! mark cache line as LRU
	EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

	cmp	%o5, ST_CHUNK*64
	bgu,pt	%xcc, .Lalign_loop_start
	mov	ST_CHUNK,%o3

	cmp	%o5, 0
	beq	.Lalign_done
	nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
	add	%o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
	add	%o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
	bgu	%xcc,.Lalign_loop_fin
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
	add	%o0, 8, %o0		! restore %o0 from ASI alignment
	membar	#StoreStore
	sub	%o2, 63, %o2		! adjust length to allow cc test
	ba	.Lmedl63		! in .Lmedl63
	nop

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
	mov	%g1, %o3	! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov	%o3, %g1	! restore %g1

	set	MED_UMAX, %o3
	cmp	%o2, %o3		! check for .Lmedium unaligned limit
	bge,pt	%xcc, .Lunalign_large
	prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! ensure we do not load beyond
	bgt	.Lunalign_adjust	! end of source buffer
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
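	! Note on the VIS loop below: alignaddr writes the low three bits
	! of %o1 into %gsr.align, and each faligndata concatenates its two
	! source registers and extracts eight bytes starting at that byte
	! offset, realigning the stream without byte loads.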
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc	%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add	%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%xcc, .Lunalign_loop
	prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba	.Lunalign_done
	nop

.Lunalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%xcc, .Lunalignsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%xcc, .Lunalignbyte	! check for byte alignment
	nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%xcc, .Lunalignhalf
	nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
	subcc	%o3, 8, %o3		! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
	add	%o0, 8, %o0		! increase dst ptr by 8
	bnz	%xcc, .Lunalignword
	EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba	.Lunalignsrc
	nop

	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignhalf
	add	%o0, 8, %o0
	ba	.Lunalignsrc
	nop

	! Src is byte aligned
.Lunalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 8, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	add	%o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub	%o0, %o1, %o0
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignbyte_loop
	add	%o1, 8, %o1
	add	%o0, %o1, %o0		! restore pointer

	! Destination is now block (64 byte) aligned
.Lunalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! ensure we do not load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add	%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add	%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2	%f30, %f14
	bgu,pt	%xcc, .Lunalign_sloop
	prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%xcc, .Lunalign_short

	andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! ensure we do not load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partial word
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2	%f2, %f0
	bgu,pt	%xcc, .Lunalign_by8
	add	%o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba	.Lsmallrest
	nop

/*
 * This is a special case of nested memcpy. This can happen when the
 * kernel calls unaligned memcpy back to back without saving FP
 * registers. We need traps (context switches) to save/restore FP
 * registers. If the kernel calls memcpy without this trap sequence we
 * will hit FP corruption. Let's use the normal integer load/store
 * method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or	%o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc	%g2, 0x7, %g0
	bne,pn	%xcc, .Lmedium_unaligned_cp
	nop

.Lmedium_noprefetch_cp:
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%xcc, 2f
	sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt	%xcc, 1b
	add	%o0, 0x20, %o0
2:	andcc	%o2, 0x18, %o5
	be,pt	%xcc, 3f
	sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%xcc, 1b
	EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt	%o2, .Lexit_cp
	cmp	%o2, 0x04
	bl,pn	%xcc, .Ltiny_cp
	nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%xcc, .Ltiny_cp
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt	%xcc, .Lexit_cp

.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned.  */
	sub	%g0, %o0, %o3
	and	%o3, 0x7, %o3
	brz,pt	%o3, 2f
	sub	%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add	%o1, 1, %o1
	subcc	%o3, 1, %o3
	add	%o0, 1, %o0
	bne,pt	%xcc, 1b
	EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and	%o1, 0x7, %o3
	brz,pn	%o3, .Lmedium_noprefetch_cp
	sll	%o3, 3, %o3
	mov	64, %g2
	sub	%g2, %o3, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx	%o4, %o3, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, %g7
	or	%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, 0x08, %o0
	bne,pt	%xcc, 1b
	sllx	%g3, %o3, %o4
	srl	%o3, 3, %o3
	add	%o1, %o3, %o1
	brz,pn	%o2, .Lexit_cp
	nop
	ba,pt	%xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
	andcc	%g2, 0x3, %g0
	bne,pn	%xcc, .Lsmall_unaligned_cp
	andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%xcc, 1b
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt	%o2, .Lexit_cp
	nop
	ba,a,pt	%xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%xcc, 1b
	EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt	%xcc, .Lexit_cp

.Lsmallrest:
	tst	%o2
	bz,pt	%xcc, .Lsmallx
	cmp	%o2, 4
	blt,pn	%xcc, .Lsmallleft3
	nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)	! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)	! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)	! for total of 4
	add	%o1, 4, %o1		! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add	%o0, 4, %o0		! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt	%xcc, .Lsmallnotalign4	! loop until 3 or fewer bytes remain
	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
.Lsmallx:
	retl
	mov	EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst	%o2
	bnz,pn	%xcc, .Lsmallleft3
	nop
	retl
	mov	EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	mov	EX_RETVAL(%g1), %o0
	.size	FUNC_NAME, .-FUNC_NAME