2 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
3 * xthal_memcpy and xthal_bcopy
5 * This file is subject to the terms and conditions of the GNU General Public
6 * License. See the file "COPYING" in the main directory of this archive
9 * Copyright (C) 2002 - 2012 Tensilica Inc.
12 #include <linux/linkage.h>
13 #include <asm/asmmacro.h>
17 * void *memcpy(void *dst, const void *src, size_t len);
19 * This function is intended to do the same thing as the standard
20 * library function memcpy() for most cases.
21 * However, where the source and/or destination references
22 * an instruction RAM or ROM or a data RAM or ROM, that
23 * source and/or destination will always be accessed with
24 * 32-bit load and store instructions (as required for these types of RAM/ROM).
28 * !!!!!!! Handling of IRAM/IROM has not yet
29 * !!!!!!! been implemented.
31 * The (general case) algorithm is as follows:
32 * If destination is unaligned, align it by conditionally
33 * copying 1 and 2 bytes.
34 * If source is aligned,
35 * do 16 bytes with a loop, and then finish up with
36 * 8, 4, 2, and 1 byte copies conditional on the length;
37 * else (if source is unaligned),
38 * do the same, but use SRC to align the source data.
39 * This code tries to use fall-through branches for the common
40 * case of aligned source and destination and multiple of 4 (or 8) length.
64 .byte 0 # 1 mod 4 alignment for LOOPNEZ
65 # (0 mod 4 alignment for LBEG)
68 loopnez a4, .Lbytecopydone # forward byte loop runs a4 times, skipped entirely when a4 == 0
69 #else /* !XCHAL_HAVE_LOOPS */
70 beqz a4, .Lbytecopydone # zero-length copy: nothing to do
71 add a7, a3, a4 # a7 = end address for source (a3 + a4)
72 #endif /* !XCHAL_HAVE_LOOPS */
79 bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
80 #endif /* !XCHAL_HAVE_LOOPS */
85 * Destination is unaligned
89 .Ldst1mod2: # dst is only byte aligned (reached when address bit 0 of dst is set)
90 _bltui a4, 7, .Lbytecopy # do short copies (a4 < 7, unsigned) byte by byte
98 _bbci.l a5, 1, .Ldstaligned # if dst is now aligned (bit 1 of a5 clear), then
99 # return to main algorithm
100 .Ldst2mod4: # dst 16-bit aligned (reached when address bit 1 of dst is set)
102 _bltui a4, 6, .Lbytecopy # do short copies (a4 < 6, unsigned) byte by byte
110 j .Ldstaligned # dst is now aligned, return to main algorithm
116 # a2/ dst, a3/ src, a4/ len
117 mov a5, a2 # copy dst so that a2 is return value (a5 becomes the working dst pointer)
119 _bbsi.l a2, 0, .Ldst1mod2 # if dst is 1 mod 2 (address bit 0 set)
120 _bbsi.l a2, 1, .Ldst2mod4 # if dst is 2 mod 4 (address bit 1 set)
121 .Ldstaligned: # return here from .Ldst?mod? once dst is aligned
122 srli a7, a4, 4 # number of loop iterations with 16B (a7 = a4 >> 4)
124 movi a8, 3 # a8 = 3, mask of the two low address bits - if source is not aligned,
125 _bany a3, a8, .Lsrcunaligned # then use shifting copy
127 * Destination and source are word-aligned, use word copy.
129 # copy 16 bytes per iteration for word-aligned dst and word-aligned src
131 loopnez a7, .Loop1done # 16-byte loop runs a7 times, skipped when a7 == 0
132 #else /* !XCHAL_HAVE_LOOPS */
135 add a8, a8, a3 # a8 = end of last 16B source chunk
136 #endif /* !XCHAL_HAVE_LOOPS */
148 #if !XCHAL_HAVE_LOOPS
149 bne a3, a8, .Loop1 # continue loop if a3:src != a8:src_end
150 #endif /* !XCHAL_HAVE_LOOPS */
189 * Destination is aligned, Source is unaligned
194 _beqz a4, .Ldone # avoid loading anything for zero-length copies
195 # copy 16 bytes per iteration for word-aligned dst and unaligned src
196 __ssa8 a3 # set shift amount from byte offset of the source pointer a3
198 /* set to 1 when running on ISS (simulator) with the
199 lint or ferret client, or 0 to save a few cycles */
200 #define SIM_CHECKS_ALIGNMENT 1
201 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
202 and a11, a3, a8 # save unalignment offset for below (a11 = a3 & a8)
203 sub a3, a3, a11 # align a3 (subtract the offset saved in a11)
205 l32i a6, a3, 0 # load first word
207 loopnez a7, .Loop2done # 16-byte shifting loop runs a7 times, skipped when a7 == 0
208 #else /* !XCHAL_HAVE_LOOPS */
211 add a10, a10, a3 # a10 = end of last 16B source chunk
212 #endif /* !XCHAL_HAVE_LOOPS */
228 #if !XCHAL_HAVE_LOOPS
229 bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
230 #endif /* !XCHAL_HAVE_LOOPS */
253 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
254 add a3, a3, a11 # readjust a3 with correct misalignment (adds back the offset subtracted earlier)
258 .Ldone: abi_ret_default # exit label targeted by the zero-length check above
278 * void *memmove(void *dst, const void *src, size_t len);
280 * This function is intended to do the same thing as the standard
281 * library function memmove() for most cases.
282 * However, where the source and/or destination references
283 * an instruction RAM or ROM or a data RAM or ROM, that
284 * source and/or destination will always be accessed with
285 * 32-bit load and store instructions (as required for these types of RAM/ROM).
289 * !!!!!!! Handling of IRAM/IROM has not yet
290 * !!!!!!! been implemented.
292 * The (general case) algorithm is as follows:
293 * If end of source doesn't overlap destination then use memcpy.
294 * Otherwise do memcpy backwards.
315 .byte 0 # 1 mod 4 alignment for LOOPNEZ
316 # (0 mod 4 alignment for LBEG)
319 loopnez a4, .Lbackbytecopydone # backward byte loop runs a4 times, skipped entirely when a4 == 0
320 #else /* !XCHAL_HAVE_LOOPS */
321 beqz a4, .Lbackbytecopydone # zero-length copy: nothing to do
322 sub a7, a3, a4 # a7 = start address for source (a3 - a4)
323 #endif /* !XCHAL_HAVE_LOOPS */
329 #if !XCHAL_HAVE_LOOPS
330 bne a3, a7, .Lbacknextbyte # continue loop if
331 # $a3:src != $a7:src_start
332 #endif /* !XCHAL_HAVE_LOOPS */
337 * Destination is unaligned
341 .Lbackdst1mod2: # dst is only byte aligned (reached when bit 0 of a5 is set)
342 _bltui a4, 7, .Lbackbytecopy # do short copies (a4 < 7, unsigned) byte by byte
350 _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned (bit 1 of a5 clear), then
351 # return to main algorithm
352 .Lbackdst2mod4: # dst 16-bit aligned (reached when bit 1 of a5 is set)
354 _bltui a4, 6, .Lbackbytecopy # do short copies (a4 < 6, unsigned) byte by byte
362 j .Lbackdstaligned # dst is now aligned,
363 # return to main algorithm
369 # a2/ dst, a3/ src, a4/ len
370 mov a5, a2 # copy dst so that a2 is return value (a5 becomes the working dst pointer)
373 bgeu a6, a4, .Lcommon # unsigned a6 >= a4: take .Lcommon instead of the backward copy below (NOTE(review): a6 is set on a line not visible here, confirm what it holds)
378 _bbsi.l a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2 (bit 0 of a5 set)
379 _bbsi.l a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4 (bit 1 of a5 set)
380 .Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned
381 srli a7, a4, 4 # number of loop iterations with 16B (a7 = a4 >> 4)
383 movi a8, 3 # a8 = 3, mask of the two low address bits - if source is not aligned,
384 _bany a3, a8, .Lbacksrcunaligned # then use shifting copy
386 * Destination and source are word-aligned, use word copy.
388 # copy 16 bytes per iteration for word-aligned dst and word-aligned src
390 loopnez a7, .LbackLoop1done # 16-byte backward loop runs a7 times, skipped when a7 == 0
391 #else /* !XCHAL_HAVE_LOOPS */
392 beqz a7, .LbackLoop1done # no full 16-byte chunks to copy
394 sub a8, a3, a8 # a8 = start of first 16B source chunk
395 #endif /* !XCHAL_HAVE_LOOPS */
407 #if !XCHAL_HAVE_LOOPS
408 bne a3, a8, .LbackLoop1 # continue loop if a3:src != a8:src_start
409 #endif /* !XCHAL_HAVE_LOOPS */
411 bbci.l a4, 3, .Lback2 # bit 3 of a4 clear: no 8-byte piece in the tail
420 bbsi.l a4, 2, .Lback3 # bit 2 of a4 set: a 4-byte piece remains
421 bbsi.l a4, 1, .Lback4 # bit 1 of a4 set: a 2-byte piece remains
422 bbsi.l a4, 0, .Lback5 # bit 0 of a4 set: a single byte remains
430 bbsi.l a4, 1, .Lback4 # bit 1 of a4 set: a 2-byte piece remains
431 bbsi.l a4, 0, .Lback5 # bit 0 of a4 set: a single byte remains
439 bbsi.l a4, 0, .Lback5 # bit 0 of a4 set: a single byte remains
450 * Destination is aligned, Source is unaligned
455 _beqz a4, .Lbackdone # avoid loading anything for zero-length copies
456 # copy 16 bytes per iteration for word-aligned dst and unaligned src
457 __ssa8 a3 # set shift amount from byte offset
458 #define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS with
459 * the lint or ferret client, or 0
460 * to save a few cycles */
461 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
462 and a11, a3, a8 # save unalignment offset for below
463 sub a3, a3, a11 # align a3
465 l32i a6, a3, 0 # load first word
467 loopnez a7, .LbackLoop2done
468 #else /* !XCHAL_HAVE_LOOPS */
469 beqz a7, .LbackLoop2done
471 sub a10, a3, a10 # a10 = start of first 16B source chunk
472 #endif /* !XCHAL_HAVE_LOOPS */
488 #if !XCHAL_HAVE_LOOPS
489 bne a3, a10, .LbackLoop2 # continue loop if a3:src != a10:src_start
490 #endif /* !XCHAL_HAVE_LOOPS */
492 bbci.l a4, 3, .Lback12
504 bbci.l a4, 2, .Lback13
513 #if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
514 add a3, a3, a11 # readjust a3 with correct misalignment
516 bbsi.l a4, 1, .Lback14
517 bbsi.l a4, 0, .Lback15
528 bbsi.l a4, 0, .Lback15