Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | Copyright 2003 Richard Curnow, SuperH (UK) Ltd. | |
3 | ||
4 | This file is subject to the terms and conditions of the GNU General Public | |
5 | License. See the file "COPYING" in the main directory of this archive | |
6 | for more details. | |
7 | ||
8 | Tight version of memcpy for the case of just copying a page. | |
9 | Prefetch strategy empirically optimised against RTL simulations | |
10 | of SH5-101 cut2 eval chip with Cayman board DDR memory. | |
11 | ||
12 | Parameters: | |
13 | r2 : source effective address (start of page) | |
14 | r3 : destination effective address (start of page) | |
15 | ||
16 | Always copies 4096 bytes. | |
17 | ||
18 | Points to review. | |
19 | * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead. | |
20 | It seems like the prefetch needs to be at least 4 lines ahead to get | |
21 | the data into the cache in time, and the allocos contend with outstanding | |
22 | prefetches for the same cache set, so it's better to have the numbers | |
23 | different. | |
24 | */ | |
25 | ||
26 | .section .text..SHmedia32,"ax" | |
27 | .little | |
28 | ||
29 | .balign 8 | |
30 | .global sh64_page_copy | |
31 | sh64_page_copy: | |
32 | ||
33 | /* Copy 4096 bytes worth of data from r2 to r3, 32 bytes per loop pass. | |
34 | Do prefetches 4 lines ahead (that code is currently inside #if 0). | |
35 | Do alloco 2 lines ahead. r3 ends up at the end of the destination page. */ | |
36 | ||
37 | pta 1f, tr1 /* tr1 -> label 1: top of the copy loop */ | |
38 | pta 2f, tr2 /* tr2 -> label 2: only referenced by the disabled prefetch skip */ | |
39 | pta 3f, tr3 /* tr3 -> label 3: entered directly when the alloco is skipped */ | |
40 | ptabs r18, tr0 /* tr0 = return target taken from r18, used by the final blink */ | |
41 | ||
42 | #if 0 | |
43 | /* TAKum03020: disabled loads of the first four source lines into r63 */ | |
44 | ld.q r2, 0x00, r63 | |
45 | ld.q r2, 0x20, r63 | |
46 | ld.q r2, 0x40, r63 | |
47 | ld.q r2, 0x60, r63 | |
48 | #endif | |
49 | alloco r3, 0x00 /* alloc the first destination line before entering the loop */ | |
50 | synco ! TAKum03020 | |
51 | alloco r3, 0x20 /* alloc the second destination line before entering the loop */ | |
52 | synco ! TAKum03020 | |
53 | ||
54 | movi 3968, r6 /* 3968 = 4096 - 4*32 */ | |
55 | add r3, r6, r6 /* r6 = dest + 3968: cutoff tested by the disabled prefetch */ | |
56 | addi r6, 64, r7 /* r7 = dest + 4032: no alloco once r3 reaches this address */ | |
57 | addi r7, 64, r8 /* r8 = dest + 4096: end of destination page, loop bound */ | |
58 | sub r2, r3, r60 /* r60 = src - dest, so r3 + r60 addresses the matching source bytes */ | |
59 | addi r60, 8, r61 /* r61 = r60 + 8: source offset for the 2nd quadword */ | |
60 | addi r61, 8, r62 /* r62 = r60 + 16: source offset for the 3rd quadword */ | |
61 | addi r62, 8, r23 /* r23 = r60 + 24: source offset for the 4th quadword */ | |
62 | addi r60, 0x80, r22 /* r22 = r60 + 128: source offset 4 lines ahead (disabled prefetch only) */ | |
63 | ||
64 | /* Minimal code size. The extra branches inside the loop don't cost much | |
65 | because they overlap with the time spent waiting for prefetches to | |
66 | complete. Each pass copies one 32-byte line; r3 is the running destination pointer. */ | |
67 | 1: | |
68 | #if 0 | |
69 | /* TAKum03020: disabled in-loop prefetch */ | |
70 | bge/u r3, r6, tr2 ! skip prefetch for last 4 lines | |
71 | ldx.q r3, r22, r63 ! prefetch 4 lines hence | |
72 | #endif | |
73 | 2: | |
74 | bge/u r3, r7, tr3 ! skip alloco for last 2 lines | |
75 | alloco r3, 0x40 ! alloc destination line 2 lines ahead | |
76 | synco ! TAKum03020 | |
77 | 3: | |
78 | ldx.q r3, r60, r36 /* load the four source quadwords at r3 + (src - dest) + 0/8/16/24 */ | |
79 | ldx.q r3, r61, r37 | |
80 | ldx.q r3, r62, r38 | |
81 | ldx.q r3, r23, r39 | |
82 | st.q r3, 0, r36 /* store them to the destination line at r3 + 0/8/16/24 */ | |
83 | st.q r3, 8, r37 | |
84 | st.q r3, 16, r38 | |
85 | st.q r3, 24, r39 | |
86 | addi r3, 32, r3 /* advance the destination pointer by one 32-byte line */ | |
87 | bgt/l r8, r3, tr1 /* back to label 1 while r8 > r3, i.e. until dest + 4096 is reached */ | |
88 | ||
89 | blink tr0, r63 ! return | |
90 | ||
91 |