Commit | Line | Data |
---|---|---|
b2441318 | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
1da177e4 LT |
2 | /* |
3 | * This routine clears to zero a linear memory buffer in user space. | |
4 | * | |
5 | * Inputs: | |
6 | * in0: address of buffer | |
7 | * in1: length of buffer in bytes | |
8 | * Outputs: | |
9 | * r8: number of bytes that didn't get cleared due to a fault | |
10 | * | |
11 | * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co | |
12 | * Stephane Eranian <eranian@hpl.hp.com> | |
13 | */ | |
14 | ||
15 | #include <asm/asmmacro.h> | |
e007c533 | 16 | #include <asm/export.h> |
1da177e4 LT |
17 | |
18 | // | |
19 | // arguments (the two inputs declared by the alloc: in0 = buffer address, in1 = byte count) | |
20 | // | |
21 | #define buf r32 | |
22 | #define len r33 | |
23 | ||
24 | // local registers: cnt = number of 16-byte loop iterations, buf2 = second store pointer (buf+8), | |
25 | // saved_lc/saved_pfs = entry values of ar.lc/ar.pfs, tmp = loop count minus one for ar.lc, | |
26 | // len2/len3 = alternating copies of the remaining length used by the tail stores | |
27 | #define cnt r16 | |
28 | #define buf2 r17 | |
29 | #define saved_lc r18 | |
30 | #define saved_pfs r19 | |
31 | #define tmp r20 | |
32 | #define len2 r21 | |
33 | #define len3 r22 | |
34 | ||
35 | // | |
36 | // Theory of operations: | |
37 | // - we check whether or not the buffer is small, i.e., less than 17 bytes, | |
38 | // in which case we do the byte by byte loop. | |
39 | // | |
40 | // - Otherwise we go progressively from 1 byte store to 8byte store in | |
41 | // the head part, the body is a 16byte store loop and we finish with the | |
42 | // tail for the last 15 bytes. | |
43 | // The good point about this breakdown is that the long buffer handling | |
44 | // contains only 2 branches. | |
45 | // | |
46 | // The reason for not using shifting & masking for both the head and the | |
47 | // tail is to stay semantically correct. This routine is not supposed | |
48 | // to write bytes outside of the buffer. While most of the time this would | |
49 | // be ok, we can't tolerate a mistake. A classical example is the case | |
50 | // of multithreaded code where the extra bytes touched are actually owned | |
51 | // by another thread which runs concurrently with ours. Another, less likely, | |
52 | // example is with device drivers where reading an I/O mapped location may | |
53 | // have side effects (same thing for writing). | |
54 | // | |
55 | ||
56 | GLOBAL_ENTRY(__do_clear_user) | |
57 | .prologue | |
58 | .save ar.pfs, saved_pfs | |
59 | alloc saved_pfs=ar.pfs,2,0,0,0 | |
60 | cmp.eq p6,p0=r0,len // check for zero length | |
61 | .save ar.lc, saved_lc | |
62 | mov saved_lc=ar.lc // preserve ar.lc (slow) | |
63 | .body | |
64 | ;; // avoid WAW on CFM | |
65 | adds tmp=-1,len // br.cloop is repeat/until: lc = iterations - 1 | |
66 | mov ret0=len // return value is length at this point | |
67 | (p6) br.ret.spnt.many rp | |
68 | ;; | |
69 | cmp.lt p6,p0=16,len // if len > 16 then long memset | |
70 | mov ar.lc=tmp // initialize lc for small count (also clobbers lc on the long path) | |
71 | (p6) br.cond.dptk .long_do_clear | |
72 | ;; // WAR on ar.lc | |
73 | // | |
74 | // worst case 16 iterations, avg 8 iterations | |
75 | // | |
76 | // We could have played with the predicates to use the extra | |
77 | // M slot for 2 stores/iteration but the cost of initializing | |
78 | // the various counters compared to how long the loop is supposed | |
79 | // to last on average does not make this solution viable. | |
80 | // | |
81 | 1: | |
82 | EX( .Lexit1, st1 [buf]=r0,1 ) | |
83 | adds len=-1,len // countdown length using len | |
84 | br.cloop.dptk 1b | |
85 | ;; // avoid RAW on ar.lc | |
86 | // | |
87 | // .Lexit1: comes from byte by byte loop (fault or normal completion) | |
88 | // len contains bytes left | |
89 | .Lexit1: | |
90 | mov ret0=len // faster than using ar.lc | |
91 | mov ar.lc=saved_lc | |
92 | br.ret.sptk.many rp // end of short clear_user | |
93 | ||
94 | ||
95 | // | |
96 | // At this point we know we have more than 16 bytes to clear | |
97 | // so we focus on alignment (no branches required) | |
98 | // | |
99 | // The use of len/len2 for countdown of the number of bytes left | |
100 | // instead of ret0 is due to the fact that the exception code | |
101 | // changes the values of r8. | |
102 | // | |
103 | .long_do_clear: | |
104 | tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear) | |
105 | ;; | |
106 | EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned | |
107 | (p6) adds len=-1,len;; // sync because buf is modified | |
108 | tbit.nz p6,p0=buf,1 | |
109 | ;; | |
110 | EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned | |
111 | (p6) adds len=-2,len;; | |
112 | tbit.nz p6,p0=buf,2 | |
113 | ;; | |
114 | EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned | |
115 | (p6) adds len=-4,len;; | |
116 | tbit.nz p6,p0=buf,3 | |
117 | ;; | |
118 | EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned | |
119 | (p6) adds len=-8,len;; | |
120 | shr.u cnt=len,4 // number of 128-bit (2x64bit) words | |
121 | ;; | |
122 | cmp.eq p6,p0=r0,cnt | |
123 | adds tmp=-1,cnt | |
124 | (p6) br.cond.dpnt .dotail // we have less than 16 bytes left | |
125 | ;; | |
126 | adds buf2=8,buf // setup second base pointer | |
127 | mov ar.lc=tmp | |
128 | ;; | |
129 | ||
130 | // | |
131 | // 16bytes/iteration core loop | |
132 | // | |
133 | // The second store can never generate a fault because | |
134 | // we come into the loop only when we are 16-byte aligned. | |
135 | // This means that if we cross a page then it will always be | |
136 | // in the first store and never in the second. | |
137 | // | |
138 | // | |
139 | // We need to keep track of the remaining length. A possible (optimistic) | |
140 | // way would be to use ar.lc and derive how many bytes were left by | |
141 | // doing : left= 16*ar.lc + 16. this would avoid the addition at | |
142 | // every iteration. | |
143 | // However we need to keep the synchronization point. A template | |
144 | // M;;MB does not exist and thus we can keep the addition at no | |
145 | // extra cycle cost (use a nop slot anyway). It also simplifies the | |
146 | // (unlikely) error recovery code | |
147 | // | |
148 | ||
149 | 2: EX(.Lexit3, st8 [buf]=r0,16 ) | |
150 | ;; // needed to get len correct when error | |
151 | st8 [buf2]=r0,16 | |
152 | adds len=-16,len | |
153 | br.cloop.dptk 2b | |
154 | ;; | |
155 | // tail correction based on len only | |
156 | // | |
157 | // We alternate the use of len3,len2 to allow parallelism and correct | |
158 | // error handling. We also reuse p6/p7 to return correct value. | |
159 | // The addition of len2/len3 does not cost anything more compared to | |
160 | // the regular memset as we had empty slots. | |
161 | // ar.lc is restored after the label, not before it: the no-loop branch | |
162 | // to .dotail also arrives with ar.lc clobbered by the small-count setup. | |
163 | .dotail: | |
164 | mov ar.lc=saved_lc // restore on both the loop and the no-loop path | |
165 | mov len2=len // for parallelization of error handling | |
166 | mov len3=len | |
167 | tbit.nz p6,p0=len,3 | |
168 | ;; | |
169 | EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes | |
170 | (p6) adds len3=-8,len2 | |
171 | tbit.nz p7,p6=len,2 | |
172 | ;; | |
173 | EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes | |
174 | (p7) adds len2=-4,len3 | |
175 | tbit.nz p6,p7=len,1 | |
176 | ;; | |
177 | EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes | |
178 | (p6) adds len3=-2,len2 | |
179 | tbit.nz p7,p6=len,0 | |
180 | ;; | |
181 | EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left | |
182 | mov ret0=r0 // success | |
183 | br.ret.sptk.many rp // end of most likely path | |
184 | ||
185 | // | |
186 | // Outlined error handling code | |
187 | // | |
188 | ||
189 | // | |
190 | // Both exits below return the number of bytes left in ret0 and | |
191 | // restore ar.lc before returning. | |
192 | // | |
193 | // | |
194 | // .Lexit2: comes from the tail stores | |
195 | // if p6 -> coming from st8 or st2 : len2 contains what's left | |
196 | // if p7 -> coming from st4 or st1 : len3 contains what's left | |
197 | // Falls through into .Lexit3 once len holds the right count. | |
198 | .Lexit2: | |
199 | .pred.rel "mutex", p6, p7 | |
200 | (p6) mov len=len2 | |
201 | (p7) mov len=len3 | |
202 | ;; | |
203 | // | |
204 | // .Lexit3: comes from head or core loop; ar.lc was loaded on entry | |
205 | // and must be restored. len contains bytes left | |
206 | // | |
207 | .Lexit3: | |
208 | mov ret0=len | |
209 | mov ar.lc=saved_lc | |
210 | br.ret.sptk.many rp | |
211 | END(__do_clear_user) | |
e007c533 | 212 | EXPORT_SYMBOL(__do_clear_user) |