[linux-2.6-block.git] / arch / x86 / lib / copy_user_64.S
/*
 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
 *
 * Functions to copy from and to user space.
 */

#include <linux/linkage.h>
#include <asm/dwarf2.h>

#define FIX_ALIGNMENT 1

#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

/*
 * By placing feature2 after feature1 in the .altinstructions section,
 * we logically implement:
 *	if the CPU has feature2, the jmp to alt2 is used;
 *	else if the CPU has feature1, the jmp to alt1 is used;
 *	else the jmp to orig is used.
 */
	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
0:
	.byte 0xe9	/* 32bit jump */
	.long \orig-1f	/* by default jump to orig */
1:
	.section .altinstr_replacement,"ax"
2:	.byte 0xe9			/* near jump with 32bit immediate */
	.long \alt1-1b /* offset */	/* or alternatively to alt1 */
3:	.byte 0xe9			/* near jump with 32bit immediate */
	.long \alt2-1b /* offset */	/* or alternatively to alt2 */
	.previous

	.section .altinstructions,"a"
	altinstruction_entry 0b,2b,\feature1,5,5
	altinstruction_entry 0b,3b,\feature2,5,5
	.previous
	.endm
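
As a rough illustration only, the run-time effect of ALTERNATIVE_JUMP is a
three-way priority selection.  The C sketch below models that order with
hypothetical has_erms/has_rep_good flags and stub copy routines; the real
kernel patches the jump once at boot through .altinstructions instead of
testing flags on every call.

#include <string.h>

/* Hypothetical flags standing in for X86_FEATURE_ERMS and
 * X86_FEATURE_REP_GOOD. */
static int has_erms, has_rep_good;

typedef unsigned long (*copy_fn)(void *to, const void *from, unsigned len);

/* Stand-ins for the three assembly routines in this file. */
static unsigned long copy_unrolled(void *to, const void *from, unsigned len)
{ memcpy(to, from, len); return 0; }
static unsigned long copy_string(void *to, const void *from, unsigned len)
{ memcpy(to, from, len); return 0; }
static unsigned long copy_erms(void *to, const void *from, unsigned len)
{ memcpy(to, from, len); return 0; }

/* Selection order encoded by ALTERNATIVE_JUMP: feature2 (ERMS) beats
 * feature1 (REP_GOOD); the original jump target is the fallback. */
static copy_fn pick_copy_user(void)
{
	if (has_erms)
		return copy_erms;
	if (has_rep_good)
		return copy_string;
	return copy_unrolled;
}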

	.macro ALIGN_DESTINATION
#ifdef FIX_ALIGNMENT
	/* check for bad alignment of destination */
	movl %edi,%ecx
	andl $7,%ecx
	jz 102f				/* already aligned */
	subl $8,%ecx
	negl %ecx
	subl %ecx,%edx
100:	movb (%rsi),%al
101:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 100b
102:
	.section .fixup,"ax"
103:	addl %ecx,%edx			/* ecx is zerorest also */
	jmp copy_user_handle_tail
	.previous

	.section __ex_table,"a"
	.align 8
	.quad 100b,103b
	.quad 101b,103b
	.previous
#endif
	.endm
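
For readers following the register math, here is a C sketch of what
ALIGN_DESTINATION computes, assuming (as its callers guarantee) that at
least 8 bytes remain; the names are illustrative, not kernel API.

#include <stdint.h>
#include <stddef.h>

/* C sketch of ALIGN_DESTINATION: byte-copy until the destination hits an
 * 8-byte boundary and charge those bytes against the remaining count.
 * Callers guarantee count >= 8, so no clamping is needed. */
static void align_destination(unsigned char **dst, const unsigned char **src,
			      size_t *count)
{
	size_t misalign = (uintptr_t)*dst & 7;		/* andl $7,%ecx */
	size_t head = misalign ? 8 - misalign : 0;	/* subl $8 / negl */

	*count -= head;					/* subl %ecx,%edx */
	while (head--)					/* the 100:/101: byte loop */
		*(*dst)++ = *(*src)++;
}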

/* Standard copy_to_user with segment limit checking */
ENTRY(_copy_to_user)
	CFI_STARTPROC
	GET_THREAD_INFO(%rax)
	movq %rdi,%rcx
	addq %rdx,%rcx
	jc bad_to_user
	cmpq TI_addr_limit(%rax),%rcx
	jae bad_to_user
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
		copy_user_generic_unrolled,copy_user_generic_string,	\
		copy_user_enhanced_fast_string
	CFI_ENDPROC
ENDPROC(_copy_to_user)

/* Standard copy_from_user with segment limit checking */
ENTRY(_copy_from_user)
	CFI_STARTPROC
	GET_THREAD_INFO(%rax)
	movq %rsi,%rcx
	addq %rdx,%rcx
	jc bad_from_user
	cmpq TI_addr_limit(%rax),%rcx
	jae bad_from_user
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
		copy_user_generic_unrolled,copy_user_generic_string,	\
		copy_user_enhanced_fast_string
	CFI_ENDPROC
ENDPROC(_copy_from_user)
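
Both entry points perform the same range check before dispatching.  A
minimal C sketch of that check, with addr_limit as a stand-in for the
TI_addr_limit field rather than a real interface:

#include <stdbool.h>
#include <stdint.h>

/* Reject the copy if addr + len wraps around (the jc path) or if the
 * end of the range reaches the task's address limit (the jae path). */
static bool user_range_ok(uint64_t addr, uint64_t len, uint64_t addr_limit)
{
	uint64_t end = addr + len;

	if (end < addr)		/* carry set: the range wrapped */
		return false;
	return end < addr_limit;
}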

	.section .fixup,"ax"
	/* must zero dest */
ENTRY(bad_from_user)
bad_from_user:
	CFI_STARTPROC
	movl %edx,%ecx
	xorl %eax,%eax
	rep
	stosb
bad_to_user:
	movl %edx,%eax
	ret
	CFI_ENDPROC
ENDPROC(bad_from_user)
	.previous
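
A C sketch of these two failure paths, which are only reached when the
range check itself fails (faults taken mid-copy go through
copy_user_handle_tail instead); it assumes %edx still holds the full
requested length at that point, as it does in the entry code above.

#include <string.h>

/* copy_from_user failure: zero-fill the kernel destination so stale
 * data is never consumed, then report everything as uncopied. */
static unsigned bad_from_user_sketch(void *to, unsigned len)
{
	memset(to, 0, len);	/* rep stosb with %eax == 0 */
	return len;		/* movl %edx,%eax */
}

/* copy_to_user failure: nothing to zero, just report the count. */
static unsigned bad_to_user_sketch(unsigned len)
{
	return len;
}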

/*
 * copy_user_generic_unrolled - memory copy with exception handling.
 * This version is for CPUs like the P4 that don't have efficient
 * microcode for rep movsq.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_generic_unrolled)
	CFI_STARTPROC
	cmpl $8,%edx
	jb 20f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz 17f
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movq %r8,(%rdi)
6:	movq %r9,1*8(%rdi)
7:	movq %r10,2*8(%rdi)
8:	movq %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movq %r8,4*8(%rdi)
14:	movq %r9,5*8(%rdi)
15:	movq %r10,6*8(%rdi)
16:	movq %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz 1b
17:	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz 20f
18:	movq (%rsi),%r8
19:	movq %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz 18b
20:	andl %edx,%edx
	jz 23f
	movl %edx,%ecx
21:	movb (%rsi),%al
22:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 21b
23:	xor %eax,%eax
	ret

	.section .fixup,"ax"
30:	shll $6,%ecx
	addl %ecx,%edx
	jmp 60f
40:	lea (%rdx,%rcx,8),%rdx
	jmp 60f
50:	movl %ecx,%edx
60:	jmp copy_user_handle_tail /* ecx is zerorest also */
	.previous

	.section __ex_table,"a"
	.align 8
	.quad 1b,30b
	.quad 2b,30b
	.quad 3b,30b
	.quad 4b,30b
	.quad 5b,30b
	.quad 6b,30b
	.quad 7b,30b
	.quad 8b,30b
	.quad 9b,30b
	.quad 10b,30b
	.quad 11b,30b
	.quad 12b,30b
	.quad 13b,30b
	.quad 14b,30b
	.quad 15b,30b
	.quad 16b,30b
	.quad 18b,40b
	.quad 19b,40b
	.quad 21b,50b
	.quad 22b,50b
	.previous
	CFI_ENDPROC
ENDPROC(copy_user_generic_unrolled)
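
To make the control flow above easier to follow, here is a C sketch of
the happy path: 64-byte blocks through eight registers, then 8-byte
words, then single bytes.  The alignment prologue and the __ex_table
fixups (which compute how many bytes were left when a fault hit) are
omitted.

#include <string.h>

/* C sketch of copy_user_generic_unrolled's main path; faults and the
 * ALIGN_DESTINATION prologue are ignored for brevity. */
static unsigned copy_unrolled_sketch(unsigned char *dst,
				     const unsigned char *src, unsigned len)
{
	while (len >= 64) {	/* labels 1:..16:, eight registers per trip */
		memcpy(dst, src, 64);
		dst += 64; src += 64; len -= 64;
	}
	while (len >= 8) {	/* labels 18:/19:, one qword per trip */
		memcpy(dst, src, 8);
		dst += 8; src += 8; len -= 8;
	}
	while (len--)		/* labels 21:/22:, byte tail */
		*dst++ = *src++;
	return 0;		/* 23: xor %eax,%eax */
}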

/* Some CPUs run faster using the string copy instructions.
 * This is also a lot simpler. Use them when possible.
 *
 * Only 4GB of copy is supported. This shouldn't be a problem
 * because the kernel normally only writes from/to page sized chunks
 * even if user space passed a longer buffer.
 * Copying more would also be dangerous because both Intel and AMD have
 * errata for rep movsq with counts above 4GB. If someone feels the need
 * to lift this limit, please take those errata into account.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_generic_string)
	CFI_STARTPROC
	andl %edx,%edx
	jz 4f
	cmpl $8,%edx
	jb 2f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	movl %edx,%ecx
	shrl $3,%ecx
	andl $7,%edx
1:	rep
	movsq
2:	movl %edx,%ecx
3:	rep
	movsb
4:	xorl %eax,%eax
	ret

	.section .fixup,"ax"
11:	lea (%rdx,%rcx,8),%rcx
12:	movl %ecx,%edx		/* ecx is zerorest also */
	jmp copy_user_handle_tail
	.previous

	.section __ex_table,"a"
	.align 8
	.quad 1b,11b
	.quad 3b,12b
	.previous
	CFI_ENDPROC
ENDPROC(copy_user_generic_string)
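
A C sketch of the split the string version performs: the bulk as len/8
qwords via rep movsq, then the remaining len%8 bytes via rep movsb.  The
fixup at label 11 converts a leftover qword count back into bytes
(remaining*8 plus the byte tail) before handing it to
copy_user_handle_tail.

#include <string.h>

/* C sketch of copy_user_generic_string's happy path; fault handling
 * is omitted. */
static unsigned copy_string_sketch(unsigned char *dst,
				   const unsigned char *src, unsigned len)
{
	unsigned qwords = len >> 3;		/* shrl $3,%ecx */
	unsigned tail   = len & 7;		/* andl $7,%edx */

	memcpy(dst, src, (size_t)qwords * 8);	/* 1: rep movsq */
	dst += (size_t)qwords * 8;
	src += (size_t)qwords * 8;
	memcpy(dst, src, tail);			/* 3: rep movsb */
	return 0;				/* 4: xorl %eax,%eax */
}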

/*
 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
 * It is recommended to use enhanced REP MOVSB/STOSB if it is enabled.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_enhanced_fast_string)
	CFI_STARTPROC
	andl %edx,%edx
	jz 2f
	movl %edx,%ecx
1:	rep
	movsb
2:	xorl %eax,%eax
	ret

	.section .fixup,"ax"
12:	movl %ecx,%edx		/* ecx is zerorest also */
	jmp copy_user_handle_tail
	.previous

	.section __ex_table,"a"
	.align 8
	.quad 1b,12b
	.previous
	CFI_ENDPROC
ENDPROC(copy_user_enhanced_fast_string)
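
With ERMS a single rep movsb is fast for all sizes, so this routine
needs neither a qword phase nor an alignment prologue; on a fault the
leftover byte count in %ecx goes straight to copy_user_handle_tail.  A
C sketch of the happy path:

#include <string.h>

/* C sketch of copy_user_enhanced_fast_string; faults are ignored. */
static unsigned copy_erms_sketch(void *dst, const void *src, unsigned len)
{
	if (len)			/* andl %edx,%edx; jz 2f */
		memcpy(dst, src, len);	/* 1: rep movsb */
	return 0;			/* 2: xorl %eax,%eax */
}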