Commit | Line | Data |
---|---|---|
1da177e4 | 1 | /* Copyright 2002 Andi Kleen */ |
038b0a6d | 2 | |
8d379dad | 3 | #include <linux/linkage.h> |
8d379dad | 4 | #include <asm/cpufeature.h> |
101068c1 | 5 | #include <asm/alternative-asm.h> |
8d379dad | 6 | |
e0bc8d17 BP |
7 | /* |
8 | * We build a jump to memcpy_orig by default which gets NOPped out on | |
9 | * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which | |
10 | * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs | |
11 | * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. | |
12 | */ | |
13 | ||
14 | .weak memcpy | |
15 | ||
1da177e4 LT |
16 | /* |
17 | * memcpy - Copy a memory block. | |
18 | * | |
f3b6eaf0 IM |
19 | * Input: |
20 | * rdi destination | |
21 | * rsi source | |
22 | * rdx count | |
23 | * | |
1da177e4 LT |
24 | * Output: |
25 | * rax original destination | |
f3b6eaf0 | 26 | */ |
e0bc8d17 BP |
27 | ENTRY(__memcpy) |	# default memcpy: bulk qword copy via rep movsq, then byte tail
28 | ENTRY(memcpy) | |
29 | ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ | |
30 | "jmp memcpy_erms", X86_FEATURE_ERMS | |	# patched at boot: REP_GOOD CPUs fall through here; ERMS CPUs jmp to memcpy_erms
1da177e4 | 31 | |
f3b6eaf0 | 32 | movq %rdi, %rax |	# rax = original destination (return value, per header comment)
2ab56091 JB |
33 | movq %rdx, %rcx |	# rcx = byte count
34 | shrq $3, %rcx | |	# rcx = whole-qword count (count / 8)
f3b6eaf0 | 35 | andl $7, %edx |	# edx = tail length (count % 8)
8d379dad | 36 | rep movsq |	# copy rcx qwords from (%rsi) to (%rdi); advances rsi/rdi
f3b6eaf0 | 37 | movl %edx, %ecx |	# ecx = 0..7 remaining bytes
8d379dad JB |
38 | rep movsb |	# copy the byte tail
39 | ret | |
e0bc8d17 BP |
40 | ENDPROC(memcpy) |
41 | ENDPROC(__memcpy) | |
8d379dad | 42 | |
101068c1 | 43 | /* |
e0bc8d17 BP |
44 | * memcpy_erms() - enhanced fast string memcpy. This is faster and |
45 | * simpler than memcpy. Use memcpy_erms when possible. | |
101068c1 | 46 | */ |
e0bc8d17 | 47 | ENTRY(memcpy_erms) |	# ERMS path: one rep movsb handles the whole copy
101068c1 | 48 | movq %rdi, %rax |	# rax = original destination (return value)
2ab56091 | 49 | movq %rdx, %rcx |	# rcx = full byte count
101068c1 FY |
50 | rep movsb |	# enhanced REP MOVSB: microcode copies rcx bytes
51 | ret | |
e0bc8d17 | 52 | ENDPROC(memcpy_erms) |
393f203f | 53 | |
e0bc8d17 | 54 | ENTRY(memcpy_orig) |	# fallback memcpy for CPUs with neither REP_GOOD nor ERMS
59daa706 | 55 | movq %rdi, %rax |	# rax = original destination (return value)
7bcd3f34 | 56 | |
2ab56091 | 57 | cmpq $0x20, %rdx |	# fewer than 32 bytes? skip the unrolled loops
59daa706 | 58 | jb .Lhandle_tail |
7bcd3f34 | 59 | |
f3b6eaf0 | 60 | /* |
9de4966a | 61 | * We check whether memory false dependence could occur, |
59daa706 | 62 | * then jump to corresponding copy mode. |
f3b6eaf0 | 63 | */ |
59daa706 ML |
64 | cmp %dil, %sil |	# compare only the low bytes of dst/src addresses
65 | jl .Lcopy_backward | |	# NOTE(review): signed byte compare — a store-forwarding heuristic, not a full overlap check; intent per comment above
2ab56091 | 66 | subq $0x20, %rdx |	# pre-bias count so the sub at the loop top sets CF when < 0x20 remain
59daa706 ML |
67 | .Lcopy_forward_loop: |
68 | subq $0x20, %rdx | |	# consume 32 bytes; CF set here decides jae below
7bcd3f34 | 69 | |
f3b6eaf0 | 70 | /* |
59daa706 | 71 | * Move in blocks of 4x8 bytes: |
f3b6eaf0 | 72 | */ |
59daa706 ML |
73 | movq 0*8(%rsi), %r8 |
74 | movq 1*8(%rsi), %r9 | |
75 | movq 2*8(%rsi), %r10 | |
76 | movq 3*8(%rsi), %r11 | |
77 | leaq 4*8(%rsi), %rsi | |	# lea advances rsi without touching flags
78 | ||
79 | movq %r8, 0*8(%rdi) | |
80 | movq %r9, 1*8(%rdi) | |
81 | movq %r10, 2*8(%rdi) | |
82 | movq %r11, 3*8(%rdi) | |
83 | leaq 4*8(%rdi), %rdi | |	# flags still from the subq at the loop top
84 | jae .Lcopy_forward_loop | |	# loop while the biased count has not gone below zero
2ab56091 | 85 | addl $0x20, %edx |	# undo the bias: edx = remaining tail (0..0x1f)
59daa706 ML |
86 | jmp .Lhandle_tail |
87 | ||
88 | .Lcopy_backward: | |
89 | /* | |
90 | * Calculate copy position to tail. | |
91 | */ | |
92 | addq %rdx, %rsi | |	# rsi/rdi point one past the end; loop copies downward
93 | addq %rdx, %rdi | |
94 | subq $0x20, %rdx | |	# same pre-bias trick as the forward path
95 | /* | |
96 | * At most 3 ALU operations in one cycle, | |
d50ba368 | 97 | * so append NOPS in the same 16 bytes chunk. |
59daa706 ML |
98 | */ |
99 | .p2align 4 | |	# align the hot loop head to 16 bytes
100 | .Lcopy_backward_loop: | |
101 | subq $0x20, %rdx | |	# consume 32 bytes; CF decides jae below
102 | movq -1*8(%rsi), %r8 | |
103 | movq -2*8(%rsi), %r9 | |
104 | movq -3*8(%rsi), %r10 | |
105 | movq -4*8(%rsi), %r11 | |
106 | leaq -4*8(%rsi), %rsi | |	# lea preserves flags
107 | movq %r8, -1*8(%rdi) | |
108 | movq %r9, -2*8(%rdi) | |
109 | movq %r10, -3*8(%rdi) | |
110 | movq %r11, -4*8(%rdi) | |
111 | leaq -4*8(%rdi), %rdi | |
112 | jae .Lcopy_backward_loop | |	# tests the subq at the loop top
7bcd3f34 | 113 | |
59daa706 ML |
114 | /* |
115 | * Calculate copy position to head. | |
116 | */ | |
2ab56091 | 117 | addl $0x20, %edx |	# undo bias: edx = remaining tail (0..0x1f)
59daa706 ML |
118 | subq %rdx, %rsi |	# rewind pointers to the uncopied head
119 | subq %rdx, %rdi | |
7bcd3f34 | 120 | .Lhandle_tail: |	# here: edx = 0..31 bytes left; tails use overlapping loads/stores
2ab56091 | 121 | cmpl $16, %edx |
59daa706 | 122 | jb .Lless_16bytes |
f3b6eaf0 | 123 | |
59daa706 ML |
124 | /* |
125 | * Move data from 16 bytes to 31 bytes. | |
126 | */ | |	# two qwords from each end; middle overlap is harmless
127 | movq 0*8(%rsi), %r8 | |
128 | movq 1*8(%rsi), %r9 | |
129 | movq -2*8(%rsi, %rdx), %r10 | |
130 | movq -1*8(%rsi, %rdx), %r11 | |
131 | movq %r8, 0*8(%rdi) | |
132 | movq %r9, 1*8(%rdi) | |
133 | movq %r10, -2*8(%rdi, %rdx) | |
134 | movq %r11, -1*8(%rdi, %rdx) | |
135 | retq | |
7bcd3f34 | 136 | .p2align 4 |
59daa706 | 137 | .Lless_16bytes: |
2ab56091 | 138 | cmpl $8, %edx |
59daa706 ML |
139 | jb .Lless_8bytes |
140 | /* | |
141 | * Move data from 8 bytes to 15 bytes. | |
142 | */ | |	# one qword from each end, possibly overlapping
143 | movq 0*8(%rsi), %r8 | |
144 | movq -1*8(%rsi, %rdx), %r9 | |
145 | movq %r8, 0*8(%rdi) | |
146 | movq %r9, -1*8(%rdi, %rdx) | |
147 | retq | |
148 | .p2align 4 | |
149 | .Lless_8bytes: | |
2ab56091 | 150 | cmpl $4, %edx |
59daa706 | 151 | jb .Lless_3bytes |
f3b6eaf0 | 152 | |
59daa706 ML |
153 | /* |
154 | * Move data from 4 bytes to 7 bytes. | |
155 | */ | |	# one dword from each end, possibly overlapping
156 | movl (%rsi), %ecx | |
157 | movl -4(%rsi, %rdx), %r8d | |
158 | movl %ecx, (%rdi) | |
159 | movl %r8d, -4(%rdi, %rdx) | |
160 | retq | |
7bcd3f34 | 161 | .p2align 4 |
59daa706 | 162 | .Lless_3bytes: |
9d8e2277 JB |
163 | subl $1, %edx |	# edx was 0..3; borrow (CF) means count was 0
164 | jb .Lend | |	# nothing to copy
59daa706 ML |
165 | /* |
166 | * Move data from 1 byte to 3 bytes. | |
167 | */ | |
9d8e2277 JB |
168 | movzbl (%rsi), %ecx |	# first byte; movzx does not touch flags
169 | jz .Lstore_1byte | |	# ZF still from subl above: count was exactly 1
170 | movzbq 1(%rsi), %r8 |	# second byte, and ...
171 | movzbq (%rsi, %rdx), %r9 | |	# ... last byte (rdx = count-1); overlap with byte 1 is harmless
172 | movb %r8b, 1(%rdi) | |
173 | movb %r9b, (%rdi, %rdx) | |
174 | .Lstore_1byte: | |
175 | movb %cl, (%rdi) | |	# store the first byte last (safe: loads already done)
7bcd3f34 | 176 | |
f3b6eaf0 | 177 | .Lend: |
59daa706 | 178 | retq |
e0bc8d17 | 179 | ENDPROC(memcpy_orig) |