/*
 * Lockless get_user_pages_fast for powerpc
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#undef DEBUG

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/vmstat.h>
#include <linux/pagemap.h>
#include <linux/rwsem.h>
#include <asm/pgtable.h>

/*
 * The performance-critical leaf functions are made noinline, otherwise gcc
 * inlines everything into a single function, which results in too much
 * register pressure.
 */
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        unsigned long mask, result;
        pte_t *ptep;

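        /*
         * Build the expected pte bits once: the pte must be present and
         * user-accessible (and writable, for a write).  _PAGE_SPECIAL is
         * folded into the mask but not into the expected result, so the
         * single compare below also rejects "special" ptes, which have
         * no struct page to pin.
         */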
        result = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                result |= _PAGE_RW;
        mask = result | _PAGE_SPECIAL;

        ptep = pte_offset_kernel(&pmd, addr);
        do {
                pte_t pte = *ptep;
                struct page *page;

                if ((pte_val(pte) & mask) != result)
                        return 0;
                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);
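                /*
                 * Grab a reference without holding the pte lock; this
                 * can fail only if the page's refcount is already zero,
                 * i.e. the page is being freed under us.
                 */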
                if (!page_cache_get_speculative(page))
                        return 0;
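                /*
                 * Re-read the pte now that we hold a reference: if it
                 * changed while we were acquiring the reference, the
                 * page may already have been unmapped, so drop the ref
                 * and give up (the caller falls back to the slow path).
                 */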
                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                        put_page(page);
                        return 0;
                }
                pages[*nr] = page;
                (*nr)++;

        } while (ptep++, addr += PAGE_SIZE, addr != end);

        return 1;
}

#ifdef CONFIG_HUGETLB_PAGE
static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
                unsigned long *addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long mask;
        unsigned long pte_end;
        struct page *head, *page;
        pte_t pte;
        int refs;

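        /*
         * One call handles at most one huge pte: clamp 'end' to the end
         * of the huge page that contains *addr.
         */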
        pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
        if (pte_end < end)
                end = pte_end;

        pte = *ptep;
        mask = _PAGE_PRESENT|_PAGE_USER;
        if (write)
                mask |= _PAGE_RW;
        if ((pte_val(pte) & mask) != mask)
                return 0;
        /* hugepages are never "special" */
        VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

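        /*
         * Record each constituent small page in the output array while
         * counting how many references we will need on the head page.
         */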
        refs = 0;
        head = pte_page(pte);
        page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
        do {
                VM_BUG_ON(compound_head(page) != head);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (*addr += PAGE_SIZE, *addr != end);

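        /*
         * Take all the references in one go on the compound head, then
         * re-check the pte as in gup_pte_range(): if it changed under
         * us, undo everything we took and let the slow path sort it out.
         */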
        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }
        if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                /* Drop the refs we took on the head page and fail */
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        return 1;
}
#endif /* CONFIG_HUGETLB_PAGE */

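/*
 * The mid-level walkers read each page-table entry once into a local
 * variable and never re-read it; an entry that is (or becomes) none
 * simply fails the walk, and the caller falls back to the slow path.
 */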
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pmd_t *pmdp;

        pmdp = pmd_offset(&pud, addr);
        do {
                pmd_t pmd = *pmdp;

                next = pmd_addr_end(addr, end);
                if (pmd_none(pmd))
                        return 0;
                if (!gup_pte_range(pmd, addr, next, write, pages, nr))
                        return 0;
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        pudp = pud_offset(&pgd, addr);
        do {
                pud_t pud = *pudp;

                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
                if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);

        return 1;
}

int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next;
        pgd_t *pgdp;
        int psize, nr = 0;
        unsigned int shift;

        pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages,
                 write ? "write" : "read");

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
        end = start + len;

        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                                        start, len)))
                goto slow_irqon;

        pr_debug(" aligned: %lx .. %lx\n", start, end);

#ifdef CONFIG_HUGETLB_PAGE
        /*
         * When hugetlb is enabled, we bail out on slice boundary
         * crossings so that we never have to deal with two different
         * page-table formats in a single walk (each slice has a single
         * page size).
         */
        if (addr < SLICE_LOW_TOP) {
                if (end > SLICE_LOW_TOP)
                        goto slow_irqon;

                if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
                             GET_LOW_SLICE_INDEX(end - 1)))
                        goto slow_irqon;
        } else {
                if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
                             GET_HIGH_SLICE_INDEX(end - 1)))
                        goto slow_irqon;
        }
#endif /* CONFIG_HUGETLB_PAGE */

        /*
         * XXX: batch / limit 'nr', to avoid large irq off latency.
         * Needs some instrumenting to determine the common sizes used by
         * important workloads (eg. DB2), and whether limiting the batch
         * size will decrease performance.
         *
         * It seems like we're in the clear for the moment. Direct-IO is
         * the main guy that batches up lots of get_user_pages, and even
         * they are limited to 64-at-a-time which is not so many.
         */
        /*
         * This doesn't prevent pagetable teardown, but does prevent
         * the pagetables from being freed on powerpc.
         *
         * So long as we atomically load page table pointers versus teardown,
         * we can follow the address down to the page and take a ref on it.
         */
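        /*
         * (The powerpc code of this era batches freed page-table pages
         * and releases them via RCU, so an interrupts-off section is
         * presumed to act as the read-side critical section here.)
         */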
        local_irq_disable();

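        /*
         * With hugetlb enabled, the slice checks above ensure the whole
         * range uses a single page size; without it, every slice has
         * the base page size.  Either way, one psize/shift lookup is
         * valid for the whole walk (the VM_BUG_ONs below re-check this).
         */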
        psize = get_slice_psize(mm, addr);
        shift = mmu_psize_defs[psize].shift;

#ifdef CONFIG_HUGETLB_PAGE
        if (unlikely(mmu_huge_psizes[psize])) {
                pte_t *ptep;
                unsigned long a = addr;
                unsigned long sz = 1UL << shift;
                struct hstate *hstate = size_to_hstate(sz);

                BUG_ON(!hstate);
                /*
                 * XXX: could be optimized to avoid hstate
                 * lookup entirely (just use shift)
                 */

                do {
                        VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
                        ptep = huge_pte_offset(mm, a);
                        pr_debug(" %016lx: huge ptep %p\n", a, ptep);
                        if (!ptep || !gup_huge_pte(ptep, hstate, &a, end,
                                                   write, pages, &nr))
                                goto slow;
                } while (a != end);
        } else
#endif /* CONFIG_HUGETLB_PAGE */
        {
                pgdp = pgd_offset(mm, addr);
                do {
                        pgd_t pgd = *pgdp;

                        VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
                        pr_debug(" %016lx: normal pgd %p\n", addr,
                                 (void *)pgd_val(pgd));
                        next = pgd_addr_end(addr, end);
                        if (pgd_none(pgd))
                                goto slow;
                        if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                                goto slow;
                } while (pgdp++, addr = next, addr != end);
        }
        local_irq_enable();

        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
        return nr;

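        /*
         * Slow path: retry the remainder with the regular, mmap_sem-
         * protected get_user_pages().  The bare braces only scope 'ret'
         * so the labels can live after the fast-path return above.
         */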
        {
                int ret;

slow:
                local_irq_enable();
slow_irqon:
                pr_debug(" slow path ! nr = %d\n", nr);

                /* Try to get the remaining pages with get_user_pages */
                start += nr << PAGE_SHIFT;
                pages += nr;

                down_read(&mm->mmap_sem);
                ret = get_user_pages(current, mm, start,
                        (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
                up_read(&mm->mmap_sem);

                /*
                 * Have to be a bit careful with return values: if the
                 * fast path already pinned some pages, report those even
                 * when the slow path fails; otherwise return its error.
                 */
                if (nr > 0) {
                        if (ret < 0)
                                ret = nr;
                        else
                                ret += nr;
                }

                return ret;
        }
}