/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>

#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)

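/* A "low area" is one 256MB segment (1 << SID_SHIFT) of the 4GB 32-bit
 * address space; a "high area" is a larger (1 << HTLB_AREA_SHIFT) block
 * above 4GB.  Each is tracked by one bit in the context's
 * low/high_htlb_areas mask and, once opened, holds only hugepage
 * mappings. */
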
/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	pte_t *pt;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = pud_offset(pg, addr);
		if (!pud_none(*pu)) {
			pm = pmd_offset(pu, addr);
#ifdef CONFIG_PPC_64K_PAGES
			/* Currently, we use the normal PTE offset within full
			 * size PTE pages, thus our huge PTEs are scattered in
			 * the PTE page and we do waste some.  We may change
			 * that in the future, but the current mechanism keeps
			 * things much simpler.
			 */
			if (!pmd_none(*pm)) {
				/* Note: pte_offset_* are all equivalent on
				 * ppc64 as we don't have HIGHMEM
				 */
				pt = pte_offset_kernel(pm, addr);
				return pt;
			}
#else /* CONFIG_PPC_64K_PAGES */
			/* On 4k pages, we put huge PTEs in the PMD page */
			pt = (pte_t *)pm;
			return pt;
#endif /* CONFIG_PPC_64K_PAGES */
		}
	}

	return NULL;
}

pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	pte_t *pt;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	pu = pud_alloc(mm, pg, addr);

	if (pu) {
		pm = pmd_alloc(mm, pu, addr);
		if (pm) {
#ifdef CONFIG_PPC_64K_PAGES
			/* See comment in huge_pte_offset.  Note that if we
			 * ever want to put the page size in the PMD, we would
			 * have to open code our own pte_alloc* function in
			 * order to populate and set the size atomically
			 */
			pt = pte_alloc_map(mm, pm, addr);
#else /* CONFIG_PPC_64K_PAGES */
			pt = (pte_t *)pm;
#endif /* CONFIG_PPC_64K_PAGES */
			return pt;
		}
	}

	return NULL;
}

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_update (huge / !huge)
		 */
		unsigned long old = pte_update(ptep, ~0UL);
		if (old & _PAGE_HASHPTE)
			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
		flush_tlb_pending();
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(ptep, ~0UL);

	if (old & _PAGE_HASHPTE)
		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
	*ptep = __pte(0);

	return __pte(old);
}

struct slb_flush_info {
	struct mm_struct *mm;
	u16 newareas;
};

static void flush_low_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */
	if (current->active_mm != fi->mm)
		return;

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_LOW_AREAS; i++) {
		if (! (fi->newareas & (1U << i)))
			continue;
		asm volatile("slbie %0"
			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

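/* Note: slbie must name the class of the SLB entry being invalidated;
 * SLBIE_C selects the class bit that user segment entries are created
 * with, so only those entries are dropped. */
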
static void flush_high_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i, j;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */
	if (current->active_mm != fi->mm)
		return;

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_HIGH_AREAS; i++) {
		if (! (fi->newareas & (1U << i)))
			continue;
		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
			asm volatile("slbie %0"
				     : : "r" (((i << HTLB_AREA_SHIFT)
					       + (j << SID_SHIFT)) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << SID_SHIFT;
	unsigned long end = (area+1) << SID_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_LOW_AREAS);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << HTLB_AREA_SHIFT;
	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_HIGH_AREAS);

	/* Hack, so that each address is controlled by exactly one
	 * of the high or low area bitmaps, the first high area starts
	 * at 4GB, not 0 */
	if (start == 0)
		start = 0x100000000UL;

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	unsigned long i;
	struct slb_flush_info fi;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);

	newareas &= ~(mm->context.low_htlb_areas);
	if (! newareas)
		return 0; /* The segments we want are already open */

	for (i = 0; i < NUM_LOW_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_low_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.low_htlb_areas |= newareas;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_low_segments, &fi, 0, 1);

	return 0;
}

static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	struct slb_flush_info fi;
	unsigned long i;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
		     != NUM_HIGH_AREAS);

	newareas &= ~(mm->context.high_htlb_areas);
	if (! newareas)
		return 0; /* The areas we want are already open */

	for (i = 0; i < NUM_HIGH_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_high_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.high_htlb_areas |= newareas;

	/* update the paca copy of the context struct */
	get_paca()->context = mm->context;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_high_segments, &fi, 0, 1);

	return 0;
}

int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
	int err = 0;

	if ((addr + len) < addr)
		return -EINVAL;

	if (addr < 0x100000000UL)
		err = open_low_hpage_areas(current->mm,
					   LOW_ESID_MASK(addr, len));
	if ((addr + len) > 0x100000000UL)
		err = open_high_hpage_areas(current->mm,
					    HTLB_AREA_MASK(addr, len));
	if (err) {
		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
		       addr, len,
		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
		return err;
	}

	return 0;
}

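/* Example (assuming LOW_ESID_MASK sets one bit per 256MB segment the
 * range [addr, addr+len) touches): a 16MB mapping at 0x30000000 lies
 * entirely within segment 3, so LOW_ESID_MASK(addr, len) == 0x0008 and
 * only that one low area needs to be opened. */
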
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (! in_hugepage_area(mm->context, address))
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page)
		page += (address % HPAGE_SIZE) / PAGE_SIZE;

	return page;
}

int pmd_huge(pmd_t pmd)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}

/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
				     unsigned long len, unsigned long pgoff,
				     unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > TASK_SIZE)
		return -ENOMEM;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (((TASK_SIZE - len) >= addr)
		    && (!vma || (addr + len) <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr, len))
			return addr;
	}
	if (len > mm->cached_hole_size) {
		start_addr = addr = mm->free_area_cache;
	} else {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

full_search:
	vma = find_vma(mm, addr);
	while (TASK_SIZE - len >= addr) {
		BUG_ON(vma && (addr >= vma->vm_end));

		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (touches_hugepage_high_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (!vma || addr + len <= vma->vm_start) {
			/*
			 * Remember the place where we stopped the search:
			 */
			mm->free_area_cache = addr + len;
			return addr;
		}
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = vma->vm_end;
		vma = vma->vm_next;
	}

	/* Make sure we didn't miss any holes */
	if (start_addr != TASK_UNMAPPED_BASE) {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
		goto full_search;
	}
	return -ENOMEM;
}

/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			       const unsigned long len, const unsigned long pgoff,
			       const unsigned long flags)
{
	struct vm_area_struct *vma, *prev_vma;
	struct mm_struct *mm = current->mm;
	unsigned long base = mm->mmap_base, addr = addr0;
	unsigned long largest_hole = mm->cached_hole_size;
	int first_time = 1;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* don't allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	/* requesting a specific address */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr, len))
			return addr;
	}

	if (len <= largest_hole) {
		largest_hole = 0;
		mm->free_area_cache = base;
	}
try_again:
	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or can't fit in requested address hole */
	addr = (mm->free_area_cache - len) & PAGE_MASK;
	do {
hugepage_recheck:
		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = (addr & ((~0) << SID_SHIFT)) - len;
			goto hugepage_recheck;
		} else if (touches_hugepage_high_range(mm, addr, len)) {
			addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
			goto hugepage_recheck;
		}

		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
			return addr;

		/*
		 * new region fits between prev_vma->vm_end and
		 * vma->vm_start, use it:
		 */
		if (addr+len <= vma->vm_start &&
		    (!prev_vma || (addr >= prev_vma->vm_end))) {
			/* remember the address as a hint for next time */
			mm->cached_hole_size = largest_hole;
			return (mm->free_area_cache = addr);
		} else {
			/* pull free_area_cache down to the first hole */
			if (mm->free_area_cache == vma->vm_end) {
				mm->free_area_cache = vma->vm_start;
				mm->cached_hole_size = largest_hole;
			}
		}

		/* remember the largest hole we saw so far */
		if (addr + largest_hole < vma->vm_start)
			largest_hole = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = vma->vm_start-len;
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (first_time) {
		mm->free_area_cache = base;
		largest_hole = 0;
		first_time = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;
	mm->cached_hole_size = ~0UL;

	return addr;
}

static int htlb_check_hinted_area(unsigned long addr, unsigned long len)
{
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	if (!vma || ((addr + len) <= vma->vm_start))
		return 0;

	return -ENOMEM;
}

static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
	unsigned long addr = 0;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= 0x100000000UL) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_low_range(addr, len, segmask)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on segmask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
{
	unsigned long addr = 0x100000000UL;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= TASK_SIZE_USER64) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_high_range(addr, len, areamask)) {
			addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on areamask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}

unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	int lastshift;
	u16 areamask, curareas;

	if (HPAGE_SHIFT == 0)
		return -EINVAL;
	if (len & ~HPAGE_MASK)
		return -EINVAL;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -EINVAL;

	/* Paranoia, caller should have dealt with this */
	BUG_ON((addr + len) < addr);

	if (test_thread_flag(TIF_32BIT)) {
		/* Paranoia, caller should have dealt with this */
		BUG_ON((addr + len) > 0x100000000UL);

		curareas = current->mm->context.low_htlb_areas;

		/* First see if we can use the hint address */
		if (addr && (htlb_check_hinted_area(addr, len) == 0)) {
			areamask = LOW_ESID_MASK(addr, len);
			if (open_low_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}

		/* Next see if we can map in the existing low areas */
		addr = htlb_get_low_area(len, curareas);
		if (addr != -ENOMEM)
			return addr;

		/* Finally go looking for areas to open */
		lastshift = 0;
		for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
		     ! lastshift; areamask >>= 1) {
			if (areamask & 1)
				lastshift = 1;

			addr = htlb_get_low_area(len, curareas | areamask);
			if ((addr != -ENOMEM)
			    && open_low_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}
	} else {
		curareas = current->mm->context.high_htlb_areas;

		/* First see if we can use the hint address */
		/* We discourage 64-bit processes from doing hugepage
		 * mappings below 4GB (must use MAP_FIXED) */
		if ((addr >= 0x100000000UL)
		    && (htlb_check_hinted_area(addr, len) == 0)) {
			areamask = HTLB_AREA_MASK(addr, len);
			if (open_high_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}

		/* Next see if we can map in the existing high areas */
		addr = htlb_get_high_area(len, curareas);
		if (addr != -ENOMEM)
			return addr;

		/* Finally go looking for areas to open */
		lastshift = 0;
		for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
		     ! lastshift; areamask >>= 1) {
			if (areamask & 1)
				lastshift = 1;

			addr = htlb_get_high_area(len, curareas | areamask);
			if ((addr != -ENOMEM)
			    && open_high_hpage_areas(current->mm, areamask) == 0)
				return addr;
		}
	}
	printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
	       " enough areas\n");
	return -ENOMEM;
}

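/* Illustrative only (not part of this file): from userspace these
 * paths are driven by mmap() on a hugetlbfs file.  Assuming hugetlbfs
 * is mounted at /mnt/huge, something like:
 *
 *	fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
 *	p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
 *		 MAP_SHARED, fd, 0);
 *
 * ends up in hugetlb_get_unmapped_area() above, which picks (and if
 * necessary opens) a low area for a 32-bit task, or a high area above
 * 4GB for a 64-bit task. */
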
/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
						  pte_t pte, int trap)
{
	struct page *page;
	int i;

	if (!pfn_valid(pte_pfn(pte)))
		return rflags;

	page = pte_page(pte);

	/* page is dirty */
	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
		if (trap == 0x400) {
			/* instruction fault: flush the icache for each
			 * subpage and mark the page clean for execution */
			for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
				__flush_dcache_icache(page_address(page+i));
			set_bit(PG_arch_1, &page->flags);
		} else {
			rflags |= HPTE_R_N;
		}
	}
	return rflags;
}

int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local,
		   unsigned long trap)
{
	pte_t *ptep;
	unsigned long old_pte, new_pte;
	unsigned long va, rflags, pa;
	long slot;
	int err = 1;

	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = (vsid << 28) | (ea & 0x0fffffff);

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE. There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE. The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

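	/* Lock the Linux PTE: _PAGE_BUSY acts as a per-PTE lock, so a
	 * concurrent hash fault on another CPU backs off (returns to
	 * do_page_fault) until the update below is complete. */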
	do {
		old_pte = pte_val(*ptep);
		if (old_pte & _PAGE_BUSY)
			goto out;
		new_pte = old_pte | _PAGE_BUSY |
			_PAGE_ACCESSED | _PAGE_HASHPTE;
	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
					 old_pte, new_pte));

	rflags = 0x2 | (!(new_pte & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
		/* No CPU has hugepages but lacks no execute, so we
		 * don't need to worry about that case */
		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
						       trap);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(old_pte & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(va, HPAGE_SHIFT);
		if (old_pte & _PAGE_F_SECOND)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (old_pte & _PAGE_F_GIX) >> 12;

		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
					 local) == -1)
			old_pte &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(old_pte & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(va, HPAGE_SHIFT);
		unsigned long hpte_group;

		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* clear HPTE slot information in new PTE */
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;

		/* Add in WIMG bits */
		/* XXX We should store these in the pte */
		/* --BenH: I think they are ... */
		rflags |= _PAGE_COHERENT;

		/* Insert into the hash table, primary slot */
		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
					  mmu_huge_psize);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			new_pte |= _PAGE_F_SECOND;
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
						  HPTE_V_SECONDARY,
						  mmu_huge_psize);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP) & ~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		new_pte |= (slot << 12) & _PAGE_F_GIX;
	}

	/*
	 * No need to use ldarx/stdcx here
	 */
	*ptep = __pte(new_pte & ~_PAGE_BUSY);

	err = 0;

out:
	return err;
}