Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * PPC64 (POWER4) Huge TLB Page Support for Kernel. | |
3 | * | |
4 | * Copyright (C) 2003 David Gibson, IBM Corporation. | |
5 | * | |
6 | * Based on the IA-32 version: | |
7 | * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> | |
8 | */ | |
9 | ||
10 | #include <linux/init.h> | |
11 | #include <linux/fs.h> | |
12 | #include <linux/mm.h> | |
13 | #include <linux/hugetlb.h> | |
14 | #include <linux/pagemap.h> | |
15 | #include <linux/smp_lock.h> | |
16 | #include <linux/slab.h> | |
17 | #include <linux/err.h> | |
18 | #include <linux/sysctl.h> | |
19 | #include <asm/mman.h> | |
20 | #include <asm/pgalloc.h> | |
21 | #include <asm/tlb.h> | |
22 | #include <asm/tlbflush.h> | |
23 | #include <asm/mmu_context.h> | |
24 | #include <asm/machdep.h> | |
25 | #include <asm/cputable.h> | |
29 ||
c594adad DG |
30 | #define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) |
31 | #define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) | |
32 | ||
e28f7faf DG |
33 | /* Modelled after find_linux_pte() */ |
34 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |
1da177e4 | 35 | { |
e28f7faf DG |
36 | pgd_t *pg; |
37 | pud_t *pu; | |
38 | pmd_t *pm; | |
39 | pte_t *pt; | |
1da177e4 | 40 | |
e28f7faf | 41 | BUG_ON(! in_hugepage_area(mm->context, addr)); |
1da177e4 | 42 | |
e28f7faf DG |
43 | addr &= HPAGE_MASK; |
44 | ||
45 | pg = pgd_offset(mm, addr); | |
46 | if (!pgd_none(*pg)) { | |
47 | pu = pud_offset(pg, addr); | |
48 | if (!pud_none(*pu)) { | |
49 | pm = pmd_offset(pu, addr); | |
3c726f8d BH |
50 | #ifdef CONFIG_PPC_64K_PAGES |
51 | /* Currently, we use the normal PTE offset within full | |
52 | * size PTE pages, thus our huge PTEs are scattered in | |
53 | * the PTE page and we do waste some. We may change | |
54 | * that in the future, but the current mechanism keeps | |
55 | * things much simpler. | |
56 | */ | |
57 | if (!pmd_none(*pm)) { | |
58 | /* Note: pte_offset_* are all equivalent on | |
59 | * ppc64 as we don't have HIGHMEM | |
60 | */ | |
61 | pt = pte_offset_kernel(pm, addr); | |
62 | return pt; | |
63 | } | |
64 | #else /* CONFIG_PPC_64K_PAGES */ | |
65 | /* On 4k pages, we put huge PTEs in the PMD page */ | |
e28f7faf | 66 | pt = (pte_t *)pm; |
e28f7faf | 67 | return pt; |
3c726f8d | 68 | #endif /* CONFIG_PPC_64K_PAGES */ |
e28f7faf DG |
69 | } |
70 | } | |
1da177e4 | 71 | |
e28f7faf | 72 | return NULL; |
1da177e4 LT |
73 | } |
74 | ||
e28f7faf | 75 | pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) |
1da177e4 | 76 | { |
e28f7faf DG |
77 | pgd_t *pg; |
78 | pud_t *pu; | |
79 | pmd_t *pm; | |
80 | pte_t *pt; | |
1da177e4 | 81 | |
1da177e4 LT |
82 | BUG_ON(! in_hugepage_area(mm->context, addr)); |
83 | ||
e28f7faf | 84 | addr &= HPAGE_MASK; |
1da177e4 | 85 | |
e28f7faf DG |
86 | pg = pgd_offset(mm, addr); |
87 | pu = pud_alloc(mm, pg, addr); | |
1da177e4 | 88 | |
e28f7faf DG |
89 | if (pu) { |
90 | pm = pmd_alloc(mm, pu, addr); | |
91 | if (pm) { | |
3c726f8d BH |
92 | #ifdef CONFIG_PPC_64K_PAGES |
93 | /* See comment in huge_pte_offset. Note that if we ever | |
94 | * want to put the page size in the PMD, we would have | |
95 | * to open code our own pte_alloc* function in order | |
96 | * to populate and set the size atomically | |
97 | */ | |
98 | pt = pte_alloc_map(mm, pm, addr); | |
99 | #else /* CONFIG_PPC_64K_PAGES */ | |
e28f7faf | 100 | pt = (pte_t *)pm; |
3c726f8d | 101 | #endif /* CONFIG_PPC_64K_PAGES */ |
e28f7faf | 102 | return pt; |
1da177e4 LT |
103 | } |
104 | } | |
105 | ||
e28f7faf | 106 | return NULL; |
1da177e4 LT |
107 | } |
108 | ||
e28f7faf DG |
109 | void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, |
110 | pte_t *ptep, pte_t pte) | |
111 | { | |
e28f7faf | 112 | if (pte_present(*ptep)) { |
3c726f8d BH |
113 | /* We open-code pte_clear because we need to pass the right |
114 | * argument to hpte_update (huge / !huge) | |
115 | */ | |
116 | unsigned long old = pte_update(ptep, ~0UL); | |
117 | if (old & _PAGE_HASHPTE) | |
118 | hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1); | |
e28f7faf DG |
119 | flush_tlb_pending(); |
120 | } | |
3c726f8d | 121 | *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); |
1da177e4 LT |
122 | } |
123 | ||
e28f7faf DG |
124 | pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, |
125 | pte_t *ptep) | |
1da177e4 | 126 | { |
e28f7faf | 127 | unsigned long old = pte_update(ptep, ~0UL); |
1da177e4 | 128 | |
e28f7faf | 129 | if (old & _PAGE_HASHPTE) |
3c726f8d BH |
130 | hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1); |
131 | *ptep = __pte(0); | |
1da177e4 | 132 | |
e28f7faf | 133 | return __pte(old); |
1da177e4 LT |
134 | } |
135 | ||
1da177e4 LT |
136 | /* |
137 | * This function checks for proper alignment of input addr and len parameters. | |
138 | */ | |
139 | int is_aligned_hugepage_range(unsigned long addr, unsigned long len) | |
140 | { | |
141 | if (len & ~HPAGE_MASK) | |
142 | return -EINVAL; | |
143 | if (addr & ~HPAGE_MASK) | |
144 | return -EINVAL; | |
145 | if (! (within_hugepage_low_range(addr, len) | |
146 | || within_hugepage_high_range(addr, len)) ) | |
147 | return -EINVAL; | |
148 | return 0; | |
149 | } | |
150 | ||
c594adad | 151 | static void flush_low_segments(void *parm) |
1da177e4 | 152 | { |
c594adad | 153 | u16 areas = (unsigned long) parm; |
1da177e4 LT |
154 | unsigned long i; |
155 | ||
156 | asm volatile("isync" : : : "memory"); | |
157 | ||
c594adad DG |
158 | BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS); |
159 | ||
160 | for (i = 0; i < NUM_LOW_AREAS; i++) { | |
161 | if (! (areas & (1U << i))) | |
1da177e4 | 162 | continue; |
14b34661 DG |
163 | asm volatile("slbie %0" |
164 | : : "r" ((i << SID_SHIFT) | SLBIE_C)); | |
1da177e4 LT |
165 | } |
166 | ||
167 | asm volatile("isync" : : : "memory"); | |
168 | } | |
169 | ||
c594adad DG |
170 | static void flush_high_segments(void *parm) |
171 | { | |
172 | u16 areas = (unsigned long) parm; | |
173 | unsigned long i, j; | |
174 | ||
175 | asm volatile("isync" : : : "memory"); | |
176 | ||
177 | BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS); | |
178 | ||
179 | for (i = 0; i < NUM_HIGH_AREAS; i++) { | |
180 | if (! (areas & (1U << i))) | |
181 | continue; | |
182 | for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++) | |
183 | asm volatile("slbie %0" | |
14b34661 DG |
184 | :: "r" (((i << HTLB_AREA_SHIFT) |
185 | + (j << SID_SHIFT)) | SLBIE_C)); | |
c594adad DG |
186 | } |
187 | ||
188 | asm volatile("isync" : : : "memory"); | |
189 | } | |
190 | ||
191 | static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area) | |
1da177e4 | 192 | { |
c594adad DG |
193 | unsigned long start = area << SID_SHIFT; |
194 | unsigned long end = (area+1) << SID_SHIFT; | |
1da177e4 | 195 | struct vm_area_struct *vma; |
1da177e4 | 196 | |
c594adad | 197 | BUG_ON(area >= NUM_LOW_AREAS); |
1da177e4 LT |
198 | |
199 | /* Check no VMAs are in the region */ | |
200 | vma = find_vma(mm, start); | |
201 | if (vma && (vma->vm_start < end)) | |
202 | return -EBUSY; | |
203 | ||
1da177e4 LT |
204 | return 0; |
205 | } | |
206 | ||
c594adad DG |
207 | static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area) |
208 | { | |
209 | unsigned long start = area << HTLB_AREA_SHIFT; | |
210 | unsigned long end = (area+1) << HTLB_AREA_SHIFT; | |
211 | struct vm_area_struct *vma; | |
212 | ||
213 | BUG_ON(area >= NUM_HIGH_AREAS); | |
214 | ||
7d24f0b8 DG |
215 | /* Hack, so that each address is controlled by exactly one
216 | * of the high or low area bitmaps, the first high area starts | |
217 | * at 4GB, not 0 */ | |
218 | if (start == 0) | |
219 | start = 0x100000000UL; | |
220 | ||
c594adad DG |
221 | /* Check no VMAs are in the region */ |
222 | vma = find_vma(mm, start); | |
223 | if (vma && (vma->vm_start < end)) | |
224 | return -EBUSY; | |
225 | ||
226 | return 0; | |
227 | } | |
228 | ||
229 | static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas) | |
1da177e4 LT |
230 | { |
231 | unsigned long i; | |
232 | ||
c594adad DG |
233 | BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS); |
234 | BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS); | |
235 | ||
236 | newareas &= ~(mm->context.low_htlb_areas); | |
237 | if (! newareas) | |
1da177e4 LT |
238 | return 0; /* The segments we want are already open */ |
239 | ||
c594adad DG |
240 | for (i = 0; i < NUM_LOW_AREAS; i++) |
241 | if ((1 << i) & newareas) | |
242 | if (prepare_low_area_for_htlb(mm, i) != 0) | |
243 | return -EBUSY; | |
244 | ||
245 | mm->context.low_htlb_areas |= newareas; | |
246 | ||
247 | /* update the paca copy of the context struct */ | |
248 | get_paca()->context = mm->context; | |
249 | ||
250 | /* the context change must make it to memory before the flush, | |
251 | * so that further SLB misses do the right thing. */ | |
252 | mb(); | |
253 | on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1); | |
254 | ||
255 | return 0; | |
256 | } | |
257 | ||
258 | static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas) | |
259 | { | |
260 | unsigned long i; | |
261 | ||
262 | BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS); | |
263 | BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8) | |
264 | != NUM_HIGH_AREAS); | |
265 | ||
266 | newareas &= ~(mm->context.high_htlb_areas); | |
267 | if (! newareas) | |
268 | return 0; /* The areas we want are already open */ | |
269 | ||
270 | for (i = 0; i < NUM_HIGH_AREAS; i++) | |
271 | if ((1 << i) & newareas) | |
272 | if (prepare_high_area_for_htlb(mm, i) != 0) | |
1da177e4 LT |
273 | return -EBUSY; |
274 | ||
c594adad | 275 | mm->context.high_htlb_areas |= newareas; |
1da177e4 LT |
276 | |
277 | /* update the paca copy of the context struct */ | |
278 | get_paca()->context = mm->context; | |
279 | ||
280 | /* the context change must make it to memory before the flush, | |
281 | * so that further SLB misses do the right thing. */ | |
282 | mb(); | |
c594adad | 283 | on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1); |
1da177e4 LT |
284 | |
285 | return 0; | |
286 | } | |
287 | ||
288 | int prepare_hugepage_range(unsigned long addr, unsigned long len) | |
289 | { | |
5e391dc9 | 290 | int err = 0; |
c594adad DG |
291 | |
292 | if ( (addr+len) < addr ) | |
293 | return -EINVAL; | |
294 | ||
5e391dc9 | 295 | if (addr < 0x100000000UL) |
c594adad | 296 | err = open_low_hpage_areas(current->mm, |
1da177e4 | 297 | LOW_ESID_MASK(addr, len)); |
9a94c579 | 298 | if ((addr + len) > 0x100000000UL) |
c594adad DG |
299 | err = open_high_hpage_areas(current->mm, |
300 | HTLB_AREA_MASK(addr, len)); | |
301 | if (err) { | |
302 | printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)" | |
303 | " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n", | |
304 | addr, len, | |
305 | LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len)); | |
1da177e4 LT |
306 | return err; |
307 | } | |
308 | ||
c594adad | 309 | return 0; |
1da177e4 LT |
310 | } |
311 | ||
1da177e4 LT |
312 | struct page * |
313 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | |
314 | { | |
315 | pte_t *ptep; | |
316 | struct page *page; | |
317 | ||
318 | if (! in_hugepage_area(mm->context, address)) | |
319 | return ERR_PTR(-EINVAL); | |
320 | ||
321 | ptep = huge_pte_offset(mm, address); | |
322 | page = pte_page(*ptep); | |
323 | if (page) | |
324 | page += (address % HPAGE_SIZE) / PAGE_SIZE; | |
325 | ||
326 | return page; | |
327 | } | |
328 | ||
329 | int pmd_huge(pmd_t pmd) | |
330 | { | |
331 | return 0; | |
332 | } | |
333 | ||
334 | struct page * | |
335 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |
336 | pmd_t *pmd, int write) | |
337 | { | |
338 | BUG(); | |
339 | return NULL; | |
340 | } | |
341 | ||
1da177e4 LT |
342 | /* Because we have an exclusive hugepage region which lies within the |
343 | * normal user address space, we have to take special measures to make | |
344 | * non-huge mmap()s evade the hugepage reserved regions. */ | |
345 | unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, | |
346 | unsigned long len, unsigned long pgoff, | |
347 | unsigned long flags) | |
348 | { | |
349 | struct mm_struct *mm = current->mm; | |
350 | struct vm_area_struct *vma; | |
351 | unsigned long start_addr; | |
352 | ||
353 | if (len > TASK_SIZE) | |
354 | return -ENOMEM; | |
355 | ||
356 | if (addr) { | |
357 | addr = PAGE_ALIGN(addr); | |
358 | vma = find_vma(mm, addr); | |
359 | if (((TASK_SIZE - len) >= addr) | |
360 | && (!vma || (addr+len) <= vma->vm_start) | |
361 | && !is_hugepage_only_range(mm, addr,len)) | |
362 | return addr; | |
363 | } | |
1363c3cd WW |
364 | if (len > mm->cached_hole_size) { |
365 | start_addr = addr = mm->free_area_cache; | |
366 | } else { | |
367 | start_addr = addr = TASK_UNMAPPED_BASE; | |
368 | mm->cached_hole_size = 0; | |
369 | } | |
1da177e4 LT |
370 | |
371 | full_search: | |
372 | vma = find_vma(mm, addr); | |
373 | while (TASK_SIZE - len >= addr) { | |
374 | BUG_ON(vma && (addr >= vma->vm_end)); | |
375 | ||
376 | if (touches_hugepage_low_range(mm, addr, len)) { | |
377 | addr = ALIGN(addr+1, 1<<SID_SHIFT); | |
378 | vma = find_vma(mm, addr); | |
379 | continue; | |
380 | } | |
c594adad DG |
381 | if (touches_hugepage_high_range(mm, addr, len)) { |
382 | addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT); | |
1da177e4 LT |
383 | vma = find_vma(mm, addr); |
384 | continue; | |
385 | } | |
386 | if (!vma || addr + len <= vma->vm_start) { | |
387 | /* | |
388 | * Remember the place where we stopped the search: | |
389 | */ | |
390 | mm->free_area_cache = addr + len; | |
391 | return addr; | |
392 | } | |
1363c3cd WW |
393 | if (addr + mm->cached_hole_size < vma->vm_start) |
394 | mm->cached_hole_size = vma->vm_start - addr; | |
1da177e4 LT |
395 | addr = vma->vm_end; |
396 | vma = vma->vm_next; | |
397 | } | |
398 | ||
399 | /* Make sure we didn't miss any holes */ | |
400 | if (start_addr != TASK_UNMAPPED_BASE) { | |
401 | start_addr = addr = TASK_UNMAPPED_BASE; | |
1363c3cd | 402 | mm->cached_hole_size = 0; |
1da177e4 LT |
403 | goto full_search; |
404 | } | |
405 | return -ENOMEM; | |
406 | } | |
407 | ||
408 | /* | |
409 | * This mmap-allocator allocates new areas top-down from below the | |
410 | * stack's low limit (the base): | |
411 | * | |
412 | * Because we have an exclusive hugepage region which lies within the | |
413 | * normal user address space, we have to take special measures to make | |
414 | * non-huge mmap()s evade the hugepage reserved regions. | |
415 | */ | |
416 | unsigned long | |
417 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |
418 | const unsigned long len, const unsigned long pgoff, | |
419 | const unsigned long flags) | |
420 | { | |
421 | struct vm_area_struct *vma, *prev_vma; | |
422 | struct mm_struct *mm = current->mm; | |
423 | unsigned long base = mm->mmap_base, addr = addr0; | |
1363c3cd | 424 | unsigned long largest_hole = mm->cached_hole_size; |
1da177e4 LT |
425 | int first_time = 1; |
426 | ||
427 | /* requested length too big for entire address space */ | |
428 | if (len > TASK_SIZE) | |
429 | return -ENOMEM; | |
430 | ||
431 | /* don't allow allocations above current base */ | |
432 | if (mm->free_area_cache > base) | |
433 | mm->free_area_cache = base; | |
434 | ||
435 | /* requesting a specific address */ | |
436 | if (addr) { | |
437 | addr = PAGE_ALIGN(addr); | |
438 | vma = find_vma(mm, addr); | |
439 | if (TASK_SIZE - len >= addr && | |
440 | (!vma || addr + len <= vma->vm_start) | |
441 | && !is_hugepage_only_range(mm, addr,len)) | |
442 | return addr; | |
443 | } | |
444 | ||
1363c3cd WW |
445 | if (len <= largest_hole) { |
446 | largest_hole = 0; | |
447 | mm->free_area_cache = base; | |
448 | } | |
1da177e4 LT |
449 | try_again: |
450 | /* make sure it can fit in the remaining address space */ | |
451 | if (mm->free_area_cache < len) | |
452 | goto fail; | |
453 | ||
454 | /* either no address requested or can't fit in requested address hole */ | |
455 | addr = (mm->free_area_cache - len) & PAGE_MASK; | |
456 | do { | |
457 | hugepage_recheck: | |
458 | if (touches_hugepage_low_range(mm, addr, len)) { | |
459 | addr = (addr & ((~0) << SID_SHIFT)) - len; | |
460 | goto hugepage_recheck; | |
c594adad DG |
461 | } else if (touches_hugepage_high_range(mm, addr, len)) { |
462 | addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len; | |
463 | goto hugepage_recheck; | |
1da177e4 LT |
464 | } |
465 | ||
466 | /* | |
467 | * Lookup failure means no vma is above this address, | |
468 | * i.e. return with success: | |
469 | */ | |
470 | if (!(vma = find_vma_prev(mm, addr, &prev_vma))) | |
471 | return addr; | |
472 | ||
473 | /* | |
474 | * new region fits between prev_vma->vm_end and | |
475 | * vma->vm_start, use it: | |
476 | */ | |
477 | if (addr+len <= vma->vm_start && | |
1363c3cd | 478 | (!prev_vma || (addr >= prev_vma->vm_end))) { |
1da177e4 | 479 | /* remember the address as a hint for next time */ |
1363c3cd WW |
480 | mm->cached_hole_size = largest_hole; |
481 | return (mm->free_area_cache = addr); | |
482 | } else { | |
1da177e4 | 483 | /* pull free_area_cache down to the first hole */ |
1363c3cd | 484 | if (mm->free_area_cache == vma->vm_end) { |
1da177e4 | 485 | mm->free_area_cache = vma->vm_start; |
1363c3cd WW |
486 | mm->cached_hole_size = largest_hole; |
487 | } | |
488 | } | |
489 | ||
490 | /* remember the largest hole we saw so far */ | |
491 | if (addr + largest_hole < vma->vm_start) | |
492 | largest_hole = vma->vm_start - addr; | |
1da177e4 LT |
493 | |
494 | /* try just below the current vma->vm_start */ | |
495 | addr = vma->vm_start-len; | |
496 | } while (len <= vma->vm_start); | |
497 | ||
498 | fail: | |
499 | /* | |
500 | * if hint left us with no space for the requested | |
501 | * mapping then try again: | |
502 | */ | |
503 | if (first_time) { | |
504 | mm->free_area_cache = base; | |
1363c3cd | 505 | largest_hole = 0; |
1da177e4 LT |
506 | first_time = 0; |
507 | goto try_again; | |
508 | } | |
509 | /* | |
510 | * A failed mmap() very likely causes application failure, | |
511 | * so fall back to the bottom-up function here. This scenario | |
512 | * can happen with large stack limits and large mmap() | |
513 | * allocations. | |
514 | */ | |
515 | mm->free_area_cache = TASK_UNMAPPED_BASE; | |
1363c3cd | 516 | mm->cached_hole_size = ~0UL; |
1da177e4 LT |
517 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); |
518 | /* | |
519 | * Restore the topdown base: | |
520 | */ | |
521 | mm->free_area_cache = base; | |
1363c3cd | 522 | mm->cached_hole_size = ~0UL; |
1da177e4 LT |
523 | |
524 | return addr; | |
525 | } | |
526 | ||
527 | static unsigned long htlb_get_low_area(unsigned long len, u16 segmask) | |
528 | { | |
529 | unsigned long addr = 0; | |
530 | struct vm_area_struct *vma; | |
531 | ||
532 | vma = find_vma(current->mm, addr); | |
533 | while (addr + len <= 0x100000000UL) { | |
534 | BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ | |
535 | ||
536 | if (! __within_hugepage_low_range(addr, len, segmask)) { | |
537 | addr = ALIGN(addr+1, 1<<SID_SHIFT); | |
538 | vma = find_vma(current->mm, addr); | |
539 | continue; | |
540 | } | |
541 | ||
542 | if (!vma || (addr + len) <= vma->vm_start) | |
543 | return addr; | |
544 | addr = ALIGN(vma->vm_end, HPAGE_SIZE); | |
545 | /* Depending on segmask this might not be a confirmed | |
546 | * hugepage region, so the ALIGN could have skipped | |
547 | * some VMAs */ | |
548 | vma = find_vma(current->mm, addr); | |
549 | } | |
550 | ||
551 | return -ENOMEM; | |
552 | } | |
553 | ||
c594adad | 554 | static unsigned long htlb_get_high_area(unsigned long len, u16 areamask) |
1da177e4 | 555 | { |
c594adad | 556 | unsigned long addr = 0x100000000UL; |
1da177e4 LT |
557 | struct vm_area_struct *vma; |
558 | ||
559 | vma = find_vma(current->mm, addr); | |
c594adad | 560 | while (addr + len <= TASK_SIZE_USER64) { |
1da177e4 | 561 | BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ |
c594adad DG |
562 | |
563 | if (! __within_hugepage_high_range(addr, len, areamask)) { | |
564 | addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT); | |
565 | vma = find_vma(current->mm, addr); | |
566 | continue; | |
567 | } | |
1da177e4 LT |
568 | |
569 | if (!vma || (addr + len) <= vma->vm_start) | |
570 | return addr; | |
571 | addr = ALIGN(vma->vm_end, HPAGE_SIZE); | |
c594adad DG |
572 | /* Depending on areamask this might not be a confirmed
573 | * hugepage region, so the ALIGN could have skipped | |
574 | * some VMAs */ | |
575 | vma = find_vma(current->mm, addr); | |
1da177e4 LT |
576 | } |
577 | ||
578 | return -ENOMEM; | |
579 | } | |
580 | ||
581 | unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | |
582 | unsigned long len, unsigned long pgoff, | |
583 | unsigned long flags) | |
584 | { | |
c594adad DG |
585 | int lastshift; |
586 | u16 areamask, curareas; | |
587 | ||
3c726f8d BH |
588 | if (HPAGE_SHIFT == 0) |
589 | return -EINVAL; | |
1da177e4 LT |
590 | if (len & ~HPAGE_MASK) |
591 | return -EINVAL; | |
592 | ||
593 | if (!cpu_has_feature(CPU_FTR_16M_PAGE)) | |
594 | return -EINVAL; | |
595 | ||
596 | if (test_thread_flag(TIF_32BIT)) { | |
c594adad | 597 | curareas = current->mm->context.low_htlb_areas; |
1da177e4 LT |
598 | |
599 | /* First see if we can do the mapping in the existing | |
c594adad DG |
600 | * low areas */ |
601 | addr = htlb_get_low_area(len, curareas); | |
1da177e4 LT |
602 | if (addr != -ENOMEM) |
603 | return addr; | |
604 | ||
c594adad DG |
605 | lastshift = 0; |
606 | for (areamask = LOW_ESID_MASK(0x100000000UL-len, len); | |
607 | ! lastshift; areamask >>=1) { | |
608 | if (areamask & 1) | |
1da177e4 LT |
609 | lastshift = 1; |
610 | ||
c594adad | 611 | addr = htlb_get_low_area(len, curareas | areamask); |
1da177e4 | 612 | if ((addr != -ENOMEM) |
c594adad | 613 | && open_low_hpage_areas(current->mm, areamask) == 0) |
1da177e4 LT |
614 | return addr; |
615 | } | |
1da177e4 | 616 | } else { |
c594adad DG |
617 | curareas = current->mm->context.high_htlb_areas; |
618 | ||
619 | /* First see if we can do the mapping in the existing | |
620 | * high areas */ | |
621 | addr = htlb_get_high_area(len, curareas); | |
622 | if (addr != -ENOMEM) | |
623 | return addr; | |
624 | ||
625 | lastshift = 0; | |
626 | for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len); | |
627 | ! lastshift; areamask >>=1) { | |
628 | if (areamask & 1) | |
629 | lastshift = 1; | |
630 | ||
631 | addr = htlb_get_high_area(len, curareas | areamask); | |
632 | if ((addr != -ENOMEM) | |
633 | && open_high_hpage_areas(current->mm, areamask) == 0) | |
634 | return addr; | |
635 | } | |
1da177e4 | 636 | } |
c594adad DG |
637 | printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open" |
638 | " enough areas\n"); | |
639 | return -ENOMEM; | |
1da177e4 LT |
640 | } |
641 | ||
1da177e4 LT |
642 | int hash_huge_page(struct mm_struct *mm, unsigned long access, |
643 | unsigned long ea, unsigned long vsid, int local) | |
644 | { | |
645 | pte_t *ptep; | |
3c726f8d BH |
646 | unsigned long old_pte, new_pte; |
647 | unsigned long va, rflags, pa; | |
1da177e4 LT |
648 | long slot; |
649 | int err = 1; | |
650 | ||
1da177e4 LT |
651 | ptep = huge_pte_offset(mm, ea); |
652 | ||
653 | /* Search the Linux page table for a match with va */ | |
654 | va = (vsid << 28) | (ea & 0x0fffffff); | |
1da177e4 LT |
655 | |
656 | /* | |
657 | * If no pte found or not present, send the problem up to | |
658 | * do_page_fault | |
659 | */ | |
660 | if (unlikely(!ptep || pte_none(*ptep))) | |
661 | goto out; | |
662 | ||
1da177e4 LT |
663 | /* |
664 | * Check the user's access rights to the page. If access should be | |
665 | * prevented then send the problem up to do_page_fault. | |
666 | */ | |
667 | if (unlikely(access & ~pte_val(*ptep))) | |
668 | goto out; | |
669 | /* | |
670 | * At this point, we have a pte (old_pte) which can be used to build | |
671 | * or update an HPTE. There are 2 cases: | |
672 | * | |
673 | * 1. There is a valid (present) pte with no associated HPTE (this is | |
674 | * the most common case) | |
675 | * 2. There is a valid (present) pte with an associated HPTE. The | |
676 | * current values of the pp bits in the HPTE prevent access | |
677 | * because we are doing software DIRTY bit management and the | |
678 | * page is currently not DIRTY. | |
679 | */ | |
680 | ||
681 | ||
3c726f8d BH |
682 | do { |
683 | old_pte = pte_val(*ptep); | |
684 | if (old_pte & _PAGE_BUSY) | |
685 | goto out; | |
686 | new_pte = old_pte | _PAGE_BUSY | | |
687 | _PAGE_ACCESSED | _PAGE_HASHPTE; | |
688 | } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, | |
689 | old_pte, new_pte)); | |
690 | ||
691 | rflags = 0x2 | (!(new_pte & _PAGE_RW)); | |
1da177e4 | 692 | /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ |
3c726f8d | 693 | rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); |
1da177e4 LT |
694 | |
695 | /* Check if pte already has an hpte (case 2) */ | |
3c726f8d | 696 | if (unlikely(old_pte & _PAGE_HASHPTE)) { |
1da177e4 LT |
697 | /* There MIGHT be an HPTE for this pte */ |
698 | unsigned long hash, slot; | |
699 | ||
3c726f8d BH |
700 | hash = hpt_hash(va, HPAGE_SHIFT); |
701 | if (old_pte & _PAGE_F_SECOND) | |
1da177e4 LT |
702 | hash = ~hash; |
703 | slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; | |
3c726f8d | 704 | slot += (old_pte & _PAGE_F_GIX) >> 12; |
1da177e4 | 705 | |
96e28449 | 706 | if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1) |
3c726f8d | 707 | old_pte &= ~_PAGE_HPTEFLAGS; |
1da177e4 LT |
708 | } |
709 | ||
3c726f8d BH |
710 | if (likely(!(old_pte & _PAGE_HASHPTE))) { |
711 | unsigned long hash = hpt_hash(va, HPAGE_SHIFT); | |
1da177e4 LT |
712 | unsigned long hpte_group; |
713 | ||
3c726f8d | 714 | pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; |
1da177e4 LT |
715 | |
716 | repeat: | |
717 | hpte_group = ((hash & htab_hash_mask) * | |
718 | HPTES_PER_GROUP) & ~0x7UL; | |
719 | ||
3c726f8d BH |
720 | /* clear HPTE slot information in new PTE */
721 | new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; | |
1da177e4 LT |
722 | |
723 | /* Add in WIMG bits */ | |
724 | /* XXX We should store these in the pte */ | |
3c726f8d | 725 | /* --BenH: I think they are ... */ |
96e28449 | 726 | rflags |= _PAGE_COHERENT; |
1da177e4 | 727 | |
3c726f8d BH |
728 | /* Insert into the hash table, primary slot */ |
729 | slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, | |
730 | mmu_huge_psize); | |
1da177e4 LT |
731 | |
732 | /* Primary is full, try the secondary */ | |
733 | if (unlikely(slot == -1)) { | |
3c726f8d | 734 | new_pte |= _PAGE_F_SECOND; |
1da177e4 LT |
735 | hpte_group = ((~hash & htab_hash_mask) * |
736 | HPTES_PER_GROUP) & ~0x7UL; | |
3c726f8d | 737 | slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, |
67b10813 | 738 | HPTE_V_SECONDARY, |
3c726f8d | 739 | mmu_huge_psize); |
1da177e4 LT |
740 | if (slot == -1) { |
741 | if (mftb() & 0x1) | |
67b10813 BH |
742 | hpte_group = ((hash & htab_hash_mask) * |
743 | HPTES_PER_GROUP)&~0x7UL; | |
1da177e4 LT |
744 | |
745 | ppc_md.hpte_remove(hpte_group); | |
746 | goto repeat; | |
747 | } | |
748 | } | |
749 | ||
750 | if (unlikely(slot == -2)) | |
751 | panic("hash_huge_page: pte_insert failed\n"); | |
752 | ||
3c726f8d | 753 | new_pte |= (slot << 12) & _PAGE_F_GIX; |
1da177e4 LT |
754 | } |
755 | ||
3c726f8d | 756 | /* |
01edcd89 | 757 | * No need to use ldarx/stdcx here |
3c726f8d BH |
758 | */ |
759 | *ptep = __pte(new_pte & ~_PAGE_BUSY); | |
760 | ||
1da177e4 LT |
761 | err = 0; |
762 | ||
763 | out: | |
1da177e4 LT |
764 | return err; |
765 | } |
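The do/while loop near the top of hash_huge_page() takes a software lock on the PTE by setting _PAGE_BUSY with a compare-and-exchange, which is why the final store needs only a plain assignment (the "No need to use ldarx/stdcx" comment). A simplified sketch of the primitive it relies on, with the SMP memory barriers of the real asm/system.h version elided:

```c
/* Simplified sketch of ppc64 __cmpxchg_u64(): if *p == old, store
 * 'new' and return 'old'; otherwise return the current value.  The
 * in-tree version wraps the sequence in SMP memory barriers.
 */
static inline unsigned long
cmpxchg_u64_sketch(volatile unsigned long *p, unsigned long old,
		   unsigned long new)
{
	unsigned long prev;

	__asm__ __volatile__(
	"1:	ldarx	%0,0,%2\n"	/* load-reserve *p */
	"	cmpd	0,%0,%3\n"	/* does it match 'old'? */
	"	bne-	2f\n"		/* no: return current value */
	"	stdcx.	%4,0,%2\n"	/* yes: try to store 'new' */
	"	bne-	1b\n"		/* reservation lost: retry */
	"2:"
	: "=&r" (prev), "+m" (*p)
	: "r" (p), "r" (old), "r" (new)
	: "cc", "memory");

	return prev;
}
```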