hugetlb: Try to grow hugetlb pool for MAP_PRIVATE mappings
[linux-2.6-block.git] / mm / hugetlb.c
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>

#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>
#include "internal.h"

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void clear_huge_page(struct page *page, unsigned long addr)
{
	int i;

	might_sleep();
	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
		cond_resched();
		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
	}
}

static void copy_huge_page(struct page *dst, struct page *src,
			   unsigned long addr, struct vm_area_struct *vma)
{
	int i;

	might_sleep();
	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
		cond_resched();
		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
	}
}

static void enqueue_huge_page(struct page *page)
{
	int nid = page_to_nid(page);
	list_add(&page->lru, &hugepage_freelists[nid]);
	free_huge_pages++;
	free_huge_pages_node[nid]++;
}

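/*
 * Pull a free huge page off the per-node freelists, walking the zonelist
 * derived from the VMA's memory policy and skipping nodes disallowed by
 * the current cpuset.  Returns NULL if no suitable free page is found.
 * Callers hold hugetlb_lock.
 */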
static struct page *dequeue_huge_page(struct vm_area_struct *vma,
				unsigned long address)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol);
	struct zone **z;

	for (z = zonelist->zones; *z; z++) {
		nid = zone_to_nid(*z);
		if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) &&
		    !list_empty(&hugepage_freelists[nid])) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
			break;
		}
	}
	mpol_free(mpol);	/* unref if mpol !NULL */
	return page;
}

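/*
 * Return a huge page to the buddy allocator: drop it from the pool
 * counters, clear stale page flags on each constituent page, remove the
 * compound destructor, restore a reference count and hand the pages back
 * with __free_pages().  Called with hugetlb_lock held.
 */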
static void update_and_free_page(struct page *page)
{
	int i;
	nr_huge_pages--;
	nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	__free_pages(page, HUGETLB_PAGE_ORDER);
}

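/*
 * Compound page destructor, installed by set_compound_page_dtor() and run
 * when the last reference to a huge page is dropped.  Surplus pages go
 * straight back to the buddy allocator; persistent pool pages return to
 * the node's free list.
 */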
static void free_huge_page(struct page *page)
{
	int nid = page_to_nid(page);

	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (surplus_huge_pages_node[nid]) {
		update_and_free_page(page);
		surplus_huge_pages--;
		surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(page);
	}
	spin_unlock(&hugetlb_lock);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(int delta)
{
	static int prev_nid;
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && surplus_huge_pages_node[nid] >=
						nr_huge_pages_node[nid])
			continue;

		surplus_huge_pages += delta;
		surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

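/*
 * Allocate a fresh huge page from the buddy allocator, spreading
 * allocations across online nodes round-robin, and donate it to the
 * persistent pool: the final put_page() sends it through free_huge_page()
 * onto the free list.  Returns 1 on success, 0 on failure.
 */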
static int alloc_fresh_huge_page(void)
{
	static int prev_nid;
	struct page *page;
	int nid;

	/*
	 * Copy static prev_nid to local nid, work on that, then copy it
	 * back to prev_nid afterwards: otherwise there's a window in which
	 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
	 * But we don't need to use a spin_lock here: it really doesn't
	 * matter if occasionally a racer chooses the same nid as we do.
	 */
	nid = next_node(prev_nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	prev_nid = nid;

	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
		put_page(page); /* free it into the hugepage allocator */
		return 1;
	}
	return 0;
}

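/*
 * Allocate a huge page directly from the buddy allocator, over and above
 * the persistent pool, and account it as surplus.  free_huge_page() will
 * give it back to the buddy allocator rather than to the free list.
 */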
static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
						unsigned long address)
{
	struct page *page;

	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
					HUGETLB_PAGE_ORDER);
	if (page) {
		set_compound_page_dtor(page, free_huge_page);
		spin_lock(&hugetlb_lock);
		nr_huge_pages++;
		nr_huge_pages_node[page_to_nid(page)]++;
		surplus_huge_pages++;
		surplus_huge_pages_node[page_to_nid(page)]++;
		spin_unlock(&hugetlb_lock);
	}

	return page;
}

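/*
 * Allocate a huge page for a fault.  Shared (VM_MAYSHARE) mappings consume
 * their reservation; private mappings may only dip into pages that are not
 * reserved.  If the pool cannot satisfy a private mapping, fall back to a
 * surplus page from the buddy allocator.
 */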
static struct page *alloc_huge_page(struct vm_area_struct *vma,
				    unsigned long addr)
{
	struct page *page = NULL;

	spin_lock(&hugetlb_lock);
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages--;
	else if (free_huge_pages <= resv_huge_pages)
		goto fail;

	page = dequeue_huge_page(vma, addr);
	if (!page)
		goto fail;

	spin_unlock(&hugetlb_lock);
	set_page_refcounted(page);
	return page;

fail:
	if (vma->vm_flags & VM_MAYSHARE)
		resv_huge_pages++;
	spin_unlock(&hugetlb_lock);

	/*
	 * Private mappings do not use reserved huge pages so the allocation
	 * may have failed due to an undersized hugetlb pool.  Try to grab a
	 * surplus huge page from the buddy allocator.
	 */
	if (!(vma->vm_flags & VM_MAYSHARE))
		page = alloc_buddy_huge_page(vma, addr);

	return page;
}

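/*
 * Boot-time initialisation: set up the per-node free lists and try to
 * allocate the number of huge pages requested via the "hugepages="
 * command-line parameter.  max_huge_pages is clamped to what was
 * actually obtained.
 */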
static int __init hugetlb_init(void)
{
	unsigned long i;

	if (HPAGE_SHIFT == 0)
		return 0;

	for (i = 0; i < MAX_NUMNODES; ++i)
		INIT_LIST_HEAD(&hugepage_freelists[i]);

	for (i = 0; i < max_huge_pages; ++i) {
		if (!alloc_fresh_huge_page())
			break;
	}
	max_huge_pages = free_huge_pages = nr_huge_pages = i;
	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
	return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
		max_huge_pages = 0;
	return 1;
}
__setup("hugepages=", hugetlb_setup);

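/*
 * Sum a per-node counter array over the nodes in the current task's cpuset.
 */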
static unsigned int cpuset_mems_nr(unsigned int *array)
{
	int node;
	unsigned int nr = 0;

	for_each_node_mask(node, cpuset_current_mems_allowed)
		nr += array[node];

	return nr;
}

#ifdef CONFIG_SYSCTL
#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
	int i;

	for (i = 0; i < MAX_NUMNODES; ++i) {
		struct page *page, *next;
		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
			if (PageHighMem(page))
				continue;
			list_del(&page->lru);
			update_and_free_page(page);
			free_huge_pages--;
			free_huge_pages_node[page_to_nid(page)]--;
			if (count >= nr_huge_pages)
				return;
		}
	}
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

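/*
 * "Persistent" huge pages are those the pool keeps around even while they
 * are unused, i.e. everything in the pool except the surplus pages that
 * only exist to back live mappings.
 */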
#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
static unsigned long set_max_huge_pages(unsigned long count)
{
	unsigned long min_count, ret;

	/*
	 * Increase the pool size
	 * First take pages out of surplus state.  Then make up the
	 * remaining difference by allocating fresh huge pages.
	 */
	spin_lock(&hugetlb_lock);
	while (surplus_huge_pages && count > persistent_huge_pages) {
		if (!adjust_pool_surplus(-1))
			break;
	}

	while (count > persistent_huge_pages) {
		int ret;
		/*
		 * If this allocation races such that we no longer need the
		 * page, free_huge_page will handle it by freeing the page
		 * and reducing the surplus.
		 */
		spin_unlock(&hugetlb_lock);
		ret = alloc_fresh_huge_page();
		spin_lock(&hugetlb_lock);
		if (!ret)
			goto out;
	}
	if (count >= persistent_huge_pages)
		goto out;

	/*
	 * Decrease the pool size
	 * First return free pages to the buddy allocator (being careful
	 * to keep enough around to satisfy reservations).  Then place
	 * pages into surplus state as needed so the pool will shrink
	 * to the desired size as pages become free.
	 */
	min_count = max(count, resv_huge_pages);
	try_to_free_low(min_count);
	while (min_count < persistent_huge_pages) {
		struct page *page = dequeue_huge_page(NULL, 0);
		if (!page)
			break;
		update_and_free_page(page);
	}
	while (count < persistent_huge_pages) {
		if (!adjust_pool_surplus(1))
			break;
	}
out:
	ret = persistent_huge_pages;
	spin_unlock(&hugetlb_lock);
	return ret;
}

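/*
 * Sysctl handler for the huge page pool size: parse the requested value
 * into max_huge_pages, resize the pool, and write back the size that was
 * actually achieved.
 */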
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
			   struct file *file, void __user *buffer,
			   size_t *length, loff_t *ppos)
{
	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
	max_huge_pages = set_max_huge_pages(max_huge_pages);
	return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
			struct file *file, void __user *buffer,
			size_t *length, loff_t *ppos)
{
	proc_dointvec(table, write, file, buffer, length, ppos);
	if (hugepages_treat_as_movable)
		htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
	else
		htlb_alloc_mask = GFP_HIGHUSER;
	return 0;
}

#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
	return sprintf(buf,
			"HugePages_Total: %5lu\n"
			"HugePages_Free:  %5lu\n"
			"HugePages_Rsvd:  %5lu\n"
			"HugePages_Surp:  %5lu\n"
			"Hugepagesize:    %5lu kB\n",
			nr_huge_pages,
			free_huge_pages,
			resv_huge_pages,
			surplus_huge_pages,
			HPAGE_SIZE/1024);
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
	return sprintf(buf,
		"Node %d HugePages_Total: %5u\n"
		"Node %d HugePages_Free:  %5u\n",
		nid, nr_huge_pages_node[nid],
		nid, free_huge_pages_node[nid]);
}

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	BUG();
	return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
	.fault = hugetlb_vm_op_fault,
};

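/*
 * Build the huge pte for @page using the VMA's protection bits: writable
 * mappings get a dirty, writable entry; everything else is write-protected.
 */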
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
				int writable)
{
	pte_t entry;

	if (writable) {
		entry =
		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
	} else {
		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
	}
	entry = pte_mkyoung(entry);
	entry = pte_mkhuge(entry);

	return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
				   unsigned long address, pte_t *ptep)
{
	pte_t entry;

	entry = pte_mkwrite(pte_mkdirty(*ptep));
	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
		update_mmu_cache(vma, address, entry);
	}
}

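/*
 * Copy the hugetlb page table entries from @src to @dst at fork time.
 * For private writable mappings the parent's entries are write-protected
 * as well, so both processes take a COW fault on the next write.
 */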
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;

	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)
			continue;
		dst_pte = huge_pte_alloc(dst, addr);
		if (!dst_pte)
			goto nomem;
		spin_lock(&dst->page_table_lock);
		spin_lock(&src->page_table_lock);
		if (!pte_none(*src_pte)) {
			if (cow)
				ptep_set_wrprotect(src, addr, src_pte);
			entry = *src_pte;
			ptepage = pte_page(entry);
			get_page(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}

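/*
 * Tear down all huge ptes in [start, end): clear the entries under
 * page_table_lock, flush the TLB for the range, and only then drop the
 * page references.  The caller must hold the file's i_mmap_lock.
 */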
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long address;
	pte_t *ptep;
	pte_t pte;
	struct page *page;
	struct page *tmp;
	/*
	 * A page gathering list, protected by per file i_mmap_lock. The
	 * lock is used to avoid list corruption from multiple unmapping
	 * of the same page since we are using page->lru.
	 */
	LIST_HEAD(page_list);

	WARN_ON(!is_vm_hugetlb_page(vma));
	BUG_ON(start & ~HPAGE_MASK);
	BUG_ON(end & ~HPAGE_MASK);

	spin_lock(&mm->page_table_lock);
	for (address = start; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;

		if (huge_pmd_unshare(mm, &address, ptep))
			continue;

		pte = huge_ptep_get_and_clear(mm, address, ptep);
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (pte_dirty(pte))
			set_page_dirty(page);
		list_add(&page->lru, &page_list);
	}
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(vma, start, end);
	list_for_each_entry_safe(page, tmp, &page_list, lru) {
		list_del(&page->lru);
		put_page(page);
	}
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
			  unsigned long end)
{
	/*
	 * It is undesirable to test vma->vm_file as it should be non-NULL
	 * for a valid hugetlb area.  However, vm_file will be NULL in the
	 * error cleanup path of do_mmap_pgoff.  When the hugetlbfs ->mmap
	 * method fails, do_mmap_pgoff() nullifies vma->vm_file before
	 * calling this function to clean up.  Since no pte has actually
	 * been set up, it is safe to do nothing in this case.
	 */
	if (vma->vm_file) {
		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
		__unmap_hugepage_range(vma, start, end);
		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
	}
}

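/*
 * Handle a copy-on-write fault on a huge page.  If we hold the only
 * reference, just make the existing mapping writable; otherwise allocate
 * a new huge page, copy the contents and switch the pte over.  Called
 * with mm->page_table_lock held; the lock is dropped around the copy.
 */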
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, pte_t pte)
{
	struct page *old_page, *new_page;
	int avoidcopy;

	old_page = pte_page(pte);

	/* If no-one else is actually using this page, avoid the copy
	 * and just make the page writable */
	avoidcopy = (page_count(old_page) == 1);
	if (avoidcopy) {
		set_huge_ptep_writable(vma, address, ptep);
		return 0;
	}

	page_cache_get(old_page);
	new_page = alloc_huge_page(vma, address);

	if (!new_page) {
		page_cache_release(old_page);
		return VM_FAULT_OOM;
	}

	spin_unlock(&mm->page_table_lock);
	copy_huge_page(new_page, old_page, address, vma);
	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
	if (likely(pte_same(*ptep, pte))) {
		/* Break COW */
		set_huge_pte_at(mm, address, ptep,
				make_huge_pte(vma, new_page, 1));
		/* Make the old page be freed below */
		new_page = old_page;
	}
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 0;
}

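/*
 * Fault in a huge page that has no pte yet: look it up in the page cache
 * (allocating and, for shared mappings, inserting a new zeroed page if
 * necessary), recheck i_size against a racing truncate, and install the
 * pte.  A write fault on a private mapping does the COW immediately to
 * avoid taking a second fault.
 */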
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *ptep, int write_access)
{
	int ret = VM_FAULT_SIGBUS;
	unsigned long idx;
	unsigned long size;
	struct page *page;
	struct address_space *mapping;
	pte_t new_pte;

	mapping = vma->vm_file->f_mapping;
	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

	/*
	 * Use page lock to guard against racing truncation
	 * before we get page_table_lock.
	 */
retry:
	page = find_lock_page(mapping, idx);
	if (!page) {
		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
		if (idx >= size)
			goto out;
		if (hugetlb_get_quota(mapping))
			goto out;
		page = alloc_huge_page(vma, address);
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_OOM;
			goto out;
		}
		clear_huge_page(page, address);

		if (vma->vm_flags & VM_SHARED) {
			int err;

			err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
			if (err) {
				put_page(page);
				hugetlb_put_quota(mapping);
				if (err == -EEXIST)
					goto retry;
				goto out;
			}
		} else
			lock_page(page);
	}

	spin_lock(&mm->page_table_lock);
	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
	if (idx >= size)
		goto backout;

	ret = 0;
	if (!pte_none(*ptep))
		goto backout;

	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
				&& (vma->vm_flags & VM_SHARED)));
	set_huge_pte_at(mm, address, ptep, new_pte);

	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		/* Optimization, do the COW without a second fault */
		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
	}

	spin_unlock(&mm->page_table_lock);
	unlock_page(page);
out:
	return ret;

backout:
	spin_unlock(&mm->page_table_lock);
	hugetlb_put_quota(mapping);
	unlock_page(page);
	put_page(page);
	goto out;
}

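/*
 * Main page fault entry point for hugetlb VMAs.  Missing ptes are handed
 * to hugetlb_no_page(); write faults on read-only present ptes go through
 * hugetlb_cow().
 */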
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, int write_access)
{
	pte_t *ptep;
	pte_t entry;
	int ret;
	static DEFINE_MUTEX(hugetlb_instantiation_mutex);

	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		return VM_FAULT_OOM;

	/*
	 * Serialize hugepage allocation and instantiation, so that we don't
	 * get spurious allocation failures if two CPUs race to instantiate
	 * the same page in the page cache.
	 */
	mutex_lock(&hugetlb_instantiation_mutex);
	entry = *ptep;
	if (pte_none(entry)) {
		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
		mutex_unlock(&hugetlb_instantiation_mutex);
		return ret;
	}

	ret = 0;

	spin_lock(&mm->page_table_lock);
	/* Check for a racing update before calling hugetlb_cow */
	if (likely(pte_same(entry, *ptep)))
		if (write_access && !pte_write(entry))
			ret = hugetlb_cow(mm, vma, address, ptep, entry);
	spin_unlock(&mm->page_table_lock);
	mutex_unlock(&hugetlb_instantiation_mutex);

	return ret;
}

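/*
 * get_user_pages() support for hugetlb VMAs: walk the huge ptes starting
 * at *position, faulting pages in as needed, and fill in the pages[] and
 * vmas[] arrays one PAGE_SIZE subpage at a time.  Returns the updated
 * index i, or -EFAULT if the very first page could not be faulted in.
 */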
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
			struct page **pages, struct vm_area_struct **vmas,
			unsigned long *position, int *length, int i)
{
	unsigned long pfn_offset;
	unsigned long vaddr = *position;
	int remainder = *length;

	spin_lock(&mm->page_table_lock);
	while (vaddr < vma->vm_end && remainder) {
		pte_t *pte;
		struct page *page;

		/*
		 * Some archs (sparc64, sh*) have multiple pte_t entries for
		 * each hugepage.  We have to make sure we get the first,
		 * for the page indexing below to work.
		 */
		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

		if (!pte || pte_none(*pte)) {
			int ret;

			spin_unlock(&mm->page_table_lock);
			ret = hugetlb_fault(mm, vma, vaddr, 0);
			spin_lock(&mm->page_table_lock);
			if (!(ret & VM_FAULT_ERROR))
				continue;

			remainder = 0;
			if (!i)
				i = -EFAULT;
			break;
		}

		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
		page = pte_page(*pte);
same_page:
		if (pages) {
			get_page(page);
			pages[i] = page + pfn_offset;
		}

		if (vmas)
			vmas[i] = vma;

		vaddr += PAGE_SIZE;
		++pfn_offset;
		--remainder;
		++i;
		if (vaddr < vma->vm_end && remainder &&
				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
			/*
			 * We use pfn_offset to avoid touching the pageframes
			 * of this compound page.
			 */
			goto same_page;
		}
	}
	spin_unlock(&mm->page_table_lock);
	*length = remainder;
	*position = vaddr;

	return i;
}

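/*
 * Rewrite every huge pte in [address, end) with the new protection bits
 * and flush the TLB for the range.
 */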
void hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long start = address;
	pte_t *ptep;
	pte_t pte;

	BUG_ON(address >= end);
	flush_cache_range(vma, address, end);

	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
	spin_lock(&mm->page_table_lock);
	for (; address < end; address += HPAGE_SIZE) {
		ptep = huge_pte_offset(mm, address);
		if (!ptep)
			continue;
		if (huge_pmd_unshare(mm, &address, ptep))
			continue;
		if (!pte_none(*ptep)) {
			pte = huge_ptep_get_and_clear(mm, address, ptep);
			pte = pte_mkhuge(pte_modify(pte, newprot));
			set_huge_pte_at(mm, address, ptep, pte);
		}
	}
	spin_unlock(&mm->page_table_lock);
	spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

	flush_tlb_range(vma, start, end);
}

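/*
 * Huge page reservations are tracked per hugetlbfs inode as an ordered
 * list of file_region entries on the mapping's private_list, each covering
 * a range [from, to) of the file (in huge page units) that is already
 * reserved.  region_chg() reports how many extra pages a new range would
 * need, region_add() commits the range, and region_truncate() drops
 * everything past a given point.
 */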
struct file_region {
	struct list_head link;
	long from;
	long to;
};

static long region_add(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg, *trg;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;

	/* Check for and consume any regions we now overlap with. */
	nrg = rg;
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			break;

		/* If this area reaches higher than us then extend our area
		 * to include it completely.  If this is not the first area
		 * which we intend to reuse, free it. */
		if (rg->to > t)
			t = rg->to;
		if (rg != nrg) {
			list_del(&rg->link);
			kfree(rg);
		}
	}
	nrg->from = f;
	nrg->to = t;
	return 0;
}

static long region_chg(struct list_head *head, long f, long t)
{
	struct file_region *rg, *nrg;
	long chg = 0;

	/* Locate the region we are before or in. */
	list_for_each_entry(rg, head, link)
		if (f <= rg->to)
			break;

	/* If we are below the current region then a new region is required.
	 * Subtle, allocate a new region at the position but make it zero
	 * size such that we can guarantee to record the reservation. */
	if (&rg->link == head || t < rg->from) {
		nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
		if (!nrg)
			return -ENOMEM;
		nrg->from = f;
		nrg->to = f;
		INIT_LIST_HEAD(&nrg->link);
		list_add(&nrg->link, rg->link.prev);

		return t - f;
	}

	/* Round our left edge to the current segment if it encloses us. */
	if (f > rg->from)
		f = rg->from;
	chg = t - f;

	/* Check for and consume any regions we now overlap with. */
	list_for_each_entry(rg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		if (rg->from > t)
			return chg;

		/* We overlap with this area; if it extends further than
		 * us then we must extend ourselves.  Account for its
		 * existing reservation. */
		if (rg->to > t) {
			chg += rg->to - t;
			t = rg->to;
		}
		chg -= rg->to - rg->from;
	}
	return chg;
}

static long region_truncate(struct list_head *head, long end)
{
	struct file_region *rg, *trg;
	long chg = 0;

	/* Locate the region we are either in or before. */
	list_for_each_entry(rg, head, link)
		if (end <= rg->to)
			break;
	if (&rg->link == head)
		return 0;

	/* If we are in the middle of a region then adjust it. */
	if (end > rg->from) {
		chg = rg->to - end;
		rg->to = end;
		rg = list_entry(rg->link.next, typeof(*rg), link);
	}

	/* Drop any remaining regions. */
	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
		if (&rg->link == head)
			break;
		chg += rg->to - rg->from;
		list_del(&rg->link);
		kfree(rg);
	}
	return chg;
}

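/*
 * Charge @delta huge pages against the reservation: succeed only if the
 * free pool still has room for the new reservation on top of the pages
 * already reserved.
 */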
static int hugetlb_acct_memory(long delta)
{
	int ret = -ENOMEM;

	spin_lock(&hugetlb_lock);
	if ((delta + resv_huge_pages) <= free_huge_pages) {
		resv_huge_pages += delta;
		ret = 0;
	}
	spin_unlock(&hugetlb_lock);
	return ret;
}

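/*
 * Reserve huge pages for the file range [from, to): work out how many new
 * pages the range needs, charge them against the free pool and record the
 * range in the inode's region list.  hugetlb_unreserve_pages() truncates
 * the region list and returns the corresponding reservation.
 */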
int hugetlb_reserve_pages(struct inode *inode, long from, long to)
{
	long ret, chg;

	chg = region_chg(&inode->i_mapping->private_list, from, to);
	if (chg < 0)
		return chg;
	/*
	 * When cpusets are configured, they break strict hugetlb page
	 * reservation as the accounting is done on a global variable.  Such
	 * a reservation is meaningless in the presence of cpusets because it
	 * is not checked against page availability for the current cpuset,
	 * so an application can still be OOM'ed by the kernel for lack of
	 * free hugetlb pages in the cpuset the task is in.  Enforcing strict
	 * accounting with cpusets is almost impossible (or too ugly) because
	 * cpusets are so fluid that a task or memory node can be moved
	 * between cpusets at any time.
	 *
	 * Changing the semantics of shared hugetlb mappings under cpusets is
	 * undesirable.  However, in order to preserve some of the semantics,
	 * we fall back to checking against the current free page
	 * availability as a best effort, hopefully minimizing the impact of
	 * the semantics that cpusets do change.
	 */
	if (chg > cpuset_mems_nr(free_huge_pages_node))
		return -ENOMEM;

	ret = hugetlb_acct_memory(chg);
	if (ret < 0)
		return ret;
	region_add(&inode->i_mapping->private_list, from, to);
	return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
	long chg = region_truncate(&inode->i_mapping->private_list, offset);
	hugetlb_acct_memory(freed - chg);
}