[PATCH] mm: dequeue a huge page near to this node
mm/hugetlb.c
/*
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <asm/page.h>
#include <asm/pgtable.h>

#include <linux/hugetlb.h>

const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages;
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];

/*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
static DEFINE_SPINLOCK(hugetlb_lock);

static void enqueue_huge_page(struct page *page)
{
        int nid = page_to_nid(page);
        list_add(&page->lru, &hugepage_freelists[nid]);
        free_huge_pages++;
        free_huge_pages_node[nid]++;
}

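/*
 * Take a free huge page off the pool, preferring pages close to the node
 * running this code: the local node's zonelist is ordered by distance, so
 * the first zone whose node has a free huge page is the nearest available.
 * Caller must hold hugetlb_lock.
 */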
static struct page *dequeue_huge_page(void)
{
        int nid = numa_node_id();
        struct page *page = NULL;
        struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;
        struct zone **z;

        for (z = zonelist->zones; *z; z++) {
                nid = (*z)->zone_pgdat->node_id;
                if (!list_empty(&hugepage_freelists[nid]))
                        break;
        }

        if (*z) {
                page = list_entry(hugepage_freelists[nid].next,
                                  struct page, lru);
                list_del(&page->lru);
                free_huge_pages--;
                free_huge_pages_node[nid]--;
        }
        return page;
}

static struct page *alloc_fresh_huge_page(void)
{
        static int nid = 0;
        struct page *page;
        page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
                                HUGETLB_PAGE_ORDER);
        nid = (nid + 1) % num_online_nodes();
        if (page) {
                spin_lock(&hugetlb_lock);
                nr_huge_pages++;
                nr_huge_pages_node[page_to_nid(page)]++;
                spin_unlock(&hugetlb_lock);
        }
        return page;
}

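/*
 * Installed by alloc_huge_page() as the compound-page destructor (stashed
 * in page[1].mapping), so in this kernel generation it runs once the last
 * reference to a huge page is dropped and returns the page to the pool.
 */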
void free_huge_page(struct page *page)
{
        BUG_ON(page_count(page));

        INIT_LIST_HEAD(&page->lru);
        page[1].mapping = NULL;

        spin_lock(&hugetlb_lock);
        enqueue_huge_page(page);
        spin_unlock(&hugetlb_lock);
}

struct page *alloc_huge_page(void)
{
        struct page *page;
        int i;

        spin_lock(&hugetlb_lock);
        page = dequeue_huge_page();
        if (!page) {
                spin_unlock(&hugetlb_lock);
                return NULL;
        }
        spin_unlock(&hugetlb_lock);
        set_page_count(page, 1);
        page[1].mapping = (void *)free_huge_page;
        for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
                clear_highpage(&page[i]);
        return page;
}

static int __init hugetlb_init(void)
{
        unsigned long i;
        struct page *page;

        if (HPAGE_SHIFT == 0)
                return 0;

        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&hugepage_freelists[i]);

        for (i = 0; i < max_huge_pages; ++i) {
                page = alloc_fresh_huge_page();
                if (!page)
                        break;
                spin_lock(&hugetlb_lock);
                enqueue_huge_page(page);
                spin_unlock(&hugetlb_lock);
        }
        max_huge_pages = free_huge_pages = nr_huge_pages = i;
        printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
        return 0;
}
module_init(hugetlb_init);

static int __init hugetlb_setup(char *s)
{
        if (sscanf(s, "%lu", &max_huge_pages) <= 0)
                max_huge_pages = 0;
        return 1;
}
__setup("hugepages=", hugetlb_setup);
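/*
 * Usage note: the pool size can be requested at boot, e.g. "hugepages=64"
 * on the kernel command line, or resized at run time through the sysctl
 * handled below (typically exposed as /proc/sys/vm/nr_hugepages).
 */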

#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
        int i;
        nr_huge_pages--;
        nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--;
        for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
                page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
                                1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
                                1 << PG_private | 1 << PG_writeback);
                set_page_count(&page[i], 0);
        }
        set_page_count(page, 1);
        __free_pages(page, HUGETLB_PAGE_ORDER);
}

#ifdef CONFIG_HIGHMEM
static void try_to_free_low(unsigned long count)
{
        int i, nid;
        for (i = 0; i < MAX_NUMNODES; ++i) {
                struct page *page, *next;
                list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
                        if (PageHighMem(page))
                                continue;
                        list_del(&page->lru);
                        update_and_free_page(page);
                        nid = page_zone(page)->zone_pgdat->node_id;
                        free_huge_pages--;
                        free_huge_pages_node[nid]--;
                        if (count >= nr_huge_pages)
                                return;
                }
        }
}
#else
static inline void try_to_free_low(unsigned long count)
{
}
#endif

static unsigned long set_max_huge_pages(unsigned long count)
{
        while (count > nr_huge_pages) {
                struct page *page = alloc_fresh_huge_page();
                if (!page)
                        return nr_huge_pages;
                spin_lock(&hugetlb_lock);
                enqueue_huge_page(page);
                spin_unlock(&hugetlb_lock);
        }
        if (count >= nr_huge_pages)
                return nr_huge_pages;

        spin_lock(&hugetlb_lock);
        try_to_free_low(count);
        while (count < nr_huge_pages) {
                struct page *page = dequeue_huge_page();
                if (!page)
                        break;
                update_and_free_page(page);
        }
        spin_unlock(&hugetlb_lock);
        return nr_huge_pages;
}

int hugetlb_sysctl_handler(struct ctl_table *table, int write,
                           struct file *file, void __user *buffer,
                           size_t *length, loff_t *ppos)
{
        proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
        max_huge_pages = set_max_huge_pages(max_huge_pages);
        return 0;
}
#endif /* CONFIG_SYSCTL */

int hugetlb_report_meminfo(char *buf)
{
        return sprintf(buf,
                "HugePages_Total: %5lu\n"
                "HugePages_Free:  %5lu\n"
                "Hugepagesize:    %5lu kB\n",
                nr_huge_pages,
                free_huge_pages,
                HPAGE_SIZE/1024);
}
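/*
 * Example /proc/meminfo fragment produced by the above (values are
 * illustrative; Hugepagesize is HPAGE_SIZE/1024 and so arch-dependent):
 *
 *      HugePages_Total:    64
 *      HugePages_Free:     32
 *      Hugepagesize:     4096 kB
 */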

int hugetlb_report_node_meminfo(int nid, char *buf)
{
        return sprintf(buf,
                "Node %d HugePages_Total: %5u\n"
                "Node %d HugePages_Free:  %5u\n",
                nid, nr_huge_pages_node[nid],
                nid, free_huge_pages_node[nid]);
}

int is_hugepage_mem_enough(size_t size)
{
        return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
}
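/*
 * (size + ~HPAGE_MASK) / HPAGE_SIZE rounds the request up to whole huge
 * pages.  For example, with 4 MB huge pages (HPAGE_SIZE is arch-dependent)
 * a 5 MB request needs two free huge pages, while 8 MB needs exactly two.
 */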

/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
        return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static struct page *hugetlb_nopage(struct vm_area_struct *vma,
                                   unsigned long address, int *unused)
{
        BUG();
        return NULL;
}

struct vm_operations_struct hugetlb_vm_ops = {
        .nopage = hugetlb_nopage,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
                           int writable)
{
        pte_t entry;

        if (writable) {
                entry =
                    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        } else {
                entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
        }
        entry = pte_mkyoung(entry);
        entry = pte_mkhuge(entry);

        return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
                                   unsigned long address, pte_t *ptep)
{
        pte_t entry;

        entry = pte_mkwrite(pte_mkdirty(*ptep));
        ptep_set_access_flags(vma, address, ptep, entry, 1);
        update_mmu_cache(vma, address, entry);
        lazy_mmu_prot_update(entry);
}

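/*
 * Called at fork() time to duplicate the parent's huge PTEs.  For private
 * (copy-on-write) mappings the parent's PTEs are write-protected here as
 * well, so either side takes a COW fault on its next write.
 */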
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
{
        pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr;
        int cow;

        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
                src_pte = huge_pte_offset(src, addr);
                if (!src_pte)
                        continue;
                dst_pte = huge_pte_alloc(dst, addr);
                if (!dst_pte)
                        goto nomem;
                spin_lock(&dst->page_table_lock);
                spin_lock(&src->page_table_lock);
                if (!pte_none(*src_pte)) {
                        if (cow)
                                ptep_set_wrprotect(src, addr, src_pte);
                        entry = *src_pte;
                        ptepage = pte_page(entry);
                        get_page(ptepage);
                        add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                }
                spin_unlock(&src->page_table_lock);
                spin_unlock(&dst->page_table_lock);
        }
        return 0;

nomem:
        return -ENOMEM;
}

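/*
 * Tear down the huge PTEs in [start, end): clear each PTE under
 * page_table_lock, drop the page references and rss accounting, and
 * flush the TLB once after the lock is released.
 */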
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *ptep;
        pte_t pte;
        struct page *page;

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON(start & ~HPAGE_MASK);
        BUG_ON(end & ~HPAGE_MASK);

        spin_lock(&mm->page_table_lock);

        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);

        for (address = start; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;

                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (pte_none(pte))
                        continue;

                page = pte_page(pte);
                put_page(page);
                add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
        }

        spin_unlock(&mm->page_table_lock);
        flush_tlb_range(vma, start, end);
}

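/*
 * Look the page up in the mapping's page cache, or allocate and (for shared
 * mappings) insert a fresh huge page.  The page is returned locked; NULL
 * means the pool or the hugetlbfs quota is exhausted.  -EEXIST from
 * add_to_page_cache() means another thread won the race, so retry the lookup.
 */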
static struct page *find_or_alloc_huge_page(struct address_space *mapping,
                                            unsigned long idx, int shared)
{
        struct page *page;
        int err;

retry:
        page = find_lock_page(mapping, idx);
        if (page)
                goto out;

        if (hugetlb_get_quota(mapping))
                goto out;
        page = alloc_huge_page();
        if (!page) {
                hugetlb_put_quota(mapping);
                goto out;
        }

        if (shared) {
                err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
                if (err) {
                        put_page(page);
                        hugetlb_put_quota(mapping);
                        if (err == -EEXIST)
                                goto retry;
                        page = NULL;
                }
        } else {
                /* Caller expects a locked page */
                lock_page(page);
        }
out:
        return page;
}

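/*
 * Handle a write fault on a private mapping.  If this mm holds the only
 * reference to the page, just make the PTE writable; otherwise copy into a
 * freshly allocated huge page.  Called with page_table_lock held; the lock
 * is dropped around the copy and the PTE is re-checked afterwards.
 */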
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                       unsigned long address, pte_t *ptep, pte_t pte)
{
        struct page *old_page, *new_page;
        int i, avoidcopy;

        old_page = pte_page(pte);

        /* If no-one else is actually using this page, avoid the copy
         * and just make the page writable */
        avoidcopy = (page_count(old_page) == 1);
        if (avoidcopy) {
                set_huge_ptep_writable(vma, address, ptep);
                return VM_FAULT_MINOR;
        }

        page_cache_get(old_page);
        new_page = alloc_huge_page();

        if (!new_page) {
                page_cache_release(old_page);

                /* Logically this is OOM, not a SIGBUS, but an OOM
                 * could cause the kernel to go killing other
                 * processes which won't help the hugepage situation
                 * at all (?) */
                return VM_FAULT_SIGBUS;
        }

        spin_unlock(&mm->page_table_lock);
        for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
                copy_user_highpage(new_page + i, old_page + i,
                                   address + i*PAGE_SIZE);
        spin_lock(&mm->page_table_lock);

        ptep = huge_pte_offset(mm, address & HPAGE_MASK);
        if (likely(pte_same(*ptep, pte))) {
                /* Break COW */
                set_huge_pte_at(mm, address, ptep,
                                make_huge_pte(vma, new_page, 1));
                /* Make the old page be freed below */
                new_page = old_page;
        }
        page_cache_release(new_page);
        page_cache_release(old_page);
        return VM_FAULT_MINOR;
}

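/*
 * Fault in a page that has no PTE yet: find or allocate the page, check the
 * index against i_size under page_table_lock (backing out if a truncate
 * raced with us), install the new PTE, and for private write faults do the
 * COW copy immediately to avoid taking a second fault.
 */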
int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                    unsigned long address, pte_t *ptep, int write_access)
{
        int ret = VM_FAULT_SIGBUS;
        unsigned long idx;
        unsigned long size;
        struct page *page;
        struct address_space *mapping;
        pte_t new_pte;

        mapping = vma->vm_file->f_mapping;
        idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
                + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));

        /*
         * Use page lock to guard against racing truncation
         * before we get page_table_lock.
         */
        page = find_or_alloc_huge_page(mapping, idx,
                        vma->vm_flags & VM_SHARED);
        if (!page)
                goto out;

        BUG_ON(!PageLocked(page));

        spin_lock(&mm->page_table_lock);
        size = i_size_read(mapping->host) >> HPAGE_SHIFT;
        if (idx >= size)
                goto backout;

        ret = VM_FAULT_MINOR;
        if (!pte_none(*ptep))
                goto backout;

        add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
        set_huge_pte_at(mm, address, ptep, new_pte);

        if (write_access && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
                ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
        }

        spin_unlock(&mm->page_table_lock);
        unlock_page(page);
out:
        return ret;

backout:
        spin_unlock(&mm->page_table_lock);
        hugetlb_put_quota(mapping);
        unlock_page(page);
        put_page(page);
        goto out;
}

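/*
 * Top-level hugetlb fault handler: allocate the PTE slot, hand missing
 * pages to hugetlb_no_page(), and break COW on write faults against
 * read-only PTEs, re-checking the PTE under page_table_lock to guard
 * against a racing update.
 */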
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                  unsigned long address, int write_access)
{
        pte_t *ptep;
        pte_t entry;
        int ret;

        ptep = huge_pte_alloc(mm, address);
        if (!ptep)
                return VM_FAULT_OOM;

        entry = *ptep;
        if (pte_none(entry))
                return hugetlb_no_page(mm, vma, address, ptep, write_access);

        ret = VM_FAULT_MINOR;

        spin_lock(&mm->page_table_lock);
        /* Check for a racing update before calling hugetlb_cow */
        if (likely(pte_same(entry, *ptep)))
                if (write_access && !pte_write(entry))
                        ret = hugetlb_cow(mm, vma, address, ptep, entry);
        spin_unlock(&mm->page_table_lock);

        return ret;
}

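/*
 * get_user_pages() back end for hugetlb VMAs: walk the range in PAGE_SIZE
 * steps, faulting pages in where necessary, and hand back pointers to the
 * individual sub-pages of each huge page.
 */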
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct page **pages, struct vm_area_struct **vmas,
                        unsigned long *position, int *length, int i)
{
        unsigned long vpfn, vaddr = *position;
        int remainder = *length;

        vpfn = vaddr/PAGE_SIZE;
        spin_lock(&mm->page_table_lock);
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
                struct page *page;

                /*
                 * Some archs (sparc64, sh*) have multiple pte_ts to
                 * each hugepage.  We have to make sure we get the
                 * first, for the page indexing below to work.
                 */
                pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);

                if (!pte || pte_none(*pte)) {
                        int ret;

                        spin_unlock(&mm->page_table_lock);
                        ret = hugetlb_fault(mm, vma, vaddr, 0);
                        spin_lock(&mm->page_table_lock);
                        if (ret == VM_FAULT_MINOR)
                                continue;

                        remainder = 0;
                        if (!i)
                                i = -EFAULT;
                        break;
                }

                if (pages) {
                        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
                        get_page(page);
                        pages[i] = page;
                }

                if (vmas)
                        vmas[i] = vma;

                vaddr += PAGE_SIZE;
                ++vpfn;
                --remainder;
                ++i;
        }
        spin_unlock(&mm->page_table_lock);
        *length = remainder;
        *position = vaddr;

        return i;
}