hugetlb: allow huge page mappings to be created without reservations
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab171274ef217817291667bee226b45bebe259e3..72acbb29d2cc68ca50161af90d3cbe042c9852aa 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,253 @@ static int hugetlb_next_nid;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+/*
+ * Region tracking -- allows tracking of reservations and instantiated pages
+ *                    across the pages in a mapping.
+ */
+struct file_region {
+       struct list_head link;
+       long from;
+       long to;
+};
+
+static long region_add(struct list_head *head, long f, long t)
+{
+       struct file_region *rg, *nrg, *trg;
+
+       /* Locate the region we are either in or before. */
+       list_for_each_entry(rg, head, link)
+               if (f <= rg->to)
+                       break;
+
+       /* Round our left edge to the current segment if it encloses us. */
+       if (f > rg->from)
+               f = rg->from;
+
+       /* Check for and consume any regions we now overlap with. */
+       nrg = rg;
+       list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+               if (&rg->link == head)
+                       break;
+               if (rg->from > t)
+                       break;
+
+               /* If this area reaches higher, then extend our area to
+                * include it completely.  If this is not the first area
+                * which we intend to reuse, free it. */
+               if (rg->to > t)
+                       t = rg->to;
+               if (rg != nrg) {
+                       list_del(&rg->link);
+                       kfree(rg);
+               }
+       }
+       nrg->from = f;
+       nrg->to = t;
+       return 0;
+}
+
+static long region_chg(struct list_head *head, long f, long t)
+{
+       struct file_region *rg, *nrg;
+       long chg = 0;
+
+       /* Locate the region we are before or in. */
+       list_for_each_entry(rg, head, link)
+               if (f <= rg->to)
+                       break;
+
+       /* If we are below the current region then a new region is required.
+        * Subtle: allocate a new region at the position but make it zero
+        * size such that we can guarantee to record the reservation. */
+       if (&rg->link == head || t < rg->from) {
+               nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+               if (!nrg)
+                       return -ENOMEM;
+               nrg->from = f;
+               nrg->to   = f;
+               INIT_LIST_HEAD(&nrg->link);
+               list_add(&nrg->link, rg->link.prev);
+
+               return t - f;
+       }
+
+       /* Round our left edge to the current segment if it encloses us. */
+       if (f > rg->from)
+               f = rg->from;
+       chg = t - f;
+
+       /* Check for and consume any regions we now overlap with. */
+       list_for_each_entry(rg, rg->link.prev, link) {
+               if (&rg->link == head)
+                       break;
+               if (rg->from > t)
+                       return chg;
+
+               /* We overlap with this area; if it extends further than
+                * us then we must extend ourselves.  Account for its
+                * existing reservation. */
+               if (rg->to > t) {
+                       chg += rg->to - t;
+                       t = rg->to;
+               }
+               chg -= rg->to - rg->from;
+       }
+       return chg;
+}
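+
+/*
+ * For illustration: with a single region [0,3) on the list,
+ * region_chg(head, 2, 5) rounds the requested range out to [0,5),
+ * subtracts the three pages already covered by [0,3) and returns 2, as
+ * only pages 3 and 4 need new reservations.  The matching
+ * region_add(head, 2, 5) then commits the change by growing the existing
+ * region to [0,5).
+ */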
+
+static long region_truncate(struct list_head *head, long end)
+{
+       struct file_region *rg, *trg;
+       long chg = 0;
+
+       /* Locate the region we are either in or before. */
+       list_for_each_entry(rg, head, link)
+               if (end <= rg->to)
+                       break;
+       if (&rg->link == head)
+               return 0;
+
+       /* If we are in the middle of a region then adjust it. */
+       if (end > rg->from) {
+               chg = rg->to - end;
+               rg->to = end;
+               rg = list_entry(rg->link.next, typeof(*rg), link);
+       }
+
+       /* Drop any remaining regions. */
+       list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+               if (&rg->link == head)
+                       break;
+               chg += rg->to - rg->from;
+               list_del(&rg->link);
+               kfree(rg);
+       }
+       return chg;
+}
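+
+/*
+ * For illustration: with regions [0,2) and [4,8) on the list,
+ * region_truncate(head, 5) trims the second region back to [4,5), drops
+ * nothing else and returns 3, the number of reserved pages (5, 6 and 7)
+ * that the caller can now release.
+ */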
+
+/*
+ * Convert the address within this vma to the page offset within
+ * the mapping, in base page units.
+ */
+static pgoff_t vma_page_offset(struct vm_area_struct *vma,
+                               unsigned long address)
+{
+       return ((address - vma->vm_start) >> PAGE_SHIFT) +
+                                       (vma->vm_pgoff >> PAGE_SHIFT);
+}
+
+/*
+ * Convert the address within this vma to the page offset within
+ * the mapping, in pagecache page units; huge pages here.
+ */
+static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma,
+                                       unsigned long address)
+{
+       return ((address - vma->vm_start) >> HPAGE_SHIFT) +
+                       (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+}
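+
+/*
+ * For example, assuming 4K base pages, 2M huge pages (HPAGE_SHIFT == 21)
+ * and vm_pgoff == 0: an address 2M past vm_start has a
+ * vma_pagecache_offset() of 1 (the second huge page in the file), while
+ * vma_page_offset() for the same address is 512 base pages.
+ */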
+
+#define HPAGE_RESV_OWNER    (1UL << (BITS_PER_LONG - 1))
+#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have its future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the reset.
+ */
+static unsigned long get_vma_private_data(struct vm_area_struct *vma)
+{
+       return (unsigned long)vma->vm_private_data;
+}
+
+static void set_vma_private_data(struct vm_area_struct *vma,
+                                                       unsigned long value)
+{
+       vma->vm_private_data = (void *)value;
+}
+
+static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       if (!(vma->vm_flags & VM_SHARED))
+               return get_vma_private_data(vma) & ~HPAGE_RESV_MASK;
+       return 0;
+}
+
+static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
+                                                       unsigned long reserve)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+       set_vma_private_data(vma,
+               (get_vma_private_data(vma) & HPAGE_RESV_MASK) | reserve);
+}
+
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+       set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+}
+
+static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+
+       return (get_vma_private_data(vma) & flag) != 0;
+}
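+
+/*
+ * The reserve count and the HPAGE_RESV_* flags share vma->vm_private_data:
+ * the count lives in the low bits and the flags in the top two bits.  For
+ * example, set_vma_resv_huge_pages(vma, 16) followed by
+ * set_vma_resv_flags(vma, HPAGE_RESV_OWNER) leaves vm_private_data equal
+ * to (HPAGE_RESV_OWNER | 16), and vma_resv_huge_pages() returns 16.
+ */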
+
+/* Decrement the reserved pages in the hugepage pool by one */
+static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_NORESERVE)
+               return;
+
+       if (vma->vm_flags & VM_SHARED) {
+               /* Shared mappings always use reserves */
+               resv_huge_pages--;
+       } else {
+               /*
+                * Only the process that called mmap() has reserves for
+                * private mappings.
+                */
+               if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+                       unsigned long flags, reserve;
+                       resv_huge_pages--;
+                       flags = (unsigned long)vma->vm_private_data &
+                                                       HPAGE_RESV_MASK;
+                       reserve = (unsigned long)vma->vm_private_data - 1;
+                       vma->vm_private_data = (void *)(reserve | flags);
+               }
+       }
+}
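+
+/*
+ * Note that, provided the reserve count is non-zero, the open-coded update
+ * above is equivalent to
+ * set_vma_resv_huge_pages(vma, vma_resv_huge_pages(vma) - 1): the count
+ * drops by one while the HPAGE_RESV_* flag bits are preserved.
+ */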
+
+/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       if (!(vma->vm_flags & VM_SHARED))
+               vma->vm_private_data = (void *)0;
+}
+
+/* Returns true if the VMA has associated reserve pages */
+static int vma_has_private_reserves(struct vm_area_struct *vma)
+{
+       if (vma->vm_flags & VM_SHARED)
+               return 0;
+       if (!vma_resv_huge_pages(vma))
+               return 0;
+       return 1;
+}
+
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
        int i;
@@ -90,7 +337,7 @@ static struct page *dequeue_huge_page(void)
 }
 
 static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
-                               unsigned long address)
+                               unsigned long address, int avoid_reserve)
 {
        int nid;
        struct page *page = NULL;
@@ -101,6 +348,19 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
        struct zone *zone;
        struct zoneref *z;
 
+       /*
+        * A child process with MAP_PRIVATE mappings created by its parent
+        * has no page reserves. This check ensures that reservations are
+        * not "stolen". The child may still get SIGKILLed.
+        */
+       if (!vma_has_private_reserves(vma) &&
+                       free_huge_pages - resv_huge_pages == 0)
+               return NULL;
+
+       /* If reserves cannot be used, ensure enough pages are in the pool */
+       if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
+               return NULL;
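+       /*
+        * In both checks above, free_huge_pages - resv_huge_pages is the
+        * number of free huge pages not pledged to any reservation; e.g.
+        * with 10 free pages of which 8 are reserved, only 2 may be handed
+        * out to allocations that cannot dip into the reserves.
+        */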
+
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                                MAX_NR_ZONES - 1, nodemask) {
                nid = zone_to_nid(zone);
@@ -111,8 +371,10 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
                        list_del(&page->lru);
                        free_huge_pages--;
                        free_huge_pages_node[nid]--;
-                       if (vma && vma->vm_flags & VM_MAYSHARE)
-                               resv_huge_pages--;
+
+                       if (!avoid_reserve)
+                               decrement_hugepage_resv_vma(vma);
+
                        break;
                }
        }
@@ -461,55 +723,83 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
        }
 }
 
+/*
+ * Determine if the huge page at addr within the vma has an associated
+ * reservation.  Where it does not, we will need to logically increase the
+ * reservation and actually increase the quota before an allocation can
+ * occur.  Where a new reservation would be required, the reservation change
+ * is prepared but not committed.  Once the page has been charged against
+ * quota, allocated and instantiated, the change should be committed via
+ * vma_commit_reservation().
+ * No action is required on failure.
+ */
+static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
+{
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct inode *inode = mapping->host;
+
+       if (vma->vm_flags & VM_SHARED) {
+               pgoff_t idx = vma_pagecache_offset(vma, addr);
+               return region_chg(&inode->i_mapping->private_list,
+                                                       idx, idx + 1);
 
-static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
-                                               unsigned long addr)
+       } else {
+               if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+                       return 1;
+       }
+
+       return 0;
+}
+
+static void vma_commit_reservation(struct vm_area_struct *vma,
+                                                       unsigned long addr)
 {
-       struct page *page;
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct inode *inode = mapping->host;
 
-       spin_lock(&hugetlb_lock);
-       page = dequeue_huge_page_vma(vma, addr);
-       spin_unlock(&hugetlb_lock);
-       return page ? page : ERR_PTR(-VM_FAULT_OOM);
+       if (vma->vm_flags & VM_SHARED) {
+               pgoff_t idx = vma_pagecache_offset(vma, addr);
+               region_add(&inode->i_mapping->private_list, idx, idx + 1);
+       }
 }
 
-static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
-                                               unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+                                   unsigned long addr, int avoid_reserve)
 {
-       struct page *page = NULL;
+       struct page *page;
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       struct inode *inode = mapping->host;
+       long chg;
 
-       if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
-               return ERR_PTR(-VM_FAULT_SIGBUS);
+       /*
+        * Processes that did not create the mapping will have no reserves and
+        * will not have accounted against quota. Check that the quota charge
+        * can be made before satisfying the allocation.
+        * MAP_NORESERVE mappings may also need pages and quota allocated
+        * if no reserve mapping overlaps.
+        */
+       chg = vma_needs_reservation(vma, addr);
+       if (chg < 0)
+               return ERR_PTR(chg);
+       if (chg)
+               if (hugetlb_get_quota(inode->i_mapping, chg))
+                       return ERR_PTR(-ENOSPC);
 
        spin_lock(&hugetlb_lock);
-       if (free_huge_pages > resv_huge_pages)
-               page = dequeue_huge_page_vma(vma, addr);
+       page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
        spin_unlock(&hugetlb_lock);
+
        if (!page) {
                page = alloc_buddy_huge_page(vma, addr);
                if (!page) {
-                       hugetlb_put_quota(vma->vm_file->f_mapping, 1);
+                       hugetlb_put_quota(inode->i_mapping, chg);
                        return ERR_PTR(-VM_FAULT_OOM);
                }
        }
-       return page;
-}
 
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
-                                   unsigned long addr)
-{
-       struct page *page;
-       struct address_space *mapping = vma->vm_file->f_mapping;
+       set_page_refcounted(page);
+       set_page_private(page, (unsigned long) mapping);
 
-       if (vma->vm_flags & VM_MAYSHARE)
-               page = alloc_huge_page_shared(vma, addr);
-       else
-               page = alloc_huge_page_private(vma, addr);
+       vma_commit_reservation(vma, addr);
 
-       if (!IS_ERR(page)) {
-               set_page_refcounted(page);
-               set_page_private(page, (unsigned long) mapping);
-       }
        return page;
 }
 
@@ -603,7 +893,6 @@ static unsigned long set_max_huge_pages(unsigned long count)
        }
 
        while (count > persistent_huge_pages) {
-               int ret;
                /*
                 * If this allocation races such that we no longer need the
                 * page, free_huge_page will handle it by freeing the page
@@ -717,6 +1006,54 @@ unsigned long hugetlb_total_pages(void)
        return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
 }
 
+static int hugetlb_acct_memory(long delta)
+{
+       int ret = -ENOMEM;
+
+       spin_lock(&hugetlb_lock);
+       /*
+        * When cpuset is configured, it breaks the strict hugetlb page
+        * reservation as the accounting is done on a global variable. Such
+        * reservation is completely rubbish in the presence of cpuset because
+        * the reservation is not checked against page availability for the
+        * current cpuset. Applications can still potentially be OOM'ed by
+        * the kernel due to a lack of free HugeTLB pages in the cpuset that
+        * the task is in. Attempting to enforce strict accounting with
+        * cpusets is almost impossible (or too ugly) because cpusets are so
+        * fluid that tasks or memory nodes can be dynamically moved between
+        * cpusets.
+        *
+        * The change of semantics for shared hugetlb mapping with cpuset is
+        * undesirable. However, in order to preserve some of the semantics,
+        * we fall back to checking against the current free page availability
+        * as a best attempt, hopefully minimizing the impact of the semantic
+        * change that cpusets introduce.
+        */
+       if (delta > 0) {
+               if (gather_surplus_pages(delta) < 0)
+                       goto out;
+
+               if (delta > cpuset_mems_nr(free_huge_pages_node)) {
+                       return_unused_surplus_pages(delta);
+                       goto out;
+               }
+       }
+
+       ret = 0;
+       if (delta < 0)
+               return_unused_surplus_pages((unsigned long) -delta);
+
+out:
+       spin_unlock(&hugetlb_lock);
+       return ret;
+}
+
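+/*
+ * Called when a hugetlb VMA is torn down.  For a MAP_PRIVATE mapping, any
+ * reservation still recorded in vm_private_data was never consumed by a
+ * fault, so hand it back to the pool by reversing the hugetlb_acct_memory()
+ * charge made when the reservation was taken out.
+ */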
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+       unsigned long reserve = vma_resv_huge_pages(vma);
+       if (reserve)
+               hugetlb_acct_memory(-reserve);
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all.  They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -731,6 +1068,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 struct vm_operations_struct hugetlb_vm_ops = {
        .fault = hugetlb_vm_op_fault,
+       .close = hugetlb_vm_op_close,
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -804,7 +1142,7 @@ nomem:
 }
 
 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-                           unsigned long end)
+                           unsigned long end, struct page *ref_page)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
@@ -832,6 +1170,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                if (huge_pmd_unshare(mm, &address, ptep))
                        continue;
 
+               /*
+                * If a reference page is supplied, it is because a specific
+                * page is being unmapped, not a range. Ensure the page we
+                * are about to unmap is the actual page of interest.
+                */
+               if (ref_page) {
+                       pte = huge_ptep_get(ptep);
+                       if (huge_pte_none(pte))
+                               continue;
+                       page = pte_page(pte);
+                       if (page != ref_page)
+                               continue;
+
+                       /*
+                        * Mark the VMA as having unmapped its page so that
+                        * future faults in this VMA will fail rather than
+                        * looking like data was lost.
+                        */
+                       set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+               }
+
                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (huge_pte_none(pte))
                        continue;
@@ -850,7 +1209,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-                         unsigned long end)
+                         unsigned long end, struct page *ref_page)
 {
        /*
         * It is undesirable to test vma->vm_file as it should be non-null
@@ -862,19 +1221,68 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
         */
        if (vma->vm_file) {
                spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
-               __unmap_hugepage_range(vma, start, end);
+               __unmap_hugepage_range(vma, start, end, ref_page);
                spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
        }
 }
 
+/*
+ * This is called when the original mapper fails to COW a MAP_PRIVATE
+ * mapping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ */
+int unmap_ref_private(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       struct page *page,
+                                       unsigned long address)
+{
+       struct vm_area_struct *iter_vma;
+       struct address_space *mapping;
+       struct prio_tree_iter iter;
+       pgoff_t pgoff;
+
+       /*
+        * vm_pgoff is in PAGE_SIZE units, hence the different calculation
+        * from page cache lookup which is in HPAGE_SIZE units.
+        */
+       address = address & huge_page_mask(hstate_vma(vma));
+       pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
+               + (vma->vm_pgoff >> PAGE_SHIFT);
+       mapping = (struct address_space *)page_private(page);
+
+       vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+               /* Do not unmap the current VMA */
+               if (iter_vma == vma)
+                       continue;
+
+               /*
+                * Unmap the page from other VMAs without their own reserves.
+                * They get marked to be SIGKILLed if they fault in these
+                * areas. This is because a future no-page fault on this VMA
+                * could insert a zeroed page instead of the data that existed
+                * at the time of fork. This would look like data corruption.
+                */
+               if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+                       unmap_hugepage_range(iter_vma,
+                               address, address + HPAGE_SIZE,
+                               page);
+       }
+
+       return 1;
+}
+
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-                       unsigned long address, pte_t *ptep, pte_t pte)
+                       unsigned long address, pte_t *ptep, pte_t pte,
+                       struct page *pagecache_page)
 {
        struct page *old_page, *new_page;
        int avoidcopy;
+       int outside_reserve = 0;
 
        old_page = pte_page(pte);
 
+retry_avoidcopy:
        /* If no-one else is actually using this page, avoid the copy
         * and just make the page writable */
        avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1291,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                return 0;
        }
 
+       /*
+        * If the process that created a MAP_PRIVATE mapping is about to
+        * perform a COW due to a shared page count, attempt to satisfy
+        * the allocation without using the existing reserves. The pagecache
+        * page is used to determine if the reserve at this address was
+        * consumed or not. If reserves were used, a partially faulted mapping
+        * at the time of fork() could consume its reserves on COW instead
+        * of the full address range.
+        */
+       if (!(vma->vm_flags & VM_SHARED) &&
+                       is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+                       old_page != pagecache_page)
+               outside_reserve = 1;
+
        page_cache_get(old_page);
-       new_page = alloc_huge_page(vma, address);
+       new_page = alloc_huge_page(vma, address, outside_reserve);
 
        if (IS_ERR(new_page)) {
                page_cache_release(old_page);
+
+               /*
+                * If a process owning a MAP_PRIVATE mapping fails to COW,
+                * it is due to references held by a child and an insufficient
+                * huge page pool. To guarantee the original mapper's
+                * reliability, unmap the page from child processes. The child
+                * may get SIGKILLed if it later faults.
+                */
+               if (outside_reserve) {
+                       BUG_ON(huge_pte_none(pte));
+                       if (unmap_ref_private(mm, vma, old_page, address)) {
+                               BUG_ON(page_count(old_page) != 1);
+                               BUG_ON(huge_pte_none(pte));
+                               goto retry_avoidcopy;
+                       }
+                       WARN_ON_ONCE(1);
+               }
+
                return -PTR_ERR(new_page);
        }
 
@@ -910,19 +1350,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
        return 0;
 }
 
+/* Return the pagecache page at a given address within a VMA */
+static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
+                       unsigned long address)
+{
+       struct address_space *mapping;
+       pgoff_t idx;
+
+       mapping = vma->vm_file->f_mapping;
+       idx = vma_pagecache_offset(vma, address);
+
+       return find_lock_page(mapping, idx);
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, int write_access)
 {
        int ret = VM_FAULT_SIGBUS;
-       unsigned long idx;
+       pgoff_t idx;
        unsigned long size;
        struct page *page;
        struct address_space *mapping;
        pte_t new_pte;
 
+       /*
+        * Currently, we are forced to kill the process in the event the
+        * original mapper has unmapped pages from the child due to a failed
+        * COW. Warn that such a situation has occurred as it may not be
+        * obvious.
+        */
+       if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+               printk(KERN_WARNING
+                       "PID %d killed due to inadequate hugepage pool\n",
+                       current->pid);
+               return ret;
+       }
+
        mapping = vma->vm_file->f_mapping;
-       idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
-               + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+       idx = vma_pagecache_offset(vma, address);
 
        /*
         * Use page lock to guard against racing truncation
@@ -934,7 +1398,7 @@ retry:
                size = i_size_read(mapping->host) >> HPAGE_SHIFT;
                if (idx >= size)
                        goto out;
-               page = alloc_huge_page(vma, address);
+               page = alloc_huge_page(vma, address, 0);
                if (IS_ERR(page)) {
                        ret = -PTR_ERR(page);
                        goto out;
@@ -976,7 +1440,7 @@ retry:
 
        if (write_access && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
-               ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+               ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
        }
 
        spin_unlock(&mm->page_table_lock);
@@ -1021,8 +1485,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        spin_lock(&mm->page_table_lock);
        /* Check for a racing update before calling hugetlb_cow */
        if (likely(pte_same(entry, huge_ptep_get(ptep))))
-               if (write_access && !pte_write(entry))
-                       ret = hugetlb_cow(mm, vma, address, ptep, entry);
+               if (write_access && !pte_write(entry)) {
+                       struct page *page;
+                       page = hugetlbfs_pagecache_page(vma, address);
+                       ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
+                       if (page) {
+                               unlock_page(page);
+                               put_page(page);
+                       }
+               }
        spin_unlock(&mm->page_table_lock);
        mutex_unlock(&hugetlb_instantiation_mutex);
 
@@ -1128,173 +1599,29 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
        flush_tlb_range(vma, start, end);
 }
 
-struct file_region {
-       struct list_head link;
-       long from;
-       long to;
-};
-
-static long region_add(struct list_head *head, long f, long t)
-{
-       struct file_region *rg, *nrg, *trg;
-
-       /* Locate the region we are either in or before. */
-       list_for_each_entry(rg, head, link)
-               if (f <= rg->to)
-                       break;
-
-       /* Round our left edge to the current segment if it encloses us. */
-       if (f > rg->from)
-               f = rg->from;
-
-       /* Check for and consume any regions we now overlap with. */
-       nrg = rg;
-       list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-               if (&rg->link == head)
-                       break;
-               if (rg->from > t)
-                       break;
-
-               /* If this area reaches higher then extend our area to
-                * include it completely.  If this is not the first area
-                * which we intend to reuse, free it. */
-               if (rg->to > t)
-                       t = rg->to;
-               if (rg != nrg) {
-                       list_del(&rg->link);
-                       kfree(rg);
-               }
-       }
-       nrg->from = f;
-       nrg->to = t;
-       return 0;
-}
-
-static long region_chg(struct list_head *head, long f, long t)
+int hugetlb_reserve_pages(struct inode *inode,
+                                       long from, long to,
+                                       struct vm_area_struct *vma)
 {
-       struct file_region *rg, *nrg;
-       long chg = 0;
-
-       /* Locate the region we are before or in. */
-       list_for_each_entry(rg, head, link)
-               if (f <= rg->to)
-                       break;
-
-       /* If we are below the current region then a new region is required.
-        * Subtle, allocate a new region at the position but make it zero
-        * size such that we can guarantee to record the reservation. */
-       if (&rg->link == head || t < rg->from) {
-               nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
-               if (!nrg)
-                       return -ENOMEM;
-               nrg->from = f;
-               nrg->to   = f;
-               INIT_LIST_HEAD(&nrg->link);
-               list_add(&nrg->link, rg->link.prev);
-
-               return t - f;
-       }
-
-       /* Round our left edge to the current segment if it encloses us. */
-       if (f > rg->from)
-               f = rg->from;
-       chg = t - f;
-
-       /* Check for and consume any regions we now overlap with. */
-       list_for_each_entry(rg, rg->link.prev, link) {
-               if (&rg->link == head)
-                       break;
-               if (rg->from > t)
-                       return chg;
-
-               /* We overlap with this area, if it extends futher than
-                * us then we must extend ourselves.  Account for its
-                * existing reservation. */
-               if (rg->to > t) {
-                       chg += rg->to - t;
-                       t = rg->to;
-               }
-               chg -= rg->to - rg->from;
-       }
-       return chg;
-}
-
-static long region_truncate(struct list_head *head, long end)
-{
-       struct file_region *rg, *trg;
-       long chg = 0;
+       long ret, chg;
 
-       /* Locate the region we are either in or before. */
-       list_for_each_entry(rg, head, link)
-               if (end <= rg->to)
-                       break;
-       if (&rg->link == head)
+       if (vma && vma->vm_flags & VM_NORESERVE)
                return 0;
 
-       /* If we are in the middle of a region then adjust it. */
-       if (end > rg->from) {
-               chg = rg->to - end;
-               rg->to = end;
-               rg = list_entry(rg->link.next, typeof(*rg), link);
-       }
-
-       /* Drop any remaining regions. */
-       list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-               if (&rg->link == head)
-                       break;
-               chg += rg->to - rg->from;
-               list_del(&rg->link);
-               kfree(rg);
-       }
-       return chg;
-}
-
-static int hugetlb_acct_memory(long delta)
-{
-       int ret = -ENOMEM;
-
-       spin_lock(&hugetlb_lock);
        /*
-        * When cpuset is configured, it breaks the strict hugetlb page
-        * reservation as the accounting is done on a global variable. Such
-        * reservation is completely rubbish in the presence of cpuset because
-        * the reservation is not checked against page availability for the
-        * current cpuset. Application can still potentially OOM'ed by kernel
-        * with lack of free htlb page in cpuset that the task is in.
-        * Attempt to enforce strict accounting with cpuset is almost
-        * impossible (or too ugly) because cpuset is too fluid that
-        * task or memory node can be dynamically moved between cpusets.
-        *
-        * The change of semantics for shared hugetlb mapping with cpuset is
-        * undesirable. However, in order to preserve some of the semantics,
-        * we fall back to check against current free page availability as
-        * a best attempt and hopefully to minimize the impact of changing
-        * semantics that cpuset has.
+        * Shared mappings base their reservation on the number of pages that
+        * are already allocated on behalf of the file. Private mappings need
+        * to reserve the full area even if read-only as mprotect() may be
+        * called to make the mapping read-write. Assume !vma is a shm mapping.
         */
-       if (delta > 0) {
-               if (gather_surplus_pages(delta) < 0)
-                       goto out;
-
-               if (delta > cpuset_mems_nr(free_huge_pages_node)) {
-                       return_unused_surplus_pages(delta);
-                       goto out;
-               }
+       if (!vma || vma->vm_flags & VM_SHARED)
+               chg = region_chg(&inode->i_mapping->private_list, from, to);
+       else {
+               chg = to - from;
+               set_vma_resv_huge_pages(vma, chg);
+               set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
        }
 
-       ret = 0;
-       if (delta < 0)
-               return_unused_surplus_pages((unsigned long) -delta);
-
-out:
-       spin_unlock(&hugetlb_lock);
-       return ret;
-}
-
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
-{
-       long ret, chg;
-
-       chg = region_chg(&inode->i_mapping->private_list, from, to);
        if (chg < 0)
                return chg;
 
@@ -1305,7 +1632,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
                hugetlb_put_quota(inode->i_mapping, chg);
                return ret;
        }
-       region_add(&inode->i_mapping->private_list, from, to);
+       if (!vma || vma->vm_flags & VM_SHARED)
+               region_add(&inode->i_mapping->private_list, from, to);
        return 0;
 }