diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 418bf01a50ed1f9dde0ca6c083aafa86c1869f23..c7025c132670a4d8e3279d1ebb7730718fb6aa8a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1773,23 +1773,32 @@ free:
 }
 
 /*
- * When releasing a hugetlb pool reservation, any surplus pages that were
- * allocated to satisfy the reservation must be explicitly freed if they were
- * never used.
- * Called with hugetlb_lock held.
+ * This routine has two main purposes:
+ * 1) Decrement the reservation count (resv_huge_pages) by the value passed
+ *    in unused_resv_pages.  This corresponds to the prior adjustments made
+ *    to the associated reservation map.
+ * 2) Free any unused surplus pages that may have been allocated to satisfy
+ *    the reservation.  As many as unused_resv_pages may be freed.
+ *
+ * Called with hugetlb_lock held.  However, the lock could be dropped (and
+ * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
+ * we must make sure nobody else can claim pages we are in the process of
+ * freeing.  Do this by ensuring resv_huge_pages is always at least as large
+ * as the number of huge pages we still plan to free when the lock is dropped.
  */
 static void return_unused_surplus_pages(struct hstate *h,
                                        unsigned long unused_resv_pages)
 {
        unsigned long nr_pages;
 
-       /* Uncommit the reservation */
-       h->resv_huge_pages -= unused_resv_pages;
-
        /* Cannot return gigantic pages currently */
        if (hstate_is_gigantic(h))
-               return;
+               goto out;
 
+       /*
+        * Part (or even all) of the reservation could have been backed
+        * by pre-allocated pages. Only free surplus pages.
+        */
        nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
        /*
@@ -1799,12 +1808,22 @@ static void return_unused_surplus_pages(struct hstate *h,
         * when the nodes with surplus pages have no free pages.
         * free_pool_huge_page() will balance the freed pages across the
         * on-line nodes with memory and will handle the hstate accounting.
+        *
+        * Note that we decrement resv_huge_pages as we free the pages.  If
+        * we drop the lock, resv_huge_pages will still be sufficiently large
+        * to cover subsequent pages we may free.
         */
        while (nr_pages--) {
+               h->resv_huge_pages--;
+               unused_resv_pages--;
                if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
-                       break;
+                       goto out;
                cond_resched_lock(&hugetlb_lock);
        }
+
+out:
+       /* Fully uncommit the reservation */
+       h->resv_huge_pages -= unused_resv_pages;
 }
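
For readability, this is how return_unused_surplus_pages() reads once both hunks above are applied (assembled directly from this diff; the unchanged middle of the loop comment is elided):

static void return_unused_surplus_pages(struct hstate *h,
					unsigned long unused_resv_pages)
{
	unsigned long nr_pages;

	/* Cannot return gigantic pages currently */
	if (hstate_is_gigantic(h))
		goto out;

	/*
	 * Part (or even all) of the reservation could have been backed
	 * by pre-allocated pages. Only free surplus pages.
	 */
	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);

	/*
	 * ... (loop comment unchanged, elided) ...
	 *
	 * Note that we decrement resv_huge_pages as we free the pages.  If
	 * we drop the lock, resv_huge_pages will still be sufficiently large
	 * to cover subsequent pages we may free.
	 */
	while (nr_pages--) {
		h->resv_huge_pages--;
		unused_resv_pages--;
		if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
			goto out;
		cond_resched_lock(&hugetlb_lock);
	}

out:
	/* Fully uncommit the reservation */
	h->resv_huge_pages -= unused_resv_pages;
}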
 
 
@@ -3286,6 +3305,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
        BUG_ON(start & ~huge_page_mask(h));
        BUG_ON(end & ~huge_page_mask(h));
 
+       /*
+        * This is a hugetlb vma; all the pte entries should point
+        * to huge pages.
+        */
+       tlb_remove_check_page_size_change(tlb, sz);
        tlb_start_vma(tlb, vma);
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        address = start;
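
The tlb_remove_check_page_size_change() call is new in this path. Its definition is not part of this diff; the intent, as the name suggests, is to keep one mmu_gather batch from mixing page sizes. A minimal sketch of such a hook, assuming the mmu_gather tracks a page_size field, might look like the following (illustrative only, not the upstream definition):

static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
						     unsigned int page_size)
{
	/* Flush the pending batch if the page size being unmapped changed. */
	if (tlb->page_size && tlb->page_size != page_size)
		tlb_flush_mmu(tlb);
	tlb->page_size = page_size;
}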
@@ -3336,7 +3360,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                }
 
                pte = huge_ptep_get_and_clear(mm, address, ptep);
-               tlb_remove_tlb_entry(tlb, ptep, address);
+               tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
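
tlb_remove_huge_tlb_entry() replaces the PAGE_SIZE-oriented tlb_remove_tlb_entry() so the gather range grows by the huge page size rather than by PAGE_SIZE. The generic helper is expected to be roughly the following (an assumption based on the asm-generic form; architectures may override __tlb_remove_tlb_entry()):

#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)		\
	do {								\
		__tlb_adjust_range(tlb, address, huge_page_size(h));	\
		__tlb_remove_tlb_entry(tlb, ptep, address);		\
	} while (0)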
 
@@ -3450,15 +3474,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-                       unsigned long address, pte_t *ptep, pte_t pte,
-                       struct page *pagecache_page, spinlock_t *ptl)
+                      unsigned long address, pte_t *ptep,
+                      struct page *pagecache_page, spinlock_t *ptl)
 {
+       pte_t pte;
        struct hstate *h = hstate_vma(vma);
        struct page *old_page, *new_page;
        int ret = 0, outside_reserve = 0;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */
 
+       pte = huge_ptep_get(ptep);
        old_page = pte_page(pte);
 
 retry_avoidcopy:
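
With the pte parameter dropped, hugetlb_cow() now reads the PTE itself from ptep under the page-table lock instead of relying on a value captured earlier by the caller. For reference, the generic fallback for huge_ptep_get() is assumed here to be a plain dereference (architectures may provide their own):

static inline pte_t huge_ptep_get(pte_t *ptep)
{
	return *ptep;
}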
@@ -3711,8 +3737,7 @@ retry:
                vma_end_reservation(h, vma, address);
        }
 
-       ptl = huge_pte_lockptr(h, mm, ptep);
-       spin_lock(ptl);
+       ptl = huge_pte_lock(h, mm, ptep);
        size = i_size_read(mapping->host) >> huge_page_shift(h);
        if (idx >= size)
                goto backout;
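
The open-coded huge_pte_lockptr() + spin_lock() pair is folded into huge_pte_lock(); the same substitution is made in huge_pmd_share() further down. The helper is expected to be essentially this inline wrapper:

static inline spinlock_t *huge_pte_lock(struct hstate *h,
					struct mm_struct *mm, pte_t *pte)
{
	spinlock_t *ptl;

	ptl = huge_pte_lockptr(h, mm, pte);
	spin_lock(ptl);
	return ptl;
}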
@@ -3733,7 +3758,7 @@ retry:
        hugetlb_count_add(pages_per_huge_page(h), mm);
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
-               ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
+               ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
        }
 
        spin_unlock(ptl);
@@ -3888,8 +3913,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        if (flags & FAULT_FLAG_WRITE) {
                if (!huge_pte_write(entry)) {
-                       ret = hugetlb_cow(mm, vma, address, ptep, entry,
-                                       pagecache_page, ptl);
+                       ret = hugetlb_cow(mm, vma, address, ptep,
+                                         pagecache_page, ptl);
                        goto out_put_page;
                }
                entry = huge_pte_mkdirty(entry);
@@ -4330,8 +4355,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        if (!spte)
                goto out;
 
-       ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
-       spin_lock(ptl);
+       ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
        if (pud_none(*pud)) {
                pud_populate(mm, pud,
                                (pmd_t *)((unsigned long)spte & PAGE_MASK));