fs/hfsplus: use bool instead of int for is_known_namespace() return value
[linux-2.6-block.git] / mm / hugetlb.c
index c41b2a0ee2736e4f7df74c440ceb90bd5fcceecb..271e4432734c376baf0bf4b8953a38e391ac011c 100644 (file)
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
        bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
        spin_unlock(&spool->lock);
 
        /* If no pages are used, and no other handles to the subpool
-        * remain, free the subpool the subpool remain */
-       if (free)
+        * remain, give up any reservations based on minimum size and
+        * free the subpool */
+       if (free) {
+               if (spool->min_hpages != -1)
+                       hugetlb_acct_memory(spool->hstate,
+                                               -spool->min_hpages);
                kfree(spool);
+       }
 }
 
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+                                               long min_hpages)
 {
        struct hugepage_subpool *spool;
 
-       spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+       spool = kzalloc(sizeof(*spool), GFP_KERNEL);
        if (!spool)
                return NULL;
 
        spin_lock_init(&spool->lock);
        spool->count = 1;
-       spool->max_hpages = nr_blocks;
-       spool->used_hpages = 0;
+       spool->max_hpages = max_hpages;
+       spool->hstate = h;
+       spool->min_hpages = min_hpages;
+
+       if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+               kfree(spool);
+               return NULL;
+       }
+       spool->rsv_hpages = min_hpages;
 
        return spool;
 }
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
        unlock_or_release_subpool(spool);
 }
 
-static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy
+ * the request.  Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward).  The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
                                      long delta)
 {
-       int ret = 0;
+       long ret = delta;
 
        if (!spool)
-               return 0;
+               return ret;
 
        spin_lock(&spool->lock);
-       if ((spool->used_hpages + delta) <= spool->max_hpages) {
-               spool->used_hpages += delta;
-       } else {
-               ret = -ENOMEM;
+
+       if (spool->max_hpages != -1) {          /* maximum size accounting */
+               if ((spool->used_hpages + delta) <= spool->max_hpages)
+                       spool->used_hpages += delta;
+               else {
+                       ret = -ENOMEM;
+                       goto unlock_ret;
+               }
        }
-       spin_unlock(&spool->lock);
 
+       if (spool->min_hpages != -1) {          /* minimum size accounting */
+               if (delta > spool->rsv_hpages) {
+                       /*
+                        * Asking for more reserves than those already taken on
+                        * behalf of subpool.  Return difference.
+                        */
+                       ret = delta - spool->rsv_hpages;
+                       spool->rsv_hpages = 0;
+               } else {
+                       ret = 0;        /* reserves already accounted for */
+                       spool->rsv_hpages -= delta;
+               }
+       }
+
+unlock_ret:
+       spin_unlock(&spool->lock);
        return ret;
 }
 
-static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                       long delta)
 {
+       long ret = delta;
+
        if (!spool)
-               return;
+               return delta;
 
        spin_lock(&spool->lock);
-       spool->used_hpages -= delta;
-       /* If hugetlbfs_put_super couldn't free spool due to
-       * an outstanding quota reference, free it now. */
+
+       if (spool->max_hpages != -1)            /* maximum size accounting */
+               spool->used_hpages -= delta;
+
+       if (spool->min_hpages != -1) {          /* minimum size accounting */
+               if (spool->rsv_hpages + delta <= spool->min_hpages)
+                       ret = 0;
+               else
+                       ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+               spool->rsv_hpages += delta;
+               if (spool->rsv_hpages > spool->min_hpages)
+                       spool->rsv_hpages = spool->min_hpages;
+       }
+
+       /*
+        * If hugetlbfs_put_super couldn't free spool due to an outstanding
+        * quota reference, free it now.
+        */
        unlock_or_release_subpool(spool);
+
+       return ret;
 }
 
 static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
        return NULL;
 }
 
+/*
+ * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
+ * to hstate->hugepage_activelist.)
+ *
+ * This function can be called for tail pages, but never returns true for them.
+ */
+bool page_huge_active(struct page *page)
+{
+       VM_BUG_ON_PAGE(!PageHuge(page), page);
+       return PageHead(page) && PagePrivate(&page[1]);
+}
+
+/* never called for tail page */
+static void set_page_huge_active(struct page *page)
+{
+       VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+       SetPagePrivate(&page[1]);
+}
+
+static void clear_page_huge_active(struct page *page)
+{
+       VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+       ClearPagePrivate(&page[1]);
+}
+
 void free_huge_page(struct page *page)
 {
        /*
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
        restore_reserve = PagePrivate(page);
        ClearPagePrivate(page);
 
+       /*
+        * A return code of zero implies that the subpool will be under its
+        * minimum size if the reservation is not restored after page is free.
+        * Therefore, force restore_reserve operation.
+        */
+       if (hugepage_subpool_put_pages(spool, 1) == 0)
+               restore_reserve = true;
+
        spin_lock(&hugetlb_lock);
+       clear_page_huge_active(page);
        hugetlb_cgroup_uncharge_page(hstate_index(h),
                                     pages_per_huge_page(h), page);
        if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
                enqueue_huge_page(h, page);
        }
        spin_unlock(&hugetlb_lock);
-       hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1386,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
        if (chg < 0)
                return ERR_PTR(-ENOMEM);
        if (chg || avoid_reserve)
-               if (hugepage_subpool_get_pages(spool, 1))
+               if (hugepage_subpool_get_pages(spool, 1) < 0)
                        return ERR_PTR(-ENOSPC);
 
        ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -2454,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
        struct resv_map *resv = vma_resv_map(vma);
        struct hugepage_subpool *spool = subpool_vma(vma);
        unsigned long reserve, start, end;
+       long gbl_reserve;
 
        if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                return;
@@ -2466,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
        kref_put(&resv->refs, resv_map_release);
 
        if (reserve) {
-               hugetlb_acct_memory(h, -reserve);
-               hugepage_subpool_put_pages(spool, reserve);
+               /*
+                * Decrement reserve counts.  The global reserve count may be
+                * adjusted if the subpool has a minimum size.
+                */
+               gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+               hugetlb_acct_memory(h, -gbl_reserve);
        }
 }
 
@@ -2891,6 +2998,7 @@ retry_avoidcopy:
        copy_user_huge_page(new_page, old_page, address, vma,
                            pages_per_huge_page(h));
        __SetPageUptodate(new_page);
+       set_page_huge_active(new_page);
 
        mmun_start = address & huge_page_mask(h);
        mmun_end = mmun_start + huge_page_size(h);
@@ -3003,6 +3111,7 @@ retry:
                }
                clear_huge_page(page, address, pages_per_huge_page(h));
                __SetPageUptodate(page);
+               set_page_huge_active(page);
 
                if (vma->vm_flags & VM_MAYSHARE) {
                        int err;
@@ -3277,6 +3386,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                int absent;
                struct page *page;
 
+               /*
+                * If we have a pending SIGKILL, don't keep faulting pages and
+                * potentially allocating memory.
+                */
+               if (unlikely(fatal_signal_pending(current))) {
+                       remainder = 0;
+                       break;
+               }
+
                /*
                 * Some archs (sparc64, sh*) have multiple pte_ts to
                 * each hugepage.  We have to make sure we get the
@@ -3438,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
        struct hstate *h = hstate_inode(inode);
        struct hugepage_subpool *spool = subpool_inode(inode);
        struct resv_map *resv_map;
+       long gbl_reserve;
 
        /*
         * Only apply hugepage reservation if asked. At fault time, an
@@ -3474,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
                goto out_err;
        }
 
-       /* There must be enough pages in the subpool for the mapping */
-       if (hugepage_subpool_get_pages(spool, chg)) {
+       /*
+        * There must be enough pages in the subpool for the mapping. If
+        * the subpool has a minimum size, there may be some global
+        * reservations already in place (gbl_reserve).
+        */
+       gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+       if (gbl_reserve < 0) {
                ret = -ENOSPC;
                goto out_err;
        }
@@ -3484,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
         * Check enough hugepages are available for the reservation.
         * Hand the pages back to the subpool if there are not
         */
-       ret = hugetlb_acct_memory(h, chg);
+       ret = hugetlb_acct_memory(h, gbl_reserve);
        if (ret < 0) {
-               hugepage_subpool_put_pages(spool, chg);
+               /* put back original number of pages, chg */
+               (void)hugepage_subpool_put_pages(spool, chg);
                goto out_err;
        }
 
@@ -3516,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
        struct resv_map *resv_map = inode_resv_map(inode);
        long chg = 0;
        struct hugepage_subpool *spool = subpool_inode(inode);
+       long gbl_reserve;
 
        if (resv_map)
                chg = region_truncate(resv_map, offset);
@@ -3523,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
        inode->i_blocks -= (blocks_per_huge_page(h) * freed);
        spin_unlock(&inode->i_lock);
 
-       hugepage_subpool_put_pages(spool, (chg - freed));
-       hugetlb_acct_memory(h, -(chg - freed));
+       /*
+        * If the subpool has a minimum size, the number of global
+        * reservations to be released may be adjusted.
+        */
+       gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+       hugetlb_acct_memory(h, -gbl_reserve);
 }
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3735,8 +3865,7 @@ retry:
        if (!pmd_huge(*pmd))
                goto out;
        if (pmd_present(*pmd)) {
-               page = pte_page(*(pte_t *)pmd) +
-                       ((address & ~PMD_MASK) >> PAGE_SHIFT);
+               page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
                if (flags & FOLL_GET)
                        get_page(page);
        } else {
@@ -3767,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 
 #ifdef CONFIG_MEMORY_FAILURE
 
-/* Should be called in hugetlb_lock */
-static int is_hugepage_on_freelist(struct page *hpage)
-{
-       struct page *page;
-       struct page *tmp;
-       struct hstate *h = page_hstate(hpage);
-       int nid = page_to_nid(hpage);
-
-       list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
-               if (page == hpage)
-                       return 1;
-       return 0;
-}
-
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
@@ -3792,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
        int ret = -EBUSY;
 
        spin_lock(&hugetlb_lock);
-       if (is_hugepage_on_freelist(hpage)) {
+       /*
+        * Just checking !page_huge_active is not enough, because that could be
+        * an isolated/hwpoisoned hugepage (which have >0 refcount).
+        */
+       if (!page_huge_active(hpage) && !page_count(hpage)) {
                /*
                 * Hwpoisoned hugepage isn't linked to activelist or freelist,
                 * but dangling hpage->lru can trigger list-debug warnings
@@ -3812,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
 
 bool isolate_huge_page(struct page *page, struct list_head *list)
 {
+       bool ret = true;
+
        VM_BUG_ON_PAGE(!PageHead(page), page);
-       if (!get_page_unless_zero(page))
-               return false;
        spin_lock(&hugetlb_lock);
+       if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+               ret = false;
+               goto unlock;
+       }
+       clear_page_huge_active(page);
        list_move_tail(&page->lru, list);
+unlock:
        spin_unlock(&hugetlb_lock);
-       return true;
+       return ret;
 }
 
 void putback_active_hugepage(struct page *page)
 {
        VM_BUG_ON_PAGE(!PageHead(page), page);
        spin_lock(&hugetlb_lock);
+       set_page_huge_active(page);
        list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
        spin_unlock(&hugetlb_lock);
        put_page(page);
 }
-
-bool is_hugepage_active(struct page *page)
-{
-       VM_BUG_ON_PAGE(!PageHuge(page), page);
-       /*
-        * This function can be called for a tail page because the caller,
-        * scan_movable_pages, scans through a given pfn-range which typically
-        * covers one memory block. In systems using gigantic hugepage (1GB
-        * for x86_64,) a hugepage is larger than a memory block, and we don't
-        * support migrating such large hugepages for now, so return false
-        * when called for tail pages.
-        */
-       if (PageTail(page))
-               return false;
-       /*
-        * Refcount of a hwpoisoned hugepages is 1, but they are not active,
-        * so we should return false for them.
-        */
-       if (unlikely(PageHWPoison(page)))
-               return false;
-       return page_count(page) > 0;
-}