Merge branch 'mm-hotfixes-stable' into mm-stable
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0bdfc7e1c933f59c8084f9b4288201450c3d90cb..2ca4e8c3163ef414416b3e1bc3057ed14a438e59 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -456,14 +456,12 @@ static int allocate_file_region_entries(struct resv_map *resv,
                                        int regions_needed)
        __must_hold(&resv->lock)
 {
-       struct list_head allocated_regions;
+       LIST_HEAD(allocated_regions);
        int to_allocate = 0, i = 0;
        struct file_region *trg = NULL, *rg = NULL;
 
        VM_BUG_ON(regions_needed < 0);
 
-       INIT_LIST_HEAD(&allocated_regions);
-
        /*
         * Check for sufficient descriptors in the cache to accommodate
         * the number of in progress add operations plus regions_needed.
@@ -1506,6 +1504,10 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
 
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        set_page_private(page, 0);
+       /*
+        * We have to set HPageVmemmapOptimized again, as the
+        * above set_page_private(page, 0) cleared it.
+        */
        SetHPageVmemmapOptimized(page);
 
        /*
@@ -2336,7 +2338,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
 static int gather_surplus_pages(struct hstate *h, long delta)
        __must_hold(&hugetlb_lock)
 {
-       struct list_head surplus_list;
+       LIST_HEAD(surplus_list);
        struct page *page, *tmp;
        int ret;
        long i;
@@ -2351,7 +2353,6 @@ static int gather_surplus_pages(struct hstate *h, long delta)
        }
 
        allocated = 0;
-       INIT_LIST_HEAD(&surplus_list);
 
        ret = -ENOMEM;
 retry:
@@ -3474,7 +3475,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
         * based on pool changes for the demoted page.
         */
        h->max_huge_pages--;
-       target_hstate->max_huge_pages += pages_per_huge_page(h);
+       target_hstate->max_huge_pages +=
+               pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
 
        return rc;
 }
@@ -3767,8 +3769,7 @@ HSTATE_ATTR_WO(demote);
 static ssize_t demote_size_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
 {
-       int nid;
-       struct hstate *h = kobj_to_hstate(kobj, &nid);
+       struct hstate *h = kobj_to_hstate(kobj, NULL);
        unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
 
        return sysfs_emit(buf, "%lukB\n", demote_size);
@@ -3781,7 +3782,6 @@ static ssize_t demote_size_store(struct kobject *kobj,
        struct hstate *h, *demote_hstate;
        unsigned long demote_size;
        unsigned int demote_order;
-       int nid;
 
        demote_size = (unsigned long)memparse(buf, NULL);
 
@@ -3793,7 +3793,7 @@ static ssize_t demote_size_store(struct kobject *kobj,
                return -EINVAL;
 
        /* demote order must be smaller than hstate order */
-       h = kobj_to_hstate(kobj, &nid);
+       h = kobj_to_hstate(kobj, NULL);
        if (demote_order >= h->order)
                return -EINVAL;
 
@@ -3847,15 +3847,22 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
        if (retval) {
                kobject_put(hstate_kobjs[hi]);
                hstate_kobjs[hi] = NULL;
+               return retval;
        }
 
        if (h->demote_order) {
-               if (sysfs_create_group(hstate_kobjs[hi],
-                                       &hstate_demote_attr_group))
+               retval = sysfs_create_group(hstate_kobjs[hi],
+                                           &hstate_demote_attr_group);
+               if (retval) {
                        pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+                       sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
+                       kobject_put(hstate_kobjs[hi]);
+                       hstate_kobjs[hi] = NULL;
+                       return retval;
+               }
        }
 
-       return retval;
+       return 0;
 }
 
 static void __init hugetlb_sysfs_init(void)
@@ -3941,10 +3948,15 @@ static void hugetlb_unregister_node(struct node *node)
 
        for_each_hstate(h) {
                int idx = hstate_index(h);
-               if (nhs->hstate_kobjs[idx]) {
-                       kobject_put(nhs->hstate_kobjs[idx]);
-                       nhs->hstate_kobjs[idx] = NULL;
-               }
+               struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
+
+               if (!hstate_kobj)
+                       continue;
+               if (h->demote_order)
+                       sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
+               sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
+               kobject_put(hstate_kobj);
+               nhs->hstate_kobjs[idx] = NULL;
        }
 
        kobject_put(nhs->hugepages_kobj);
@@ -4019,6 +4031,14 @@ static void hugetlb_register_all_nodes(void) { }
 
 #endif
 
+#ifdef CONFIG_CMA
+static void __init hugetlb_cma_check(void);
+#else
+static inline __init void hugetlb_cma_check(void)
+{
+}
+#endif
+
 static int __init hugetlb_init(void)
 {
        int i;
@@ -4118,7 +4138,7 @@ void __init hugetlb_add_hstate(unsigned int order)
        h->next_nid_to_alloc = first_memory_node;
        h->next_nid_to_free = first_memory_node;
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
-                                       huge_page_size(h)/1024);
+                                       huge_page_size(h)/SZ_1K);
 
        parsed_hstate = h;
 }
@@ -4133,11 +4153,11 @@ static void __init hugepages_clear_pages_in_node(void)
        if (!hugetlb_max_hstate) {
                default_hstate_max_huge_pages = 0;
                memset(default_hugepages_in_node, 0,
-                       MAX_NUMNODES * sizeof(unsigned int));
+                       sizeof(default_hugepages_in_node));
        } else {
                parsed_hstate->max_huge_pages = 0;
                memset(parsed_hstate->max_huge_pages_node, 0,
-                       MAX_NUMNODES * sizeof(unsigned int));
+                       sizeof(parsed_hstate->max_huge_pages_node));
        }
 }
 
@@ -4332,18 +4352,34 @@ static int __init default_hugepagesz_setup(char *s)
 }
 __setup("default_hugepagesz=", default_hugepagesz_setup);
 
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+       struct mempolicy *mpol = get_task_policy(current);
+
+       /*
+        * Only enforce MPOL_BIND policy which overlaps with cpuset policy
+        * (from policy_nodemask) specifically for hugetlb case
+        */
+       if (mpol->mode == MPOL_BIND &&
+               (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+                cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+               return &mpol->nodes;
+#endif
+       return NULL;
+}
+
 static unsigned int allowed_mems_nr(struct hstate *h)
 {
        int node;
        unsigned int nr = 0;
-       nodemask_t *mpol_allowed;
+       nodemask_t *mbind_nodemask;
        unsigned int *array = h->free_huge_pages_node;
        gfp_t gfp_mask = htlb_alloc_mask(h);
 
-       mpol_allowed = policy_nodemask_current(gfp_mask);
-
+       mbind_nodemask = policy_mbind_nodemask(gfp_mask);
        for_each_node_mask(node, cpuset_current_mems_allowed) {
-               if (!mpol_allowed || node_isset(node, *mpol_allowed))
+               if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
                        nr += array[node];
        }
 
@@ -4723,7 +4759,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *dst_vma,
                            struct vm_area_struct *src_vma)
 {
-       pte_t *src_pte, *dst_pte, entry, dst_entry;
+       pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr;
        bool cow = is_cow_mapping(src_vma->vm_flags);
@@ -4768,15 +4804,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
                /*
                 * If the pagetables are shared don't copy or take references.
-                * dst_pte == src_pte is the common case of src/dest sharing.
                 *
+                * dst_pte == src_pte is the common case of src/dest sharing.
                 * However, src could have 'unshared' and dst shares with
-                * another vma.  If dst_pte !none, this implies sharing.
-                * Check here before taking page table lock, and once again
-                * after taking the lock below.
+                * another vma. So the page_count of the ptep page is checked
+                * instead to reliably determine whether the pte is shared.
                 */
-               dst_entry = huge_ptep_get(dst_pte);
-               if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+               if (page_count(virt_to_page(dst_pte)) > 1) {
                        addr |= last_addr_mask;
                        continue;
                }
@@ -4785,13 +4819,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                src_ptl = huge_pte_lockptr(h, src, src_pte);
                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                entry = huge_ptep_get(src_pte);
-               dst_entry = huge_ptep_get(dst_pte);
 again:
-               if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+               if (huge_pte_none(entry)) {
                        /*
-                        * Skip if src entry none.  Also, skip in the
-                        * unlikely case dst entry !none as this implies
-                        * sharing with another vma.
+                        * Skip if src entry none.
                         */
                        ;
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
@@ -4870,7 +4901,7 @@ again:
                                        restore_reserve_on_error(h, dst_vma, addr,
                                                                new);
                                        put_page(new);
-                                       /* dst_entry won't change as in child */
+                                       /* huge_ptep of dst_pte won't change as in child */
                                        goto again;
                                }
                                hugetlb_install_page(dst_vma, dst_pte, addr, new);
@@ -5316,7 +5347,6 @@ retry_avoidcopy:
                        u32 hash;
 
                        put_page(old_page);
-                       BUG_ON(huge_pte_none(pte));
                        /*
                         * Drop hugetlb_fault_mutex and i_mmap_rwsem before
                         * unmapping.  unmapping needs to hold i_mmap_rwsem
@@ -5408,19 +5438,6 @@ out_release_old:
        return ret;
 }
 
-/* Return the pagecache page at a given address within a VMA */
-static struct page *hugetlbfs_pagecache_page(struct hstate *h,
-                       struct vm_area_struct *vma, unsigned long address)
-{
-       struct address_space *mapping;
-       pgoff_t idx;
-
-       mapping = vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, vma, address);
-
-       return find_lock_page(mapping, idx);
-}
-
 /*
  * Return whether there is a pagecache page to back given address within VMA.
  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
@@ -5547,7 +5564,6 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        if (idx >= size)
                goto out;
 
-retry:
        new_page = false;
        page = find_lock_page(mapping, idx);
        if (!page) {
@@ -5587,9 +5603,15 @@ retry:
                if (vma->vm_flags & VM_MAYSHARE) {
                        int err = huge_add_to_page_cache(page, mapping, idx);
                        if (err) {
+                               /*
+                                * err can't be -EEXIST, which would imply
+                                * someone else consumed the reservation, since
+                                * the hugetlb fault mutex is held when adding
+                                * a hugetlb page to the page cache. So it's
+                                * safe to call restore_reserve_on_error() here.
+                                */
+                               restore_reserve_on_error(h, vma, haddr, page);
                                put_page(page);
-                               if (err == -EEXIST)
-                                       goto retry;
                                goto out;
                        }
                        new_pagecache_page = true;
@@ -5810,7 +5832,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                /* Just decrements count, does not deallocate */
                vma_end_reservation(h, vma, haddr);
 
-               pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
+               pagecache_page = find_lock_page(mapping, idx);
        }
 
        ptl = huge_pte_lock(h, mm, ptep);
@@ -6017,8 +6039,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                page_in_pagecache = true;
        }
 
-       ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
-       spin_lock(ptl);
+       ptl = huge_pte_lock(h, dst_mm, dst_pte);
 
        /*
         * Recheck the i_size after holding PT lock to make sure not
@@ -7334,7 +7355,7 @@ void __init hugetlb_cma_reserve(int order)
                hugetlb_cma_size = 0;
 }
 
-void __init hugetlb_cma_check(void)
+static void __init hugetlb_cma_check(void)
 {
        if (!hugetlb_cma_size || cma_reserve_called)
                return;