hugetlb: add support for gigantic page allocation at runtime
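With this change a gigantic hstate can be grown (and shrunk) at runtime
through the existing nr_hugepages interfaces, instead of only at boot time.
As an illustrative example, assuming an x86-64 machine with CONFIG_CMA
enabled and a 1GB hstate configured (the exact sysfs directory depends on
the huge page size):

  # echo 2 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
  # grep HugePages_Total /proc/meminfo
  HugePages_Total:       2
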
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5964d0de1777419a9202c94b426cc85d356e7e07..98f0bc105dfe196b148b392d7cf5add3995ee0c2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -680,11 +680,150 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
                ((node = hstate_next_node_to_free(hs, mask)) || 1);     \
                nr_nodes--)
 
+#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
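+/*
+ * Undo the compound-page state set up by prep_compound_gigantic_page() so
+ * that the pages in the range can be handed back to the allocator
+ * individually.
+ */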
+static void destroy_compound_gigantic_page(struct page *page,
+                                       unsigned long order)
+{
+       int i;
+       int nr_pages = 1 << order;
+       struct page *p = page + 1;
+
+       for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+               __ClearPageTail(p);
+               set_page_refcounted(p);
+               p->first_page = NULL;
+       }
+
+       set_compound_order(page, 0);
+       __ClearPageHead(page);
+}
+
+static void free_gigantic_page(struct page *page, unsigned order)
+{
+       free_contig_range(page_to_pfn(page), 1 << order);
+}
+
+static int __alloc_gigantic_page(unsigned long start_pfn,
+                               unsigned long nr_pages)
+{
+       unsigned long end_pfn = start_pfn + nr_pages;
+       return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+}
+
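+/*
+ * Return true if every pfn in [start_pfn, start_pfn + nr_pages) maps to a
+ * valid, unreserved, free page that is not already part of a huge page,
+ * i.e. the range is a candidate for alloc_contig_range().
+ */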
+static bool pfn_range_valid_gigantic(unsigned long start_pfn,
+                               unsigned long nr_pages)
+{
+       unsigned long i, end_pfn = start_pfn + nr_pages;
+       struct page *page;
+
+       for (i = start_pfn; i < end_pfn; i++) {
+               if (!pfn_valid(i))
+                       return false;
+
+               page = pfn_to_page(i);
+
+               if (PageReserved(page))
+                       return false;
+
+               if (page_count(page) > 0)
+                       return false;
+
+               if (PageHuge(page))
+                       return false;
+       }
+
+       return true;
+}
+
+static bool zone_spans_last_pfn(const struct zone *zone,
+                       unsigned long start_pfn, unsigned long nr_pages)
+{
+       unsigned long last_pfn = start_pfn + nr_pages - 1;
+       return zone_spans_pfn(zone, last_pfn);
+}
+
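+/*
+ * Scan the zones of @nid for a naturally aligned, gigantic-page-sized range
+ * of free pages and try to claim it with alloc_contig_range().  Returns the
+ * first page of the range on success, NULL otherwise.
+ */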
+static struct page *alloc_gigantic_page(int nid, unsigned order)
+{
+       unsigned long nr_pages = 1 << order;
+       unsigned long ret, pfn, flags;
+       struct zone *z;
+
+       z = NODE_DATA(nid)->node_zones;
+       for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) {
+               spin_lock_irqsave(&z->lock, flags);
+
+               pfn = ALIGN(z->zone_start_pfn, nr_pages);
+               while (zone_spans_last_pfn(z, pfn, nr_pages)) {
+                       if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+                               /*
+                                * We release the zone lock here because
+                                * alloc_contig_range() will also lock the zone
+                                * at some point. If there's an allocation
+                                * spinning on this lock, it may win the race
+                                * and cause alloc_contig_range() to fail...
+                                */
+                               spin_unlock_irqrestore(&z->lock, flags);
+                               ret = __alloc_gigantic_page(pfn, nr_pages);
+                               if (!ret)
+                                       return pfn_to_page(pfn);
+                               spin_lock_irqsave(&z->lock, flags);
+                       }
+                       pfn += nr_pages;
+               }
+
+               spin_unlock_irqrestore(&z->lock, flags);
+       }
+
+       return NULL;
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
+static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+
+static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
+{
+       struct page *page;
+
+       page = alloc_gigantic_page(nid, huge_page_order(h));
+       if (page) {
+               prep_compound_gigantic_page(page, huge_page_order(h));
+               prep_new_huge_page(h, page, nid);
+       }
+
+       return page;
+}
+
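+/*
+ * Try each node in @nodes_allowed in turn until a gigantic page has been
+ * allocated and prepared.  Returns 1 on success, 0 on failure.
+ */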
+static int alloc_fresh_gigantic_page(struct hstate *h,
+                               nodemask_t *nodes_allowed)
+{
+       struct page *page = NULL;
+       int nr_nodes, node;
+
+       for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
+               page = alloc_fresh_gigantic_page_node(h, node);
+               if (page)
+                       return 1;
+       }
+
+       return 0;
+}
+
+static inline bool gigantic_page_supported(void) { return true; }
+#else
+static inline bool gigantic_page_supported(void) { return false; }
+static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void destroy_compound_gigantic_page(struct page *page,
+                                               unsigned long order) { }
+static inline int alloc_fresh_gigantic_page(struct hstate *h,
+                                       nodemask_t *nodes_allowed) { return 0; }
+#endif
+
 static void update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
 
-       VM_BUG_ON(hstate_is_gigantic(h));
+       if (hstate_is_gigantic(h) && !gigantic_page_supported())
+               return;
 
        h->nr_huge_pages--;
        h->nr_huge_pages_node[page_to_nid(page)]--;
@@ -697,8 +836,13 @@ static void update_and_free_page(struct hstate *h, struct page *page)
        VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
        set_compound_page_dtor(page, NULL);
        set_page_refcounted(page);
-       arch_release_hugepage(page);
-       __free_pages(page, huge_page_order(h));
+       if (hstate_is_gigantic(h)) {
+               destroy_compound_gigantic_page(page, huge_page_order(h));
+               free_gigantic_page(page, huge_page_order(h));
+       } else {
+               arch_release_hugepage(page);
+               __free_pages(page, huge_page_order(h));
+       }
 }
 
 struct hstate *size_to_hstate(unsigned long size)
@@ -737,7 +881,7 @@ static void free_huge_page(struct page *page)
        if (restore_reserve)
                h->resv_huge_pages++;
 
-       if (h->surplus_huge_pages_node[nid] && !hstate_is_gigantic(h)) {
+       if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
                list_del(&page->lru);
                update_and_free_page(h, page);
@@ -841,9 +985,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
        struct page *page;
 
-       if (hstate_is_gigantic(h))
-               return NULL;
-
        page = alloc_pages_exact_node(nid,
                htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
                                                __GFP_REPEAT|__GFP_NOWARN,
@@ -1478,7 +1619,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 {
        unsigned long min_count, ret;
 
-       if (hstate_is_gigantic(h))
+       if (hstate_is_gigantic(h) && !gigantic_page_supported())
                return h->max_huge_pages;
 
        /*
@@ -1505,7 +1646,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
                 * and reducing the surplus.
                 */
                spin_unlock(&hugetlb_lock);
-               ret = alloc_fresh_huge_page(h, nodes_allowed);
+               if (hstate_is_gigantic(h))
+                       ret = alloc_fresh_gigantic_page(h, nodes_allowed);
+               else
+                       ret = alloc_fresh_huge_page(h, nodes_allowed);
                spin_lock(&hugetlb_lock);
                if (!ret)
                        goto out;
@@ -1605,7 +1749,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
                goto out;
 
        h = kobj_to_hstate(kobj, &nid);
-       if (hstate_is_gigantic(h)) {
+       if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
                err = -EINVAL;
                goto out;
        }
@@ -2112,7 +2256,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
 
        tmp = h->max_huge_pages;
 
-       if (write && hstate_is_gigantic(h))
+       if (write && hstate_is_gigantic(h) && !gigantic_page_supported())
                return -EINVAL;
 
        table->data = &tmp;