diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8537429d33a6dfd1bc57184646d8d0d75b8745e2..b9a667d36c554afc46c2d337a2712e6ca8664fb5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,7 @@
 #include <linux/hugetlb.h>
 #include <linux/memblock.h>
 #include <linux/compaction.h>
+#include <linux/rmap.h>
 
 #include <asm/tlbflush.h>
 
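
The new include provides try_to_unmap() and the TTU_* flags declared in <linux/rmap.h>; they are used by the hwpoison handling added to do_migrate_range() further down.
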
@@ -253,7 +254,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
        if (pfn_valid(phys_start_pfn))
                return -EEXIST;
 
-       ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap);
+       ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
        if (ret < 0)
                return ret;
 
@@ -743,14 +744,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
        int nid = pgdat->node_id;
        unsigned long flags;
 
-       if (zone_is_empty(zone))
-               init_currently_empty_zone(zone, start_pfn, nr_pages);
-
        clear_zone_contiguous(zone);
 
        /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
        pgdat_resize_lock(pgdat, &flags);
        zone_span_writelock(zone);
+       if (zone_is_empty(zone))
+               init_currently_empty_zone(zone, start_pfn, nr_pages);
        resize_zone_range(zone, start_pfn, nr_pages);
        zone_span_writeunlock(zone);
        resize_pgdat_range(pgdat, start_pfn, nr_pages);
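
With this hunk the first-time zone initialization is published under the same locks as the span resize, so the zone's start pfn and span are updated together with respect to zone-span readers. For orientation, a condensed sketch of the resulting sequence in move_pfn_range_to_zone(); illustrative only, and the trailing pgdat_resize_unlock() comes from the surrounding function rather than the hunk shown:

	clear_zone_contiguous(zone);

	pgdat_resize_lock(pgdat, &flags);
	zone_span_writelock(zone);
	if (zone_is_empty(zone))
		init_currently_empty_zone(zone, start_pfn, nr_pages);
	resize_zone_range(zone, start_pfn, nr_pages);
	zone_span_writeunlock(zone);
	resize_pgdat_range(pgdat, start_pfn, nr_pages);
	pgdat_resize_unlock(pgdat, &flags);
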
@@ -1078,7 +1078,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
  *
  * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
  */
-int __ref add_memory_resource(int nid, struct resource *res, bool online)
+int __ref add_memory_resource(int nid, struct resource *res)
 {
        u64 start, size;
        bool new_node = false;
@@ -1133,7 +1133,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
        mem_hotplug_done();
 
        /* online pages if requested */
-       if (online)
+       if (memhp_auto_online)
                walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
                                  NULL, online_memory_block);
 
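
The onlining decision is now taken from the global memhp_auto_online policy rather than a per-caller flag. That policy is what /sys/devices/system/memory/auto_online_blocks exposes to userspace, with its default coming from CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE. A minimal userspace sketch of flipping the policy the code above now consults; illustrative only, with error handling kept to a minimum:

	#include <stdio.h>

	int main(void)
	{
		const char *knob = "/sys/devices/system/memory/auto_online_blocks";
		char cur[32] = "";
		FILE *f = fopen(knob, "r");

		if (f && fgets(cur, sizeof(cur), f))
			printf("current policy: %s", cur);
		if (f)
			fclose(f);

		f = fopen(knob, "w");		/* needs root */
		if (!f) {
			perror(knob);
			return 1;
		}
		fputs("online\n", f);		/* newly added blocks get onlined automatically */
		return fclose(f) ? 1 : 0;
	}
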
@@ -1157,7 +1157,7 @@ int __ref __add_memory(int nid, u64 start, u64 size)
        if (IS_ERR(res))
                return PTR_ERR(res);
 
-       ret = add_memory_resource(nid, res, memhp_auto_online);
+       ret = add_memory_resource(nid, res);
        if (ret < 0)
                release_memory_resource(res);
        return ret;
@@ -1339,18 +1339,16 @@ static struct page *new_node_page(struct page *page, unsigned long private)
        return new_page_nodemask(page, nid, &nmask);
 }
 
-#define NR_OFFLINE_AT_ONCE_PAGES       (256)
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 {
        unsigned long pfn;
        struct page *page;
-       int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
        int not_managed = 0;
        int ret = 0;
        LIST_HEAD(source);
 
-       for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                if (!pfn_valid(pfn))
                        continue;
                page = pfn_to_page(pfn);
@@ -1362,13 +1360,27 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                                ret = -EBUSY;
                                break;
                        }
-                       if (isolate_huge_page(page, &source))
-                               move_pages -= 1 << compound_order(head);
+                       isolate_huge_page(page, &source);
                        continue;
                } else if (PageTransHuge(page))
                        pfn = page_to_pfn(compound_head(page))
                                + hpage_nr_pages(page) - 1;
 
+               /*
+                * HWPoison pages have elevated reference counts, so migration would
+                * fail on them. It also doesn't make any sense to migrate them in the
+                * first place. Still try to unmap such a page in case it is still
+                * mapped (e.g. the current hwpoison implementation doesn't unmap KSM
+                * pages, so keep the unmap as a catch-all safety net).
+                */
+               if (PageHWPoison(page)) {
+                       if (WARN_ON(PageLRU(page)))
+                               isolate_lru_page(page);
+                       if (page_mapped(page))
+                               try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+                       continue;
+               }
+
                if (!get_page_unless_zero(page))
                        continue;
                /*
@@ -1382,7 +1394,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                if (!ret) { /* Success */
                        put_page(page);
                        list_add_tail(&page->lru, &source);
-                       move_pages--;
                        if (!__PageMovable(page))
                                inc_node_page_state(page, NR_ISOLATED_ANON +
                                                    page_is_file_cache(page));
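
Taken together, the do_migrate_range() hunks drop the arbitrary 256-page batch limit and teach the loop to skip hwpoisoned pages instead of trying (and failing) to migrate them. An illustrative skeleton of the per-pfn handling after the change, with the details kept in the surrounding context elided:

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);

		if (PageHuge(page)) {
			/* oversized hugepages bail out with -EBUSY (elided) */
			isolate_huge_page(page, &source);
			continue;
		}

		if (PageHWPoison(page)) {
			/* do not migrate; just make sure it is unmapped */
			if (WARN_ON(PageLRU(page)))
				isolate_lru_page(page);
			if (page_mapped(page))
				try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/* isolate the page and queue it on "source" for migration */
	}
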
@@ -1596,38 +1607,42 @@ static int __ref __offline_pages(unsigned long start_pfn,
                goto failed_removal_isolated;
        }
 
-       pfn = start_pfn;
-repeat:
-       /* start memory hot removal */
-       ret = -EINTR;
-       if (signal_pending(current)) {
-               reason = "signal backoff";
-               goto failed_removal_isolated;
-       }
+       do {
+               for (pfn = start_pfn; pfn;) {
+                       if (signal_pending(current)) {
+                               ret = -EINTR;
+                               reason = "signal backoff";
+                               goto failed_removal_isolated;
+                       }
 
-       cond_resched();
-       lru_add_drain_all();
-       drain_all_pages(zone);
+                       cond_resched();
+                       lru_add_drain_all();
+                       drain_all_pages(zone);
+
+                       pfn = scan_movable_pages(pfn, end_pfn);
+                       if (pfn) {
+                               /*
+                                * TODO: fatal migration failures should bail
+                                * out
+                                */
+                               do_migrate_range(pfn, end_pfn);
+                       }
+               }
 
-       pfn = scan_movable_pages(start_pfn, end_pfn);
-       if (pfn) { /* We have movable pages */
-               ret = do_migrate_range(pfn, end_pfn);
-               goto repeat;
-       }
+               /*
+                * Dissolve free hugepages in the memory block before actually
+                * offlining, in order to keep hugetlbfs's object counting
+                * consistent.
+                */
+               ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+               if (ret) {
+                       reason = "failure to dissolve huge pages";
+                       goto failed_removal_isolated;
+               }
+               /* check again */
+               offlined_pages = check_pages_isolated(start_pfn, end_pfn);
+       } while (offlined_pages < 0);
 
-       /*
-        * dissolve free hugepages in the memory block before doing offlining
-        * actually in order to make hugetlbfs's object counting consistent.
-        */
-       ret = dissolve_free_huge_pages(start_pfn, end_pfn);
-       if (ret) {
-               reason = "failure to dissolve huge pages";
-               goto failed_removal_isolated;
-       }
-       /* check again */
-       offlined_pages = check_pages_isolated(start_pfn, end_pfn);
-       if (offlined_pages < 0)
-               goto repeat;
        pr_info("Offlined Pages %ld\n", offlined_pages);
        /* Ok, all of our target is isolated.
           We cannot do rollback at this point. */
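
The goto-based retry is replaced by a structured do/while: migrate everything movable, dissolve free hugepages, then re-check isolation and start over while check_pages_isolated() keeps failing. The whole sequence is typically driven from userspace by offlining a memory block; a minimal sketch, assuming the standard memory sysfs layout (the block number 42 is just an example):

	#include <stdio.h>

	int main(void)
	{
		const char *state = "/sys/devices/system/memory/memory42/state";
		FILE *f = fopen(state, "w");	/* needs root */

		if (!f) {
			perror(state);
			return 1;
		}
		/* The kernel loops over migration internally (see above); the
		 * write simply fails if the block cannot be fully isolated. */
		fputs("offline\n", f);
		return fclose(f) ? 1 : 0;
	}
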
@@ -1764,34 +1779,6 @@ static int check_cpu_on_node(pg_data_t *pgdat)
        return 0;
 }
 
-static void unmap_cpu_on_node(pg_data_t *pgdat)
-{
-#ifdef CONFIG_ACPI_NUMA
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               if (cpu_to_node(cpu) == pgdat->node_id)
-                       numa_clear_node(cpu);
-#endif
-}
-
-static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
-{
-       int ret;
-
-       ret = check_cpu_on_node(pgdat);
-       if (ret)
-               return ret;
-
-       /*
-        * the node will be offlined when we come here, so we can clear
-        * the cpu_to_node() now.
-        */
-
-       unmap_cpu_on_node(pgdat);
-       return 0;
-}
-
 /**
  * try_offline_node
  * @nid: the node ID
@@ -1824,7 +1811,7 @@ void try_offline_node(int nid)
                return;
        }
 
-       if (check_and_unmap_cpu_on_node(pgdat))
+       if (check_cpu_on_node(pgdat))
                return;
 
        /*
@@ -1869,7 +1856,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
        memblock_free(start, size);
        memblock_remove(start, size);
 
-       arch_remove_memory(start, size, NULL);
+       arch_remove_memory(nid, start, size, NULL);
 
        try_offline_node(nid);