diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8537429d33a6dfd1bc57184646d8d0d75b8745e2..b9a667d36c554afc46c2d337a2712e6ca8664fb5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,7 @@
 #include <linux/hugetlb.h>
 #include <linux/memblock.h>
 #include <linux/compaction.h>
+#include <linux/rmap.h>
 
 #include <asm/tlbflush.h>
 
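
The new include provides try_to_unmap() and the TTU_* flags declared in <linux/rmap.h>; they are used by the hwpoison handling added to do_migrate_range() further down.
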
@@ -253,7 +254,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
        if (pfn_valid(phys_start_pfn))
                return -EEXIST;
 
-       ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap);
+       ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
        if (ret < 0)
                return ret;
 
@@ -743,14 +744,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
        int nid = pgdat->node_id;
        unsigned long flags;
 
-       if (zone_is_empty(zone))
-               init_currently_empty_zone(zone, start_pfn, nr_pages);
-
        clear_zone_contiguous(zone);
 
        /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
        pgdat_resize_lock(pgdat, &flags);
        zone_span_writelock(zone);
+       if (zone_is_empty(zone))
+               init_currently_empty_zone(zone, start_pfn, nr_pages);
        resize_zone_range(zone, start_pfn, nr_pages);
        zone_span_writeunlock(zone);
        resize_pgdat_range(pgdat, start_pfn, nr_pages);
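
With this hunk the first-time zone initialization is published under the same locks as the span resize, so the zone's start pfn and span are updated together with respect to zone-span readers. For orientation, a condensed sketch of the resulting sequence in move_pfn_range_to_zone(); illustrative only, and the trailing pgdat_resize_unlock() comes from the surrounding function rather than the hunk shown:

	clear_zone_contiguous(zone);

	pgdat_resize_lock(pgdat, &flags);
	zone_span_writelock(zone);
	if (zone_is_empty(zone))
		init_currently_empty_zone(zone, start_pfn, nr_pages);
	resize_zone_range(zone, start_pfn, nr_pages);
	zone_span_writeunlock(zone);
	resize_pgdat_range(pgdat, start_pfn, nr_pages);
	pgdat_resize_unlock(pgdat, &flags);
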
@@ -1078,7 +1078,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
  *
  * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
  */
-int __ref add_memory_resource(int nid, struct resource *res, bool online)
+int __ref add_memory_resource(int nid, struct resource *res)
 {
        u64 start, size;
        bool new_node = false;
@@ -1133,7 +1133,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
        mem_hotplug_done();
 
        /* online pages if requested */
-       if (online)
+       if (memhp_auto_online)
                walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
                                  NULL, online_memory_block);
 
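
The onlining decision is now taken from the global memhp_auto_online policy rather than a per-caller flag. That policy is what /sys/devices/system/memory/auto_online_blocks exposes to userspace, with its default coming from CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE. A minimal userspace sketch of flipping the policy the code above now consults; illustrative only, with error handling kept to a minimum:

	#include <stdio.h>

	int main(void)
	{
		const char *knob = "/sys/devices/system/memory/auto_online_blocks";
		char cur[32] = "";
		FILE *f = fopen(knob, "r");

		if (f && fgets(cur, sizeof(cur), f))
			printf("current policy: %s", cur);
		if (f)
			fclose(f);

		f = fopen(knob, "w");		/* needs root */
		if (!f) {
			perror(knob);
			return 1;
		}
		fputs("online\n", f);		/* newly added blocks get onlined automatically */
		return fclose(f) ? 1 : 0;
	}
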
@@ -1157,7 +1157,7 @@ int __ref __add_memory(int nid, u64 start, u64 size)
        if (IS_ERR(res))
                return PTR_ERR(res);
 
-       ret = add_memory_resource(nid, res, memhp_auto_online);
+       ret = add_memory_resource(nid, res);
        if (ret < 0)
                release_memory_resource(res);
        return ret;
@@ -1339,18 +1339,16 @@ static struct page *new_node_page(struct page *page, unsigned long private)
        return new_page_nodemask(page, nid, &nmask);
 }
 
-#define NR_OFFLINE_AT_ONCE_PAGES       (256)
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 {
        unsigned long pfn;
        struct page *page;
-       int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
        int not_managed = 0;
        int ret = 0;
        LIST_HEAD(source);
 
-       for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                if (!pfn_valid(pfn))
                        continue;
                page = pfn_to_page(pfn);
@@ -1362,13 +1360,27 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                                ret = -EBUSY;
                                break;
                        }
-                       if (isolate_huge_page(page, &source))
-                               move_pages -= 1 << compound_order(head);
+                       isolate_huge_page(page, &source);
                        continue;
                } else if (PageTransHuge(page))
                        pfn = page_to_pfn(compound_head(page))
                                + hpage_nr_pages(page) - 1;
 
+               /*
+                * HWPoison pages have elevated reference counts, so migration would
+                * fail on them. It also doesn't make any sense to migrate them in the
+                * first place. Still try to unmap such a page in case it is still
+                * mapped (e.g. the current hwpoison implementation doesn't unmap KSM
+                * pages, so keep the unmap as a catch-all safety net).
+                */
+               if (PageHWPoison(page)) {
+                       if (WARN_ON(PageLRU(page)))
+                               isolate_lru_page(page);
+                       if (page_mapped(page))
+                               try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+                       continue;
+               }
+
                if (!get_page_unless_zero(page))
                        continue;
                /*
@@ -1382,7 +1394,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                if (!ret) { /* Success */
                        put_page(page);
                        list_add_tail(&page->lru, &source);
-                       move_pages--;
                        if (!__PageMovable(page))
                                inc_node_page_state(page, NR_ISOLATED_ANON +
                                                    page_is_file_cache(page));
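
Taken together, the do_migrate_range() hunks drop the arbitrary 256-page batch limit and teach the loop to skip hwpoisoned pages instead of trying (and failing) to migrate them. An illustrative skeleton of the per-pfn handling after the change, with the details kept in the surrounding context elided:

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);

		if (PageHuge(page)) {
			/* oversized hugepages bail out with -EBUSY (elided) */
			isolate_huge_page(page, &source);
			continue;
		}

		if (PageHWPoison(page)) {
			/* do not migrate; just make sure it is unmapped */
			if (WARN_ON(PageLRU(page)))
				isolate_lru_page(page);
			if (page_mapped(page))
				try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/* isolate the page and queue it on "source" for migration */
	}
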
@@ -1596,38 +1607,42 @@ static int __ref __offline_pages(unsigned long start_pfn,
                goto failed_removal_isolated;
        }
 
-       pfn = start_pfn;
-repeat:
-       /* start memory hot removal */
-       ret = -EINTR;
-       if (signal_pending(current)) {
-               reason = "signal backoff";
-               goto failed_removal_isolated;
-       }
+       do {
+               for (pfn = start_pfn; pfn;) {
+                       if (signal_pending(current)) {
+                               ret = -EINTR;
+                               reason = "signal backoff";
+                               goto failed_removal_isolated;
+                       }
 
-       cond_resched();
-       lru_add_drain_all();
-       drain_all_pages(zone);
+                       cond_resched();
+                       lru_add_drain_all();
+                       drain_all_pages(zone);
+
+                       pfn = scan_movable_pages(pfn, end_pfn);
+                       if (pfn) {
+                               /*
+                                * TODO: fatal migration failures should bail
+                                * out
+                                */
+                               do_migrate_range(pfn, end_pfn);
+                       }
+               }
 
-       pfn = scan_movable_pages(start_pfn, end_pfn);
-       if (pfn) { /* We have movable pages */
-               ret = do_migrate_range(pfn, end_pfn);
-               goto repeat;
-       }
+               /*
+                * Dissolve free hugepages in the memory block before actually
+                * offlining, in order to keep hugetlbfs's object counting
+                * consistent.
+                */
+               ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+               if (ret) {
+                       reason = "failure to dissolve huge pages";
+                       goto failed_removal_isolated;
+               }
+               /* check again */
+               offlined_pages = check_pages_isolated(start_pfn, end_pfn);
+       } while (offlined_pages < 0);
 
-       /*
-        * dissolve free hugepages in the memory block before doing offlining
-        * actually in order to make hugetlbfs's object counting consistent.
-        */
-       ret = dissolve_free_huge_pages(start_pfn, end_pfn);
-       if (ret) {
-               reason = "failure to dissolve huge pages";
-               goto failed_removal_isolated;
-       }
-       /* check again */
-       offlined_pages = check_pages_isolated(start_pfn, end_pfn);
-       if (offlined_pages < 0)
-               goto repeat;
        pr_info("Offlined Pages %ld\n", offlined_pages);
        /* Ok, all of our target is isolated.
           We cannot do rollback at this point. */
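
The goto-based retry is replaced by a structured do/while: migrate everything movable, dissolve free hugepages, then re-check isolation and start over while check_pages_isolated() keeps failing. The whole sequence is typically driven from userspace by offlining a memory block; a minimal sketch, assuming the standard memory sysfs layout (the block number 42 is just an example):

	#include <stdio.h>

	int main(void)
	{
		const char *state = "/sys/devices/system/memory/memory42/state";
		FILE *f = fopen(state, "w");	/* needs root */

		if (!f) {
			perror(state);
			return 1;
		}
		/* The kernel loops over migration internally (see above); the
		 * write simply fails if the block cannot be fully isolated. */
		fputs("offline\n", f);
		return fclose(f) ? 1 : 0;
	}
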
@@ -1764,34 +1779,6 @@ static int check_cpu_on_node(pg_data_t *pgdat)
        return 0;
 }
 
-static void unmap_cpu_on_node(pg_data_t *pgdat)
-{
-#ifdef CONFIG_ACPI_NUMA
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               if (cpu_to_node(cpu) == pgdat->node_id)
-                       numa_clear_node(cpu);
-#endif
-}
-
-static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
-{
-       int ret;
-
-       ret = check_cpu_on_node(pgdat);
-       if (ret)
-               return ret;
-
-       /*
-        * the node will be offlined when we come here, so we can clear
-        * the cpu_to_node() now.
-        */
-
-       unmap_cpu_on_node(pgdat);
-       return 0;
-}
-
 /**
  * try_offline_node
  * @nid: the node ID
@@ -1824,7 +1811,7 @@ void try_offline_node(int nid)
                return;
        }
 
-       if (check_and_unmap_cpu_on_node(pgdat))
+       if (check_cpu_on_node(pgdat))
                return;
 
        /*
@@ -1869,7 +1856,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
        memblock_free(start, size);
        memblock_remove(start, size);
 
-       arch_remove_memory(start, size, NULL);
+       arch_remove_memory(nid, start, size, NULL);
 
        try_offline_node(nid);