mm, memory_hotplug: deobfuscate migration part of offlining
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2b2b3ccbbfb5768a3d6b530799ebf5c4c3129688..b9a667d36c554afc46c2d337a2712e6ca8664fb5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,7 @@
 #include <linux/hugetlb.h>
 #include <linux/memblock.h>
 #include <linux/compaction.h>
+#include <linux/rmap.h>
 
 #include <asm/tlbflush.h>
 
@@ -253,7 +254,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
        if (pfn_valid(phys_start_pfn))
                return -EEXIST;
 
-       ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap);
+       ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
        if (ret < 0)
                return ret;
 
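Note: this hunk tracks a prototype change in mm/sparse.c, where sparse_add_one_section() now takes the node id directly instead of a pgdat pointer it only used to look up that id. A sketch of the before/after prototypes as implied by this call site (reconstructed, not part of this diff):

    /* before: pgdat was only dereferenced for its node id */
    int sparse_add_one_section(struct pglist_data *pgdat,
                               unsigned long start_pfn,
                               struct vmem_altmap *altmap);

    /* after: callers pass the node id themselves */
    int sparse_add_one_section(int nid, unsigned long start_pfn,
                               struct vmem_altmap *altmap);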
@@ -743,14 +744,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
        int nid = pgdat->node_id;
        unsigned long flags;
 
-       if (zone_is_empty(zone))
-               init_currently_empty_zone(zone, start_pfn, nr_pages);
-
        clear_zone_contiguous(zone);
 
        /* TODO: the pgdat resize lock is taken irqsave while the zone span lock is not. It used to be like that before this rework too. */
        pgdat_resize_lock(pgdat, &flags);
        zone_span_writelock(zone);
+       if (zone_is_empty(zone))
+               init_currently_empty_zone(zone, start_pfn, nr_pages);
        resize_zone_range(zone, start_pfn, nr_pages);
        zone_span_writeunlock(zone);
        resize_pgdat_range(pgdat, start_pfn, nr_pages);
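Note: the point of this hunk is that init_currently_empty_zone() writes zone->zone_start_pfn, so it now runs under zone_span_writelock() together with resize_zone_range(). For context (quoted from include/linux/mmzone.h of this era, not part of this diff), the lockless read side that the span write lock pairs with:

    static inline unsigned zone_span_seqbegin(struct zone *zone)
    {
            return read_seqbegin(&zone->span_seqlock);
    }

    static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
    {
            return read_seqretry(&zone->span_seqlock, iv);
    }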
@@ -1078,7 +1078,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
  *
  * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
  */
-int __ref add_memory_resource(int nid, struct resource *res, bool online)
+int __ref add_memory_resource(int nid, struct resource *res)
 {
        u64 start, size;
        bool new_node = false;
@@ -1133,7 +1133,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
        mem_hotplug_done();
 
        /* online pages if requested */
-       if (online)
+       if (memhp_auto_online)
                walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
                                  NULL, online_memory_block);
 
@@ -1157,7 +1157,7 @@ int __ref __add_memory(int nid, u64 start, u64 size)
        if (IS_ERR(res))
                return PTR_ERR(res);
 
-       ret = add_memory_resource(nid, res, memhp_auto_online);
+       ret = add_memory_resource(nid, res);
        if (ret < 0)
                release_memory_resource(res);
        return ret;
@@ -1226,7 +1226,7 @@ static bool is_pageblock_removable_nolock(struct page *page)
        if (!zone_spans_pfn(zone, pfn))
                return false;
 
-       return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
+       return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
 }
 
 /* Checks if this range of memory is likely to be hot-removable. */
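Note: SKIP_HWPOISON here (and SKIP_HWPOISON | REPORT_FAILURE further down) replaces the old bool arguments of has_unmovable_pages() and start_isolate_page_range(). These are plain bit flags; as of this change include/linux/page-isolation.h defines them as follows (comments are mine, added for illustration):

    /* do not count hwpoisoned pages as unmovable */
    #define SKIP_HWPOISON   0x1
    /* dump_page() any page that fails isolation */
    #define REPORT_FAILURE  0x2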
@@ -1339,18 +1339,16 @@ static struct page *new_node_page(struct page *page, unsigned long private)
        return new_page_nodemask(page, nid, &nmask);
 }
 
-#define NR_OFFLINE_AT_ONCE_PAGES       (256)
 static int
 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 {
        unsigned long pfn;
        struct page *page;
-       int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
        int not_managed = 0;
        int ret = 0;
        LIST_HEAD(source);
 
-       for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                if (!pfn_valid(pfn))
                        continue;
                page = pfn_to_page(pfn);
@@ -1362,13 +1360,27 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                                ret = -EBUSY;
                                break;
                        }
-                       if (isolate_huge_page(page, &source))
-                               move_pages -= 1 << compound_order(head);
+                       isolate_huge_page(page, &source);
                        continue;
                } else if (PageTransHuge(page))
                        pfn = page_to_pfn(compound_head(page))
                                + hpage_nr_pages(page) - 1;
 
+               /*
+                * HWPoison pages have elevated reference counts so the migration would
+                * fail on them. It also doesn't make any sense to migrate them in the
+                * first place. Still try to unmap such a page in case it is still mapped
+                * (e.g. the current hwpoison implementation doesn't unmap KSM pages but
+                * keeps the unmap as the catch-all safety net).
+                */
+               if (PageHWPoison(page)) {
+                       if (WARN_ON(PageLRU(page)))
+                               isolate_lru_page(page);
+                       if (page_mapped(page))
+                               try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+                       continue;
+               }
+
                if (!get_page_unless_zero(page))
                        continue;
                /*
@@ -1382,16 +1394,13 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                if (!ret) { /* Success */
                        put_page(page);
                        list_add_tail(&page->lru, &source);
-                       move_pages--;
                        if (!__PageMovable(page))
                                inc_node_page_state(page, NR_ISOLATED_ANON +
                                                    page_is_file_cache(page));
 
                } else {
-#ifdef CONFIG_DEBUG_VM
-                       pr_alert("failed to isolate pfn %lx\n", pfn);
+                       pr_warn("failed to isolate pfn %lx\n", pfn);
                        dump_page(page, "isolation failed");
-#endif
                        put_page(page);
                        /* Because we don't have big zone->lock, we should
                           check this again here. */
@@ -1411,8 +1420,14 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                /* Allocate a new page from the nearest neighbor node */
                ret = migrate_pages(&source, new_node_page, NULL, 0,
                                        MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
-               if (ret)
+               if (ret) {
+                       list_for_each_entry(page, &source, lru) {
+                               pr_warn("migrating pfn %lx failed ret:%d ",
+                                      page_to_pfn(page), ret);
+                               dump_page(page, "migration failure");
+                       }
                        putback_movable_pages(&source);
+               }
        }
 out:
        return ret;
@@ -1553,12 +1568,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
        unsigned long valid_start, valid_end;
        struct zone *zone;
        struct memory_notify arg;
-
-       /* at least, alignment against pageblock is necessary */
-       if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
-               return -EINVAL;
-       if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
-               return -EINVAL;
+       char *reason;
 
        mem_hotplug_begin();
 
@@ -1567,7 +1577,9 @@ static int __ref __offline_pages(unsigned long start_pfn,
        if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
                                  &valid_end)) {
                mem_hotplug_done();
-               return -EINVAL;
+               ret = -EINVAL;
+               reason = "multizone range";
+               goto failed_removal;
        }
 
        zone = page_zone(pfn_to_page(valid_start));
@@ -1576,10 +1588,12 @@ static int __ref __offline_pages(unsigned long start_pfn,
 
        /* set above range as isolated */
        ret = start_isolate_page_range(start_pfn, end_pfn,
-                                      MIGRATE_MOVABLE, true);
+                                      MIGRATE_MOVABLE,
+                                      SKIP_HWPOISON | REPORT_FAILURE);
        if (ret) {
                mem_hotplug_done();
-               return ret;
+               reason = "failure to isolate range";
+               goto failed_removal;
        }
 
        arg.start_pfn = start_pfn;
@@ -1588,37 +1602,47 @@ static int __ref __offline_pages(unsigned long start_pfn,
 
        ret = memory_notify(MEM_GOING_OFFLINE, &arg);
        ret = notifier_to_errno(ret);
-       if (ret)
-               goto failed_removal;
+       if (ret) {
+               reason = "notifier failure";
+               goto failed_removal_isolated;
+       }
 
-       pfn = start_pfn;
-repeat:
-       /* start memory hot removal */
-       ret = -EINTR;
-       if (signal_pending(current))
-               goto failed_removal;
+       do {
+               for (pfn = start_pfn; pfn;) {
+                       if (signal_pending(current)) {
+                               ret = -EINTR;
+                               reason = "signal backoff";
+                               goto failed_removal_isolated;
+                       }
 
-       cond_resched();
-       lru_add_drain_all();
-       drain_all_pages(zone);
+                       cond_resched();
+                       lru_add_drain_all();
+                       drain_all_pages(zone);
+
+                       pfn = scan_movable_pages(pfn, end_pfn);
+                       if (pfn) {
+                               /*
+                                * TODO: fatal migration failures should bail
+                                * out
+                                */
+                               do_migrate_range(pfn, end_pfn);
+                       }
+               }
 
-       pfn = scan_movable_pages(start_pfn, end_pfn);
-       if (pfn) { /* We have movable pages */
-               ret = do_migrate_range(pfn, end_pfn);
-               goto repeat;
-       }
+               /*
+                * Dissolve free hugepages in the memory block before actually
+                * offlining it, in order to keep hugetlbfs's object
+                * counting consistent.
+                */
+               ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+               if (ret) {
+                       reason = "failure to dissolve huge pages";
+                       goto failed_removal_isolated;
+               }
+               /* check again */
+               offlined_pages = check_pages_isolated(start_pfn, end_pfn);
+       } while (offlined_pages < 0);
 
-       /*
-        * dissolve free hugepages in the memory block before doing offlining
-        * actually in order to make hugetlbfs's object counting consistent.
-        */
-       ret = dissolve_free_huge_pages(start_pfn, end_pfn);
-       if (ret)
-               goto failed_removal;
-       /* check again */
-       offlined_pages = check_pages_isolated(start_pfn, end_pfn);
-       if (offlined_pages < 0)
-               goto repeat;
        pr_info("Offlined Pages %ld\n", offlined_pages);
        /* Ok, all of our target is isolated.
           We cannot do rollback at this point. */
@@ -1654,13 +1678,15 @@ repeat:
        mem_hotplug_done();
        return 0;
 
+failed_removal_isolated:
+       /* pushback to free area */
+       undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
 failed_removal:
-       pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
+       pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
                 (unsigned long long) start_pfn << PAGE_SHIFT,
-                ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
+                ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
+                reason);
        memory_notify(MEM_CANCEL_OFFLINE, &arg);
-       /* pushback to free area */
-       undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
        mem_hotplug_done();
        return ret;
 }
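Note: stripped of the error paths and reason strings, the loop that replaces the old repeat: label has the following shape (a simplified sketch of the code added above, not a verbatim copy). Unlike the goto variant, each migration round resumes scanning from the pfn where scan_movable_pages() left off instead of rescanning from start_pfn:

    do {
            for (pfn = start_pfn; pfn;) {
                    cond_resched();
                    lru_add_drain_all();
                    drain_all_pages(zone);
                    /* returns 0 once no movable page is left in the range */
                    pfn = scan_movable_pages(pfn, end_pfn);
                    if (pfn)
                            do_migrate_range(pfn, end_pfn);
            }
            ret = dissolve_free_huge_pages(start_pfn, end_pfn);
            /* negative means something is not isolated yet: retry */
            offlined_pages = check_pages_isolated(start_pfn, end_pfn);
    } while (offlined_pages < 0);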
@@ -1753,34 +1779,6 @@ static int check_cpu_on_node(pg_data_t *pgdat)
        return 0;
 }
 
-static void unmap_cpu_on_node(pg_data_t *pgdat)
-{
-#ifdef CONFIG_ACPI_NUMA
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               if (cpu_to_node(cpu) == pgdat->node_id)
-                       numa_clear_node(cpu);
-#endif
-}
-
-static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
-{
-       int ret;
-
-       ret = check_cpu_on_node(pgdat);
-       if (ret)
-               return ret;
-
-       /*
-        * the node will be offlined when we come here, so we can clear
-        * the cpu_to_node() now.
-        */
-
-       unmap_cpu_on_node(pgdat);
-       return 0;
-}
-
 /**
  * try_offline_node
  * @nid: the node ID
@@ -1813,7 +1811,7 @@ void try_offline_node(int nid)
                return;
        }
 
-       if (check_and_unmap_cpu_on_node(pgdat))
+       if (check_cpu_on_node(pgdat))
                return;
 
        /*
@@ -1858,7 +1856,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
        memblock_free(start, size);
        memblock_remove(start, size);
 
-       arch_remove_memory(start, size, NULL);
+       arch_remove_memory(nid, start, size, NULL);
 
        try_offline_node(nid);
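Note: arch_remove_memory() grows a nid parameter so the node id is passed down rather than recomputed by the arch code. The prototype this call site implies (reconstructed; the exact return type is from my reading of this era's include/linux/memory_hotplug.h and may differ):

    int arch_remove_memory(int nid, u64 start, u64 size,
                           struct vmem_altmap *altmap);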