Merge tag 'pm+acpi-4.6-rc1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael...
[linux-2.6-block.git] / mm / page_alloc.c
index 838ca8bb64f7376062fc6670c759a27d7f59c7e3..a762be57e46e14efa571b967eb75696c35ebd034 100644 (file)
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
+char * const migratetype_names[MIGRATE_TYPES] = {
+       "Unmovable",
+       "Movable",
+       "Reclaimable",
+       "HighAtomic",
+#ifdef CONFIG_CMA
+       "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+       "Isolate",
+#endif
+};
+
 compound_page_dtor * const compound_page_dtors[] = {
        NULL,
        free_compound_page,
@@ -236,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
+int watermark_scale_factor = 10;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
@@ -247,6 +261,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool mirrored_kernelcore;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
@@ -293,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat,
                                unsigned long pfn, unsigned long zone_end,
                                unsigned long *nr_initialised)
 {
+       unsigned long max_initialise;
+
        /* Always populate low zones for address-constrained allocations */
        if (zone_end < pgdat_end_pfn(pgdat))
                return true;
+       /*
+        * Initialise at least 2G of a node, but also take into account that
+        * two large system hashes can take up to 1GB for each 0.25TB of node.
+        */
+       max_initialise = max(2UL << (30 - PAGE_SHIFT),
+               (pgdat->node_spanned_pages >> 8));
 
-       /* Initialise at least 2G of the highest zone */
        (*nr_initialised)++;
-       if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+       if ((*nr_initialised > max_initialise) &&
            (pfn & (PAGES_PER_SECTION - 1)) == 0) {
                pgdat->first_deferred_pfn = pfn;
                return false;
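
The cap above works out as follows (a worked sketch, not part of the patch; PAGE_SHIFT is assumed to be 12 and the node sizes are invented): max_initialise is the larger of 2G worth of pages and 1/256th of the node, so the threshold stays at 2GB until a node exceeds 512GB, after which it grows with node size.

#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned long max_initialise(unsigned long node_spanned_pages)
{
	unsigned long two_gig = 2UL << (30 - PAGE_SHIFT);	/* 2G in pages */
	unsigned long fraction = node_spanned_pages >> 8;	/* 1/256 of the node */

	return fraction > two_gig ? fraction : two_gig;
}

int main(void)
{
	/* 256GB node: 1/256 is only 1GB, so the 2GB floor wins (prints 2048). */
	printf("256GB node: %lu MB\n",
	       max_initialise(256UL << (30 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT));
	/* 4TB node: 1/256 is 16GB, well past the floor (prints 16384). */
	printf("4TB node:   %lu MB\n",
	       max_initialise(4UL << (40 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT));
	return 0;
}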
@@ -416,7 +438,7 @@ static void bad_page(struct page *page, const char *reason,
                        goto out;
                }
                if (nr_unshown) {
-                       printk(KERN_ALERT
+                       pr_alert(
                              "BUG: Bad page state: %lu messages suppressed\n",
                                nr_unshown);
                        nr_unshown = 0;
@@ -426,9 +448,14 @@ static void bad_page(struct page *page, const char *reason,
        if (nr_shown++ == 0)
                resume = jiffies + 60 * HZ;
 
-       printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
+       pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
                current->comm, page_to_pfn(page));
-       dump_page_badflags(page, reason, bad_flags);
+       __dump_page(page, reason);
+       bad_flags &= page->flags;
+       if (bad_flags)
+               pr_alert("bad because of flags: %#lx(%pGp)\n",
+                                               bad_flags, &bad_flags);
+       dump_page_owner(page);
 
        print_modules();
        dump_stack();
@@ -477,7 +504,9 @@ void prep_compound_page(struct page *page, unsigned int order)
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_pagealloc_enabled __read_mostly
+                       = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
 bool _debug_guardpage_enabled __read_mostly;
 
 static int __init early_debug_pagealloc(char *buf)
@@ -488,6 +517,9 @@ static int __init early_debug_pagealloc(char *buf)
        if (strcmp(buf, "on") == 0)
                _debug_pagealloc_enabled = true;
 
+       if (strcmp(buf, "off") == 0)
+               _debug_pagealloc_enabled = false;
+
        return 0;
 }
 early_param("debug_pagealloc", early_debug_pagealloc);
@@ -519,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf)
        unsigned long res;
 
        if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
-               printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+               pr_err("Bad debug_guardpage_minorder value\n");
                return 0;
        }
        _debug_guardpage_minorder = res;
-       printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+       pr_info("Setting debug_guardpage_minorder to %lu\n", res);
        return 0;
 }
 __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
@@ -741,7 +773,7 @@ static inline int free_pages_check(struct page *page)
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
-       if (unlikely(atomic_read(&page->_count) != 0))
+       if (unlikely(page_ref_count(page) != 0))
                bad_reason = "nonzero _count";
        if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
                bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
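
Several hunks in this patch replace direct atomic operations on page->_count with the new page_ref_* helpers. A minimal userspace model of their semantics, inferred only from the substitutions visible in this diff (the struct below is a stand-in, not the kernel's struct page):

#include <stdatomic.h>
#include <stdio.h>

struct page { atomic_int _count; };	/* stand-in for the page refcount */

static int page_ref_count(struct page *page)
{
	return atomic_load(&page->_count);
}

static void page_ref_add(struct page *page, int nr)
{
	atomic_fetch_add(&page->_count, nr);
}

static int page_ref_sub_and_test(struct page *page, int nr)
{
	/* true when the subtraction drops the count to zero */
	return atomic_fetch_sub(&page->_count, nr) - nr == 0;
}

static void set_page_count(struct page *page, int v)
{
	atomic_store(&page->_count, v);
}

int main(void)
{
	struct page p;

	atomic_init(&p._count, 1);
	page_ref_add(&p, 3);
	printf("count after add: %d\n", page_ref_count(&p));		/* 4 */
	printf("dropped to zero: %d\n", page_ref_sub_and_test(&p, 4));	/* 1 */
	set_page_count(&p, 1);
	printf("reset count:     %d\n", page_ref_count(&p));		/* 1 */
	return 0;
}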
@@ -1002,6 +1034,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
                                           PAGE_SIZE << order);
        }
        arch_free_page(page, order);
+       kernel_poison_pages(page, 1 << order, 0);
        kernel_map_pages(page, 1 << order, 0);
 
        return true;
@@ -1104,6 +1137,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
        return __free_pages_boot_core(page, pfn, order);
 }
 
+/*
+ * Check that the whole of a pageblock (or a subset of it), given by the
+ * interval [start_pfn, end_pfn), is valid and lies within the same zone
+ * before scanning it with the migration or free compaction scanner. The
+ * scanners then only need the pfn_valid_within() check for arches that
+ * allow holes within pageblocks.
+ *
+ * Returns the struct page pointer for start_pfn, or NULL if the checks fail.
+ *
+ * It's possible on some configurations to have a setup like node0 node1
+ * node0, i.e. not all pages within a zone's spanned range belong to that
+ * zone. We assume that a border between node0 and node1 can occur within a
+ * single pageblock, but not a node0 node1 node0 interleaving within one
+ * pageblock. It is therefore sufficient to check the first and last page of
+ * a pageblock instead of each individual page.
+ */
+struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+                                    unsigned long end_pfn, struct zone *zone)
+{
+       struct page *start_page;
+       struct page *end_page;
+
+       /* end_pfn is one past the range we are checking */
+       end_pfn--;
+
+       if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+               return NULL;
+
+       start_page = pfn_to_page(start_pfn);
+
+       if (page_zone(start_page) != zone)
+               return NULL;
+
+       end_page = pfn_to_page(end_pfn);
+
+       /* This gives shorter code than deriving page_zone(end_page) */
+       if (page_zone_id(start_page) != page_zone_id(end_page))
+               return NULL;
+
+       return start_page;
+}
+
+void set_zone_contiguous(struct zone *zone)
+{
+       unsigned long block_start_pfn = zone->zone_start_pfn;
+       unsigned long block_end_pfn;
+
+       block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+       for (; block_start_pfn < zone_end_pfn(zone);
+                       block_start_pfn = block_end_pfn,
+                        block_end_pfn += pageblock_nr_pages) {
+
+               block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+               if (!__pageblock_pfn_to_page(block_start_pfn,
+                                            block_end_pfn, zone))
+                       return;
+       }
+
+       /* We confirm that there is no hole */
+       zone->contiguous = true;
+}
+
+void clear_zone_contiguous(struct zone *zone)
+{
+       zone->contiguous = false;
+}
+
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static void __init deferred_free_range(struct page *page,
                                        unsigned long pfn, int nr_pages)
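
set_zone_contiguous() above walks the zone one pageblock at a time, clamping the final block to the zone end and giving up as soon as __pageblock_pfn_to_page() reports a hole or a zone boundary; only if every block passes is zone->contiguous set, letting the compaction scanners skip these checks later. A userspace sketch of just the iteration pattern (the pageblock size and zone bounds are invented for illustration):

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL		/* e.g. order-9 blocks with 4K pages */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical zone that starts and ends in the middle of a pageblock. */
	unsigned long zone_start_pfn = 0x1040;
	unsigned long zone_end_pfn = 0x2130;
	unsigned long block_start_pfn = zone_start_pfn;
	unsigned long block_end_pfn = ALIGN(block_start_pfn + 1, PAGEBLOCK_NR_PAGES);

	for (; block_start_pfn < zone_end_pfn;
	     block_start_pfn = block_end_pfn,
	     block_end_pfn += PAGEBLOCK_NR_PAGES) {
		block_end_pfn = min_ul(block_end_pfn, zone_end_pfn);
		/* the kernel calls __pageblock_pfn_to_page() here and returns
		 * early if the block straddles a hole or a zone boundary */
		printf("check [%#lx, %#lx)\n", block_start_pfn, block_end_pfn);
	}
	return 0;
}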
@@ -1254,9 +1356,13 @@ free_range:
        pgdat_init_report_one_done();
        return 0;
 }
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 void __init page_alloc_init_late(void)
 {
+       struct zone *zone;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        int nid;
 
        /* There will be num_node_state(N_MEMORY) threads */
@@ -1270,8 +1376,11 @@ void __init page_alloc_init_late(void)
 
        /* Reinit limits that are based on free pages after the kernel is up */
        files_maxfiles_init();
+#endif
+
+       for_each_populated_zone(zone)
+               set_zone_contiguous(zone);
 }
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 #ifdef CONFIG_CMA
 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
@@ -1360,7 +1469,7 @@ static inline int check_new_page(struct page *page)
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
-       if (unlikely(atomic_read(&page->_count) != 0))
+       if (unlikely(page_ref_count(page) != 0))
                bad_reason = "nonzero _count";
        if (unlikely(page->flags & __PG_HWPOISON)) {
                bad_reason = "HWPoisoned (hardware-corrupted)";
@@ -1381,15 +1490,24 @@ static inline int check_new_page(struct page *page)
        return 0;
 }
 
+static inline bool free_pages_prezeroed(bool poisoned)
+{
+       return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+               page_poisoning_enabled() && poisoned;
+}
+
 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
                                                                int alloc_flags)
 {
        int i;
+       bool poisoned = true;
 
        for (i = 0; i < (1 << order); i++) {
                struct page *p = page + i;
                if (unlikely(check_new_page(p)))
                        return 1;
+               if (poisoned)
+                       poisoned &= page_is_poisoned(p);
        }
 
        set_page_private(page, 0);
@@ -1397,9 +1515,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 
        arch_alloc_page(page, order);
        kernel_map_pages(page, 1 << order, 1);
+       kernel_poison_pages(page, 1 << order, 1);
        kasan_alloc_pages(page, order);
 
-       if (gfp_flags & __GFP_ZERO)
+       if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
                for (i = 0; i < (1 << order); i++)
                        clear_highpage(page + i);
 
@@ -2238,19 +2357,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                list_del(&page->lru);
                pcp->count--;
        } else {
-               if (unlikely(gfp_flags & __GFP_NOFAIL)) {
-                       /*
-                        * __GFP_NOFAIL is not to be used in new code.
-                        *
-                        * All __GFP_NOFAIL callers should be fixed so that they
-                        * properly detect and handle allocation failures.
-                        *
-                        * We most definitely don't want callers attempting to
-                        * allocate greater than order-1 page units with
-                        * __GFP_NOFAIL.
-                        */
-                       WARN_ON_ONCE(order > 1);
-               }
+               /*
+                * We most definitely don't want callers attempting to
+                * allocate greater than order-1 page units with __GFP_NOFAIL.
+                */
+               WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
                spin_lock_irqsave(&zone->lock, flags);
 
                page = NULL;
@@ -2690,9 +2801,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
                va_end(args);
        }
 
-       pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
-               current->comm, order, gfp_mask);
-
+       pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
+               current->comm, order, gfp_mask, &gfp_mask);
        dump_stack();
        if (!should_suppress_show_mem())
                show_mem(filter);
@@ -2748,8 +2858,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                         * XXX: Page reclaim didn't yield anything,
                         * and the OOM killer can't be invoked, but
                         * keep looping as per tradition.
+                        *
+                        * But do not keep looping if oom_killer_disable()
+                        * was already called, for the system is trying to
+                        * enter a quiescent state during suspend.
                         */
-                       *did_some_progress = 1;
+                       *did_some_progress = !oom_killer_disabled;
                        goto out;
                }
                if (pm_suspended_storage())
@@ -3008,14 +3122,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                gfp_mask &= ~__GFP_ATOMIC;
 
-       /*
-        * If this allocation cannot block and it is for a specific node, then
-        * fail early.  There's no need to wakeup kswapd or retry for a
-        * speculative node-specific allocation.
-        */
-       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
-               goto nopage;
-
 retry:
        if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                wake_all_kswapds(order, ac);
@@ -3372,7 +3478,7 @@ refill:
                /* Even if we own the page, we do not use atomic_set().
                 * This would break get_page_unless_zero() users.
                 */
-               atomic_add(size - 1, &page->_count);
+               page_ref_add(page, size - 1);
 
                /* reset page count bias and offset to start of new frag */
                nc->pfmemalloc = page_is_pfmemalloc(page);
@@ -3384,7 +3490,7 @@ refill:
        if (unlikely(offset < 0)) {
                page = virt_to_page(nc->va);
 
-               if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+               if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
                        goto refill;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -3392,7 +3498,7 @@ refill:
                size = nc->size;
 #endif
                /* OK, page count is 0, we can safely set it */
-               atomic_set(&page->_count, size);
+               set_page_count(page, size);
 
                /* reset page count bias and offset to start of new frag */
                nc->pagecnt_bias = size;
@@ -3603,6 +3709,49 @@ static inline void show_node(struct zone *zone)
                printk("Node %d ", zone_to_nid(zone));
 }
 
+long si_mem_available(void)
+{
+       long available;
+       unsigned long pagecache;
+       unsigned long wmark_low = 0;
+       unsigned long pages[NR_LRU_LISTS];
+       struct zone *zone;
+       int lru;
+
+       for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+               pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+       for_each_zone(zone)
+               wmark_low += zone->watermark[WMARK_LOW];
+
+       /*
+        * Estimate the amount of memory available for userspace allocations,
+        * without causing swapping.
+        */
+       available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+       /*
+        * Not all the page cache can be freed, otherwise the system will
+        * start swapping. Assume at least half of the page cache, or the
+        * low watermark worth of cache, needs to stay.
+        */
+       pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+       pagecache -= min(pagecache / 2, wmark_low);
+       available += pagecache;
+
+       /*
+        * Part of the reclaimable slab consists of items that are in use,
+        * and cannot be freed. Cap this estimate at the low watermark.
+        */
+       available += global_page_state(NR_SLAB_RECLAIMABLE) -
+                    min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+       if (available < 0)
+               available = 0;
+       return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
 void si_meminfo(struct sysinfo *val)
 {
        val->totalram = totalram_pages;
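
si_mem_available() reproduces the MemAvailable heuristic: free pages minus the total reserve, plus whatever part of the file LRU and of reclaimable slab exceeds min(half of it, the sum of the low watermarks). A worked example with invented figures (all values in 4K pages; nothing here is read from a real system):

#include <stdio.h>

static long min_l(long a, long b) { return a < b ? a : b; }

int main(void)
{
	/* Illustrative snapshot of a roughly 16GB machine, in 4K pages. */
	long nr_free = 300000;
	long totalreserve = 20000;
	long active_file = 900000, inactive_file = 700000;
	long slab_reclaimable = 150000;
	long wmark_low = 16000;		/* sum of all zones' low watermarks */

	long pagecache = active_file + inactive_file;
	long available = nr_free - totalreserve;

	available += pagecache - min_l(pagecache / 2, wmark_low);
	available += slab_reclaimable - min_l(slab_reclaimable / 2, wmark_low);
	if (available < 0)
		available = 0;

	printf("estimated available: %ld pages (%ld MB)\n",
	       available, available * 4 / 1024);
	return 0;
}

With these figures the estimate comes to 1,998,000 pages, so the program prints 7804 MB as available without pushing the system into swap.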
@@ -3935,9 +4084,7 @@ static int __parse_numa_zonelist_order(char *s)
        } else if (*s == 'z' || *s == 'Z') {
                user_zonelist_order = ZONELIST_ORDER_ZONE;
        } else {
-               printk(KERN_WARNING
-                       "Ignoring invalid numa_zonelist_order value:  "
-                       "%s\n", s);
+               pr_warn("Ignoring invalid numa_zonelist_order value:  %s\n", s);
                return -EINVAL;
        }
        return 0;
@@ -4401,12 +4548,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
        else
                page_group_by_mobility_disabled = 0;
 
-       pr_info("Built %i zonelists in %s order, mobility grouping %s.  "
-               "Total pages: %ld\n",
-                       nr_online_nodes,
-                       zonelist_order_name[current_zonelist_order],
-                       page_group_by_mobility_disabled ? "off" : "on",
-                       vm_total_pages);
+       pr_info("Built %i zonelists in %s order, mobility grouping %s.  Total pages: %ld\n",
+               nr_online_nodes,
+               zonelist_order_name[current_zonelist_order],
+               page_group_by_mobility_disabled ? "off" : "on",
+               vm_total_pages);
 #ifdef CONFIG_NUMA
        pr_info("Policy zone: %s\n", zone_names[policy_zone]);
 #endif
@@ -4491,6 +4637,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
        pg_data_t *pgdat = NODE_DATA(nid);
        unsigned long pfn;
        unsigned long nr_initialised = 0;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+       struct memblock_region *r = NULL, *tmp;
+#endif
 
        if (highest_memmap_pfn < end_pfn - 1)
                highest_memmap_pfn = end_pfn - 1;
@@ -4504,20 +4653,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                /*
-                * There can be holes in boot-time mem_map[]s
-                * handed to this function.  They do not
-                * exist on hotplugged memory.
+                * There can be holes in boot-time mem_map[]s handed to this
+                * function.  They do not exist on hotplugged memory.
+                */
+               if (context != MEMMAP_EARLY)
+                       goto not_early;
+
+               if (!early_pfn_valid(pfn))
+                       continue;
+               if (!early_pfn_in_nid(pfn, nid))
+                       continue;
+               if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
+                       break;
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+               /*
+                * If kernelcore is not mirrored and ZONE_MOVABLE exists, the
+                * range from zone_movable_pfn[nid] to the end of each node
+                * should be ZONE_MOVABLE, not ZONE_NORMAL; skip it.
                 */
-               if (context == MEMMAP_EARLY) {
-                       if (!early_pfn_valid(pfn))
+               if (!mirrored_kernelcore && zone_movable_pfn[nid])
+                       if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
                                continue;
-                       if (!early_pfn_in_nid(pfn, nid))
+
+               /*
+                * Check the memblock attribute set by firmware, which can
+                * affect the kernel memory layout.  If zone == ZONE_MOVABLE
+                * but the memory is mirrored, it is an overlapped memmap
+                * init; skip it.
+                */
+               if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+                       if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
+                               for_each_memblock(memory, tmp)
+                                       if (pfn < memblock_region_memory_end_pfn(tmp))
+                                               break;
+                               r = tmp;
+                       }
+                       if (pfn >= memblock_region_memory_base_pfn(r) &&
+                           memblock_is_mirror(r)) {
+                               /* already initialized as NORMAL */
+                               pfn = memblock_region_memory_end_pfn(r);
                                continue;
-                       if (!update_defer_init(pgdat, pfn, end_pfn,
-                                               &nr_initialised))
-                               break;
+                       }
                }
+#endif
 
+not_early:
                /*
                 * Mark the block movable so that blocks are reserved for
                 * movable at startup. This will force kernel allocations
@@ -4934,11 +5114,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
                        *zone_end_pfn = min(node_end_pfn,
                                arch_zone_highest_possible_pfn[movable_zone]);
 
-               /* Adjust for ZONE_MOVABLE starting within this range */
-               } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
-                               *zone_end_pfn > zone_movable_pfn[nid]) {
-                       *zone_end_pfn = zone_movable_pfn[nid];
-
                /* Check if this whole range is within ZONE_MOVABLE */
                } else if (*zone_start_pfn >= zone_movable_pfn[nid])
                        *zone_start_pfn = *zone_end_pfn;
@@ -4953,31 +5128,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                        unsigned long zone_type,
                                        unsigned long node_start_pfn,
                                        unsigned long node_end_pfn,
+                                       unsigned long *zone_start_pfn,
+                                       unsigned long *zone_end_pfn,
                                        unsigned long *ignored)
 {
-       unsigned long zone_start_pfn, zone_end_pfn;
-
        /* When hot-adding a new node from cpu_up(), the node should be empty */
        if (!node_start_pfn && !node_end_pfn)
                return 0;
 
        /* Get the start and end of the zone */
-       zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
-       zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+       *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+       *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
        adjust_zone_range_for_zone_movable(nid, zone_type,
                                node_start_pfn, node_end_pfn,
-                               &zone_start_pfn, &zone_end_pfn);
+                               zone_start_pfn, zone_end_pfn);
 
        /* Check that this node has pages within the zone's required range */
-       if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+       if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
                return 0;
 
        /* Move the zone boundaries inside the node if necessary */
-       zone_end_pfn = min(zone_end_pfn, node_end_pfn);
-       zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+       *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+       *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
 
        /* Return the spanned pages */
-       return zone_end_pfn - zone_start_pfn;
+       return *zone_end_pfn - *zone_start_pfn;
 }
 
 /*
@@ -5023,6 +5198,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
        unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
        unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
        unsigned long zone_start_pfn, zone_end_pfn;
+       unsigned long nr_absent;
 
        /* When hot-adding a new node from cpu_up(), the node should be empty */
        if (!node_start_pfn && !node_end_pfn)
@@ -5034,7 +5210,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
        adjust_zone_range_for_zone_movable(nid, zone_type,
                        node_start_pfn, node_end_pfn,
                        &zone_start_pfn, &zone_end_pfn);
-       return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+       nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+       /*
+        * ZONE_MOVABLE handling.
+        * Pages that belong to ZONE_MOVABLE but fall within ZONE_NORMAL's
+        * range are treated as absent, and vice versa.
+        */
+       if (zone_movable_pfn[nid]) {
+               if (mirrored_kernelcore) {
+                       unsigned long start_pfn, end_pfn;
+                       struct memblock_region *r;
+
+                       for_each_memblock(memory, r) {
+                               start_pfn = clamp(memblock_region_memory_base_pfn(r),
+                                                 zone_start_pfn, zone_end_pfn);
+                               end_pfn = clamp(memblock_region_memory_end_pfn(r),
+                                               zone_start_pfn, zone_end_pfn);
+
+                               if (zone_type == ZONE_MOVABLE &&
+                                   memblock_is_mirror(r))
+                                       nr_absent += end_pfn - start_pfn;
+
+                               if (zone_type == ZONE_NORMAL &&
+                                   !memblock_is_mirror(r))
+                                       nr_absent += end_pfn - start_pfn;
+                       }
+               } else {
+                       if (zone_type == ZONE_NORMAL)
+                               nr_absent += node_end_pfn - zone_movable_pfn[nid];
+               }
+       }
+
+       return nr_absent;
 }
 
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
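
Under kernelcore=mirror the same node range is spanned by both ZONE_NORMAL and ZONE_MOVABLE, and the accounting above decides which zone actually owns each memblock region: mirrored regions are counted as absent from ZONE_MOVABLE (they stay kernel-usable), non-mirrored regions as absent from ZONE_NORMAL. A simplified userspace model with an invented layout of one 16GB node whose first 4GB is mirrored (4K pages):

#include <stdbool.h>
#include <stdio.h>

struct region {
	unsigned long base_pfn, end_pfn;
	bool mirrored;
};

int main(void)
{
	struct region node[] = {
		{ 0x000000, 0x100000, true },	/* 0-4GB, mirrored */
		{ 0x100000, 0x400000, false },	/* 4-16GB, not mirrored */
	};
	unsigned long span = 0x400000;		/* whole node, in pages */
	unsigned long absent_normal = 0, absent_movable = 0;
	unsigned int i;

	for (i = 0; i < sizeof(node) / sizeof(node[0]); i++) {
		unsigned long len = node[i].end_pfn - node[i].base_pfn;

		if (node[i].mirrored)
			absent_movable += len;	/* mirrored memory stays in ZONE_NORMAL */
		else
			absent_normal += len;	/* unmirrored memory goes to ZONE_MOVABLE */
	}

	printf("ZONE_NORMAL  present: %lu pages\n", span - absent_normal);
	printf("ZONE_MOVABLE present: %lu pages\n", span - absent_movable);
	return 0;
}

So the mirrored 4GB ends up in ZONE_NORMAL and the remaining 12GB in ZONE_MOVABLE, which is the point of the option: only mirrored memory serves kernel allocations.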
@@ -5042,8 +5250,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
                                        unsigned long zone_type,
                                        unsigned long node_start_pfn,
                                        unsigned long node_end_pfn,
+                                       unsigned long *zone_start_pfn,
+                                       unsigned long *zone_end_pfn,
                                        unsigned long *zones_size)
 {
+       unsigned int zone;
+
+       *zone_start_pfn = node_start_pfn;
+       for (zone = 0; zone < zone_type; zone++)
+               *zone_start_pfn += zones_size[zone];
+
+       *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
+
        return zones_size[zone_type];
 }
 
@@ -5072,15 +5290,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
 
        for (i = 0; i < MAX_NR_ZONES; i++) {
                struct zone *zone = pgdat->node_zones + i;
+               unsigned long zone_start_pfn, zone_end_pfn;
                unsigned long size, real_size;
 
                size = zone_spanned_pages_in_node(pgdat->node_id, i,
                                                  node_start_pfn,
                                                  node_end_pfn,
+                                                 &zone_start_pfn,
+                                                 &zone_end_pfn,
                                                  zones_size);
                real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
                                                  node_start_pfn, node_end_pfn,
                                                  zholes_size);
+               if (size)
+                       zone->zone_start_pfn = zone_start_pfn;
+               else
+                       zone->zone_start_pfn = 0;
                zone->spanned_pages = size;
                zone->present_pages = real_size;
 
@@ -5201,7 +5426,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 {
        enum zone_type j;
        int nid = pgdat->node_id;
-       unsigned long zone_start_pfn = pgdat->node_start_pfn;
        int ret;
 
        pgdat_resize_init(pgdat);
@@ -5217,11 +5441,15 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 #endif
        init_waitqueue_head(&pgdat->kswapd_wait);
        init_waitqueue_head(&pgdat->pfmemalloc_wait);
+#ifdef CONFIG_COMPACTION
+       init_waitqueue_head(&pgdat->kcompactd_wait);
+#endif
        pgdat_page_ext_init(pgdat);
 
        for (j = 0; j < MAX_NR_ZONES; j++) {
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize, freesize, memmap_pages;
+               unsigned long zone_start_pfn = zone->zone_start_pfn;
 
                size = zone->spanned_pages;
                realsize = freesize = zone->present_pages;
@@ -5240,8 +5468,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
                                               "  %s zone: %lu pages used for memmap\n",
                                               zone_names[j], memmap_pages);
                        } else
-                               printk(KERN_WARNING
-                                       "  %s zone: %lu pages exceeds freesize %lu\n",
+                               pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
                                        zone_names[j], memmap_pages, freesize);
                }
 
@@ -5290,7 +5517,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
                ret = init_currently_empty_zone(zone, zone_start_pfn, size);
                BUG_ON(ret);
                memmap_init(size, nid, j, zone_start_pfn);
-               zone_start_pfn += size;
        }
 }
 
@@ -5358,6 +5584,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
                (u64)start_pfn << PAGE_SHIFT,
                end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+#else
+       start_pfn = node_start_pfn;
 #endif
        calculate_node_totalpages(pgdat, start_pfn, end_pfn,
                                  zones_size, zholes_size);
@@ -5448,8 +5676,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
                min_pfn = min(min_pfn, start_pfn);
 
        if (min_pfn == ULONG_MAX) {
-               printk(KERN_WARNING
-                       "Could not find start_pfn for node %d\n", nid);
+               pr_warn("Could not find start_pfn for node %d\n", nid);
                return 0;
        }
 
@@ -5528,6 +5755,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
                goto out2;
        }
 
+       /*
+        * If kernelcore=mirror is specified, ignore the movablecore option.
+        */
+       if (mirrored_kernelcore) {
+               bool mem_below_4gb_not_mirrored = false;
+
+               for_each_memblock(memory, r) {
+                       if (memblock_is_mirror(r))
+                               continue;
+
+                       nid = r->nid;
+
+                       usable_startpfn = memblock_region_memory_base_pfn(r);
+
+                       if (usable_startpfn < 0x100000) {
+                               mem_below_4gb_not_mirrored = true;
+                               continue;
+                       }
+
+                       zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+                               min(usable_startpfn, zone_movable_pfn[nid]) :
+                               usable_startpfn;
+               }
+
+               if (mem_below_4gb_not_mirrored)
+                       pr_warn("This configuration results in unmirrored kernel memory.\n");
+
+               goto out2;
+       }
+
        /*
         * If movablecore=nn[KMG] was specified, calculate what size of
         * kernelcore that corresponds so that memory usable for
@@ -5788,6 +6045,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
  */
 static int __init cmdline_parse_kernelcore(char *p)
 {
+       /* parse kernelcore=mirror */
+       if (parse_option_str(p, "mirror")) {
+               mirrored_kernelcore = true;
+               return 0;
+       }
+
        return cmdline_parse_core(p, &required_kernelcore);
 }
 
@@ -5885,22 +6148,21 @@ void __init mem_init_print_info(const char *str)
 
 #undef adj_init_size
 
-       pr_info("Memory: %luK/%luK available "
-              "(%luK kernel code, %luK rwdata, %luK rodata, "
-              "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
+       pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
 #ifdef CONFIG_HIGHMEM
-              ", %luK highmem"
+               ", %luK highmem"
 #endif
-              "%s%s)\n",
-              nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
-              codesize >> 10, datasize >> 10, rosize >> 10,
-              (init_data_size + init_code_size) >> 10, bss_size >> 10,
-              (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
-              totalcma_pages << (PAGE_SHIFT-10),
+               "%s%s)\n",
+               nr_free_pages() << (PAGE_SHIFT - 10),
+               physpages << (PAGE_SHIFT - 10),
+               codesize >> 10, datasize >> 10, rosize >> 10,
+               (init_data_size + init_code_size) >> 10, bss_size >> 10,
+               (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+               totalcma_pages << (PAGE_SHIFT - 10),
 #ifdef CONFIG_HIGHMEM
-              totalhigh_pages << (PAGE_SHIFT-10),
+               totalhigh_pages << (PAGE_SHIFT - 10),
 #endif
-              str ? ", " : "", str ? str : "");
+               str ? ", " : "", str ? str : "");
 }
 
 /**
@@ -6075,8 +6337,17 @@ static void __setup_per_zone_wmarks(void)
                        zone->watermark[WMARK_MIN] = tmp;
                }
 
-               zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-               zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+               /*
+                * Set the distance between the kswapd watermarks according
+                * to the scale factor, in proportion to available memory,
+                * but ensure a minimum size on small systems.
+                */
+               tmp = max_t(u64, tmp >> 2,
+                           mult_frac(zone->managed_pages,
+                                     watermark_scale_factor, 10000));
+
+               zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
+               zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
                __mod_zone_page_state(zone, NR_ALLOC_BATCH,
                        high_wmark_pages(zone) - low_wmark_pages(zone) -
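
The distance from the min watermark to the low watermark is now max(min_wmark / 4, managed_pages * watermark_scale_factor / 10000), with the high watermark twice as far: the default factor of 10 keeps the old min/4 spacing unless 0.1% of the zone is larger than that, which mainly matters on big-memory machines. A rough userspace sketch with invented zone sizes (the kernel uses mult_frac() for the scaled term to avoid overflow; the plain division below only approximates it):

#include <stdio.h>

static unsigned long wmark_gap(unsigned long min_wmark,
			       unsigned long managed_pages,
			       unsigned long scale_factor)
{
	unsigned long by_min = min_wmark >> 2;			/* old behaviour: min/4 */
	unsigned long by_scale = managed_pages / 10000 * scale_factor;

	return by_scale > by_min ? by_scale : by_min;
}

int main(void)
{
	/* Hypothetical 64GB zone (4K pages); 8192 pages is in the ballpark of
	 * what the min_free_kbytes heuristic would give it. */
	unsigned long managed = 16UL << 20;	/* 16M pages */
	unsigned long min_wmark = 8192;

	printf("factor  10 (default): low = min + %lu pages\n",
	       wmark_gap(min_wmark, managed, 10));
	printf("factor 200:           low = min + %lu pages\n",
	       wmark_gap(min_wmark, managed, 200));
	return 0;
}

The vm.watermark_scale_factor sysctl added further down calls setup_per_zone_wmarks() on every write, so raising the factor widens the gap (and wakes kswapd earlier) immediately.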
@@ -6217,6 +6488,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
        return 0;
 }
 
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+       void __user *buffer, size_t *length, loff_t *ppos)
+{
+       int rc;
+
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (rc)
+               return rc;
+
+       if (write)
+               setup_per_zone_wmarks();
+
+       return 0;
+}
+
 #ifdef CONFIG_NUMA
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
@@ -6408,11 +6694,8 @@ void *__init alloc_large_system_hash(const char *tablename,
        if (!table)
                panic("Failed to allocate %s hash table\n", tablename);
 
-       printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
-              tablename,
-              (1UL << log2qty),
-              ilog2(size) - PAGE_SHIFT,
-              size);
+       pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
+               tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
 
        if (_hash_shift)
                *_hash_shift = log2qty;
@@ -6563,7 +6846,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
                 * This check already skips compound tails of THP
                 * because their page->_count is zero at all time.
                 */
-               if (!atomic_read(&page->_count)) {
+               if (!page_ref_count(page)) {
                        if (PageBuddy(page))
                                iter += (1 << page_order(page)) - 1;
                        continue;
@@ -6913,8 +7196,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
                BUG_ON(!PageBuddy(page));
                order = page_order(page);
 #ifdef CONFIG_DEBUG_VM
-               printk(KERN_INFO "remove from free list %lx %d %lx\n",
-                      pfn, 1 << order, end_pfn);
+               pr_info("remove from free list %lx %d %lx\n",
+                       pfn, 1 << order, end_pfn);
 #endif
                list_del(&page->lru);
                rmv_page_order(page);
@@ -6927,7 +7210,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_FAILURE
 bool is_free_buddy_page(struct page *page)
 {
        struct zone *zone = page_zone(page);
@@ -6946,4 +7228,3 @@ bool is_free_buddy_page(struct page *page)
 
        return order < MAX_ORDER;
 }
-#endif