diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2ef1c17942fa6934af3fe5dfe99c630002b8e80..a919ba5cb3c845e03e4a070eff354acf19ec7c4a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -20,7 +20,6 @@
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
-#include <linux/bootmem.h>
 #include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
@@ -66,6 +65,7 @@
 #include <linux/ftrace.h>
 #include <linux/lockdep.h>
 #include <linux/nmi.h>
+#include <linux/psi.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -306,24 +306,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
 }
 
 /*
- * Returns false when the remaining initialisation should be deferred until
+ * Returns true when the remaining initialisation should be deferred until
  * later in the boot cycle when it can be parallelised.
  */
-static inline bool update_defer_init(pg_data_t *pgdat,
-                               unsigned long pfn, unsigned long zone_end,
-                               unsigned long *nr_initialised)
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
 {
+       static unsigned long prev_end_pfn, nr_initialised;
+
+       /*
+        * prev_end_pfn is a static that caches the end of the previous zone.
+        * No locking needed: this runs very early in boot, before smp_init().
+        */
+       if (prev_end_pfn != end_pfn) {
+               prev_end_pfn = end_pfn;
+               nr_initialised = 0;
+       }
+
        /* Always populate low zones for address-constrained allocations */
-       if (zone_end < pgdat_end_pfn(pgdat))
-               return true;
-       (*nr_initialised)++;
-       if ((*nr_initialised > pgdat->static_init_pgcnt) &&
-           (pfn & (PAGES_PER_SECTION - 1)) == 0) {
-               pgdat->first_deferred_pfn = pfn;
+       if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
                return false;
+       nr_initialised++;
+       if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) &&
+           (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+               NODE_DATA(nid)->first_deferred_pfn = pfn;
+               return true;
        }
-
-       return true;
+       return false;
 }
 #else
 static inline bool early_page_uninitialised(unsigned long pfn)
@@ -331,11 +340,9 @@ static inline bool early_page_uninitialised(unsigned long pfn)
        return false;
 }
 
-static inline bool update_defer_init(pg_data_t *pgdat,
-                               unsigned long pfn, unsigned long zone_end,
-                               unsigned long *nr_initialised)
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
 {
-       return true;
+       return false;
 }
 #endif
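The return convention is inverted relative to the old update_defer_init(): true now means "defer the rest of this zone", not "keep going". A minimal sketch of the caller pattern, mirroring the memmap_init_zone() loop further down in this diff (loop body elided):

        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                /* Stop once the eager-init budget for this node is spent;
                 * the remaining pages are initialised later, in parallel,
                 * during deferred struct page init. */
                if (defer_init(nid, pfn, end_pfn))
                        break;
                /* ... __init_single_page() etc. for this pfn ... */
        }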
 
@@ -1231,7 +1238,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
                        /* Avoid false-positive PageTail() */
                        INIT_LIST_HEAD(&page->lru);
 
-                       SetPageReserved(page);
+                       /*
+                        * No need for an atomic set_bit: the struct page
+                        * is not yet visible to anyone else, so nothing
+                        * can be accessing it concurrently.
+                        */
+                       __SetPageReserved(page);
                }
        }
 }
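For context, the two setters differ only in the bit primitive they expand to; modulo the PF_NO_COMPOUND policy checks, the page-flags macros generate roughly the following (a sketch, not the literal header contents):

        static inline void SetPageReserved(struct page *page)
        {
                set_bit(PG_reserved, &page->flags);     /* atomic read-modify-write */
        }

        static inline void __SetPageReserved(struct page *page)
        {
                __set_bit(PG_reserved, &page->flags);   /* plain store; caller guarantees exclusivity */
        }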
@@ -1326,7 +1338,7 @@ meminit_pfn_in_nid(unsigned long pfn, int node,
 #endif
 
 
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
                                                        unsigned int order)
 {
        if (early_page_uninitialised(pfn))
@@ -2015,10 +2027,6 @@ static int move_freepages(struct zone *zone,
                  pfn_valid(page_to_pfn(end_page)) &&
                  page_zone(start_page) != page_zone(end_page));
 #endif
-
-       if (num_movable)
-               *num_movable = 0;
-
        for (page = start_page; page <= end_page;) {
                if (!pfn_valid_within(page_to_pfn(page))) {
                        page++;
@@ -2058,6 +2066,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
        unsigned long start_pfn, end_pfn;
        struct page *start_page, *end_page;
 
+       if (num_movable)
+               *num_movable = 0;
+
        start_pfn = page_to_pfn(page);
        start_pfn = start_pfn & ~(pageblock_nr_pages-1);
        start_page = pfn_to_page(start_pfn);
@@ -3366,26 +3377,12 @@ try_this_zone:
        return NULL;
 }
 
-/*
- * Large machines with many possible nodes should not always dump per-node
- * meminfo in irq context.
- */
-static inline bool should_suppress_show_mem(void)
-{
-       bool ret = false;
-
-#if NODES_SHIFT > 8
-       ret = in_interrupt();
-#endif
-       return ret;
-}
-
 static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
 {
        unsigned int filter = SHOW_MEM_FILTER_NODES;
        static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
 
-       if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
+       if (!__ratelimit(&show_mem_rs))
                return;
 
        /*
@@ -3549,15 +3546,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                enum compact_priority prio, enum compact_result *compact_result)
 {
        struct page *page;
+       unsigned long pflags;
        unsigned int noreclaim_flag;
 
        if (!order)
                return NULL;
 
+       psi_memstall_enter(&pflags);
        noreclaim_flag = memalloc_noreclaim_save();
+
        *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
                                                                        prio);
+
        memalloc_noreclaim_restore(noreclaim_flag);
+       psi_memstall_leave(&pflags);
 
        if (*compact_result <= COMPACT_INACTIVE)
                return NULL;
@@ -3756,11 +3758,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
        struct reclaim_state reclaim_state;
        int progress;
        unsigned int noreclaim_flag;
+       unsigned long pflags;
 
        cond_resched();
 
        /* We now go into synchronous reclaim */
        cpuset_memory_pressure_bump();
+       psi_memstall_enter(&pflags);
        fs_reclaim_acquire(gfp_mask);
        noreclaim_flag = memalloc_noreclaim_save();
        reclaim_state.reclaimed_slab = 0;
@@ -3772,6 +3776,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
        current->reclaim_state = NULL;
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(gfp_mask);
+       psi_memstall_leave(&pflags);
 
        cond_resched();
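The two call sites instrumented above (direct compaction and direct reclaim) follow the same pattern: the potentially long, stall-inducing section is bracketed with psi_memstall_enter()/psi_memstall_leave() so that the pressure stall accounting attributes the elapsed time to memory. Condensed (the real callers also save and restore the noreclaim flag around the same region):

        unsigned long pflags;

        psi_memstall_enter(&pflags);    /* this task is now stalling on memory */
        /* ... direct compaction or direct reclaim work ... */
        psi_memstall_leave(&pflags);    /* stall over; PSI charges the interval */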
 
@@ -3922,6 +3927,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 {
        struct zone *zone;
        struct zoneref *z;
+       bool ret = false;
 
        /*
         * Costly allocations might have made a progress but this doesn't mean
@@ -3985,25 +3991,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                                }
                        }
 
-                       /*
-                        * Memory allocation/reclaim might be called from a WQ
-                        * context and the current implementation of the WQ
-                        * concurrency control doesn't recognize that
-                        * a particular WQ is congested if the worker thread is
-                        * looping without ever sleeping. Therefore we have to
-                        * do a short sleep here rather than calling
-                        * cond_resched().
-                        */
-                       if (current->flags & PF_WQ_WORKER)
-                               schedule_timeout_uninterruptible(1);
-                       else
-                               cond_resched();
-
-                       return true;
+                       ret = true;
+                       goto out;
                }
        }
 
-       return false;
+out:
+       /*
+        * Memory allocation/reclaim might be called from a WQ context and the
+        * current implementation of the WQ concurrency control doesn't
+        * recognize that a particular WQ is congested if the worker thread is
+        * looping without ever sleeping. Therefore we have to do a short sleep
+        * here rather than calling cond_resched().
+        */
+       if (current->flags & PF_WQ_WORKER)
+               schedule_timeout_uninterruptible(1);
+       else
+               cond_resched();
+       return ret;
 }
 
 static inline bool
@@ -4701,6 +4706,7 @@ long si_mem_available(void)
        unsigned long pagecache;
        unsigned long wmark_low = 0;
        unsigned long pages[NR_LRU_LISTS];
+       unsigned long reclaimable;
        struct zone *zone;
        int lru;
 
@@ -4726,19 +4732,13 @@ long si_mem_available(void)
        available += pagecache;
 
        /*
-        * Part of the reclaimable slab consists of items that are in use,
-        * and cannot be freed. Cap this estimate at the low watermark.
+        * Part of the reclaimable slab and other kernel memory consists of
+        * items that are in use, and cannot be freed. Cap this estimate at the
+        * low watermark.
         */
-       available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
-                    min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
-                        wmark_low);
-
-       /*
-        * Part of the kernel memory, which can be released under memory
-        * pressure.
-        */
-       available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
-               PAGE_SHIFT;
+       reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
+                       global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+       available += reclaimable - min(reclaimable / 2, wmark_low);
 
        if (available < 0)
                available = 0;
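A worked example of the new estimate, with made-up numbers: 100000 reclaimable pages (slab plus misc kernel memory) and a summed low watermark of 20000 pages.

        /* Made-up numbers, for illustration only: */
        reclaimable = 100000;   /* NR_SLAB_RECLAIMABLE + NR_KERNEL_MISC_RECLAIMABLE */
        wmark_low   = 20000;    /* sum of the zones' low watermarks */
        available  += reclaimable - min(reclaimable / 2, wmark_low);   /* += 100000 - 20000 = 80000 */

In other words, at most min(reclaimable/2, wmark_low) of the reclaimable kernel memory is assumed to be pinned by objects still in use.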
@@ -5449,76 +5449,151 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
 #endif
 }
 
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+       static struct memblock_region *r;
+
+       if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+               if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+                       for_each_memblock(memory, r) {
+                               if (*pfn < memblock_region_memory_end_pfn(r))
+                                       break;
+                       }
+               }
+               if (*pfn >= memblock_region_memory_base_pfn(r) &&
+                   memblock_is_mirror(r)) {
+                       *pfn = memblock_region_memory_end_pfn(r);
+                       return true;
+               }
+       }
+#endif
+       return false;
+}
+
 /*
  * Initially all pages are reserved - free ones are freed
- * up by free_all_bootmem() once the early boot process is
+ * up by memblock_free_all() once the early boot process is
  * done. Non-atomic initialization, single-pass.
  */
 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn, enum memmap_context context,
                struct vmem_altmap *altmap)
 {
-       unsigned long end_pfn = start_pfn + size;
-       pg_data_t *pgdat = NODE_DATA(nid);
-       unsigned long pfn;
-       unsigned long nr_initialised = 0;
+       unsigned long pfn, end_pfn = start_pfn + size;
        struct page *page;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-       struct memblock_region *r = NULL, *tmp;
-#endif
 
        if (highest_memmap_pfn < end_pfn - 1)
                highest_memmap_pfn = end_pfn - 1;
 
+#ifdef CONFIG_ZONE_DEVICE
        /*
         * Honor reservation requested by the driver for this ZONE_DEVICE
-        * memory
+        * memory. We limit the total number of pages to initialize to just
+        * those that might contain the memory mapping. We will defer the
+        * ZONE_DEVICE page initialization until after we have released
+        * the hotplug lock.
         */
-       if (altmap && start_pfn == altmap->base_pfn)
-               start_pfn += altmap->reserve;
+       if (zone == ZONE_DEVICE) {
+               if (!altmap)
+                       return;
+
+               if (start_pfn == altmap->base_pfn)
+                       start_pfn += altmap->reserve;
+               end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+       }
+#endif
 
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                /*
                 * There can be holes in boot-time mem_map[]s handed to this
                 * function.  They do not exist on hotplugged memory.
                 */
-               if (context != MEMMAP_EARLY)
-                       goto not_early;
-
-               if (!early_pfn_valid(pfn))
-                       continue;
-               if (!early_pfn_in_nid(pfn, nid))
-                       continue;
-               if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
-                       break;
-
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-               /*
-                * Check given memblock attribute by firmware which can affect
-                * kernel memory layout.  If zone==ZONE_MOVABLE but memory is
-                * mirrored, it's an overlapped memmap init. skip it.
-                */
-               if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
-                       if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
-                               for_each_memblock(memory, tmp)
-                                       if (pfn < memblock_region_memory_end_pfn(tmp))
-                                               break;
-                               r = tmp;
-                       }
-                       if (pfn >= memblock_region_memory_base_pfn(r) &&
-                           memblock_is_mirror(r)) {
-                               /* already initialized as NORMAL */
-                               pfn = memblock_region_memory_end_pfn(r);
+               if (context == MEMMAP_EARLY) {
+                       if (!early_pfn_valid(pfn))
                                continue;
-                       }
+                       if (!early_pfn_in_nid(pfn, nid))
+                               continue;
+                       if (overlap_memmap_init(zone, &pfn))
+                               continue;
+                       if (defer_init(nid, pfn, end_pfn))
+                               break;
                }
-#endif
 
-not_early:
                page = pfn_to_page(pfn);
                __init_single_page(page, pfn, zone, nid);
                if (context == MEMMAP_HOTPLUG)
-                       SetPageReserved(page);
+                       __SetPageReserved(page);
+
+               /*
+                * Mark the block movable so that blocks are reserved for
+                * movable at startup. This will force kernel allocations
+                * to reserve their blocks rather than leaking throughout
+                * the address space during boot when many long-lived
+                * kernel allocations are made.
+                *
+                * The pageblock bitmap is created for the zone's valid pfn
+                * range, but the memmap can extend past it (for alignment).
+                * Check here so that set_pageblock_migratetype() is never
+                * called on a pfn outside the zone.
+                */
+               if (!(pfn & (pageblock_nr_pages - 1))) {
+                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+                       cond_resched();
+               }
+       }
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+void __ref memmap_init_zone_device(struct zone *zone,
+                                  unsigned long start_pfn,
+                                  unsigned long size,
+                                  struct dev_pagemap *pgmap)
+{
+       unsigned long pfn, end_pfn = start_pfn + size;
+       struct pglist_data *pgdat = zone->zone_pgdat;
+       unsigned long zone_idx = zone_idx(zone);
+       unsigned long start = jiffies;
+       int nid = pgdat->node_id;
+
+       if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone)))
+               return;
+
+       /*
+        * The call to memmap_init_zone should have already taken care
+        * of the pages reserved for the memmap, so we can just jump to
+        * the end of that region and start processing the device pages.
+        */
+       if (pgmap->altmap_valid) {
+               struct vmem_altmap *altmap = &pgmap->altmap;
+
+               start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+               size = end_pfn - start_pfn;
+       }
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+               struct page *page = pfn_to_page(pfn);
+
+               __init_single_page(page, pfn, zone_idx, nid);
+
+               /*
+                * Mark the page reserved: it still has to go through the
+                * onlining phase before it is fully associated with a zone.
+                *
+                * We can use the non-atomic __set_bit operation for setting
+                * the flag as we are still initializing the pages.
+                */
+               __SetPageReserved(page);
+
+               /*
+                * ZONE_DEVICE pages union ->lru with a ->pgmap back
+                * pointer and hmm_data.  It is a bug if a ZONE_DEVICE
+                * page is ever freed or placed on a driver-private list.
+                */
+               page->pgmap = pgmap;
+               page->hmm_data = 0;
 
                /*
                 * Mark the block movable so that blocks are reserved for
@@ -5540,8 +5615,12 @@ not_early:
                        cond_resched();
                }
        }
+
+       pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev),
+               size, jiffies_to_msecs(jiffies - start));
 }
 
+#endif
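The point of the split is that memmap_init_zone() now initialises at most the altmap-reserved head of a ZONE_DEVICE range while the hotplug lock is held, and the bulk of the device struct pages is filled in afterwards by memmap_init_zone_device(). A hedged sketch of the sequencing a caller might use; the actual call site in this series lives in the devm_memremap_pages() path and differs in detail:

        /* Illustrative sequencing only, not the literal call site: */
        mem_hotplug_begin();
        move_pfn_range_to_zone(zone, start_pfn, nr_pages, altmap);     /* ends up in memmap_init_zone() */
        mem_hotplug_done();

        /* The expensive per-page loop now runs without the hotplug lock: */
        memmap_init_zone_device(zone, start_pfn, nr_pages, pgmap);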
 static void __meminit zone_init_free_lists(struct zone *zone)
 {
        unsigned int order, t;
@@ -5551,10 +5630,11 @@ static void __meminit zone_init_free_lists(struct zone *zone)
        }
 }
 
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
-       memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL)
-#endif
+void __meminit __weak memmap_init(unsigned long size, int nid,
+                                 unsigned long zone, unsigned long start_pfn)
+{
+       memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+}
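Replacing the __HAVE_ARCH_MEMMAP_INIT #define with a __weak symbol means an architecture now overrides memmap_init() simply by providing a strong definition with the same signature. A hypothetical override could look like this (the range trimming is a placeholder, not real arch code):

        /* Hypothetical arch override: the strong symbol replaces the __weak default. */
        void __meminit memmap_init(unsigned long size, int nid,
                                   unsigned long zone, unsigned long start_pfn)
        {
                /* e.g. trim the range to what this arch actually backs with memmap */
                memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
        }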
 
 static int zone_batchsize(struct zone *zone)
 {
@@ -6128,7 +6208,7 @@ static void __ref setup_usemap(struct pglist_data *pgdat,
        zone->pageblock_flags = NULL;
        if (usemapsize)
                zone->pageblock_flags =
-                       memblock_virt_alloc_node_nopanic(usemapsize,
+                       memblock_alloc_node_nopanic(usemapsize,
                                                         pgdat->node_id);
 }
 #else
@@ -6358,7 +6438,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
                end = pgdat_end_pfn(pgdat);
                end = ALIGN(end, MAX_ORDER_NR_PAGES);
                size =  (end - start) * sizeof(struct page);
-               map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
+               map = memblock_alloc_node_nopanic(size, pgdat->node_id);
                pgdat->node_mem_map = map + offset;
        }
        pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
@@ -6427,48 +6507,67 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
        free_area_init_core(pgdat);
 }
 
-#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
+#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
+/*
+ * Zero all valid struct pages in range [spfn, epfn), return number of struct
+ * pages zeroed
+ */
+static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+{
+       unsigned long pfn;
+       u64 pgcnt = 0;
+
+       for (pfn = spfn; pfn < epfn; pfn++) {
+               if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+                       pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+                               + pageblock_nr_pages - 1;
+                       continue;
+               }
+               mm_zero_struct_page(pfn_to_page(pfn));
+               pgcnt++;
+       }
+
+       return pgcnt;
+}
+
 /*
  * Only struct pages that are backed by physical memory are zeroed and
  * initialized by going through __init_single_page(). But, there are some
  * struct pages which are reserved in memblock allocator and their fields
  * may be accessed (for example page_to_pfn() on some configuration accesses
  * flags). We must explicitly zero those struct pages.
+ *
+ * This function also addresses a similar issue where struct pages are left
+ * uninitialized because the physical address range is not covered by
+ * memblock.memory or memblock.reserved. That could happen when memblock
+ * layout is manually configured via memmap=.
  */
 void __init zero_resv_unavail(void)
 {
        phys_addr_t start, end;
-       unsigned long pfn;
        u64 i, pgcnt;
+       phys_addr_t next = 0;
 
        /*
-        * Loop through ranges that are reserved, but do not have reported
-        * physical memory backing.
+        * Loop through unavailable ranges not covered by memblock.memory.
         */
        pgcnt = 0;
-       for_each_resv_unavail_range(i, &start, &end) {
-               for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
-                       if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
-                               pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
-                                       + pageblock_nr_pages - 1;
-                               continue;
-                       }
-                       mm_zero_struct_page(pfn_to_page(pfn));
-                       pgcnt++;
-               }
+       for_each_mem_range(i, &memblock.memory, NULL,
+                       NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
+               if (next < start)
+                       pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+               next = end;
        }
+       pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
 
        /*
         * Struct pages that do not have backing memory. This could be because
         * firmware is using some of this memory, or for some other reasons.
-        * Once memblock is changed so such behaviour is not allowed: i.e.
-        * list of "reserved" memory must be a subset of list of "memory", then
-        * this code can be removed.
         */
        if (pgcnt)
-               pr_info("Reserved but unavailable: %lld pages", pgcnt);
+               pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
 }
-#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
+#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
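To make the rewritten walk concrete, here is a made-up layout (4K pages assumed) showing which holes the next cursor hands to zero_pfn_range():

        /*
         * memblock.memory = { [0x00001000, 0x00090000), [0x000a0000, 0x00100000) }
         * max_pfn         = 0x200
         *
         * Inside the loop:
         *   zero_pfn_range(PFN_DOWN(0x0),        PFN_UP(0x00001000));   pfns [0x000, 0x001)
         *   zero_pfn_range(PFN_DOWN(0x00090000), PFN_UP(0x000a0000));   pfns [0x090, 0x0a0)
         * After the loop, everything above the last range:
         *   zero_pfn_range(PFN_DOWN(0x00100000), max_pfn);              pfns [0x100, 0x200)
         */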
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 
@@ -6803,15 +6902,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
 {
        enum zone_type zone_type;
 
-       if (N_MEMORY == N_NORMAL_MEMORY)
-               return;
-
        for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
                struct zone *zone = &pgdat->node_zones[zone_type];
                if (populated_zone(zone)) {
-                       node_set_state(nid, N_HIGH_MEMORY);
-                       if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
-                           zone_type <= ZONE_NORMAL)
+                       if (IS_ENABLED(CONFIG_HIGHMEM))
+                               node_set_state(nid, N_HIGH_MEMORY);
+                       if (zone_type <= ZONE_NORMAL)
                                node_set_state(nid, N_NORMAL_MEMORY);
                        break;
                }
@@ -7614,9 +7710,11 @@ void *__init alloc_large_system_hash(const char *tablename,
                size = bucketsize << log2qty;
                if (flags & HASH_EARLY) {
                        if (flags & HASH_ZERO)
-                               table = memblock_virt_alloc_nopanic(size, 0);
+                               table = memblock_alloc_nopanic(size,
+                                                              SMP_CACHE_BYTES);
                        else
-                               table = memblock_virt_alloc_raw(size, 0);
+                               table = memblock_alloc_raw(size,
+                                                          SMP_CACHE_BYTES);
                } else if (hashdist) {
                        table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
                } else {