Merge branch 'stable-4.6' of git://git.infradead.org/users/pcmoore/audit

[linux-2.6-block.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index c46b75d14b6f8732b513065f1bd79d124d703579..a762be57e46e14efa571b967eb75696c35ebd034 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
  
  int min_free_kbytes = 1024;
  int user_min_free_kbytes = -1;
+int watermark_scale_factor = 10;
  
  static unsigned long __meminitdata nr_kernel_pages;
  static unsigned long __meminitdata nr_all_pages;
@@ -307,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat,
                                 unsigned long pfn, unsigned long zone_end,
                                 unsigned long *nr_initialised)
  {
+       unsigned long max_initialise;
+
         /* Always populate low zones for address-contrained allocations */
         if (zone_end < pgdat_end_pfn(pgdat))
                 return true;
+       /*
+        * Initialise at least 2G of a node but also take into account that
+        * two large system hashes that can take up 1GB for 0.25TB/node.
+        */
+       max_initialise = max(2UL << (30 - PAGE_SHIFT),
+               (pgdat->node_spanned_pages >> 8));
  
-       /* Initialise at least 2G of the highest zone */
         (*nr_initialised)++;
-       if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+       if ((*nr_initialised > max_initialise) &&
             (pfn & (PAGES_PER_SECTION - 1)) == 0) {
                 pgdat->first_deferred_pfn = pfn;
                 return false;
@@ -498,6 +506,7 @@ void prep_compound_page(struct page *page, unsigned int order)
  unsigned int _debug_guardpage_minorder;
  bool _debug_pagealloc_enabled __read_mostly
                         = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
  bool _debug_guardpage_enabled __read_mostly;
  
  static int __init early_debug_pagealloc(char *buf)
@@ -542,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf)
         unsigned long res;
  
         if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
-               printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+               pr_err("Bad debug_guardpage_minorder value\n");
                 return 0;
         }
         _debug_guardpage_minorder = res;
-       printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+       pr_info("Setting debug_guardpage_minorder to %lu\n", res);
         return 0;
  }
  __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
@@ -764,7 +773,7 @@ static inline int free_pages_check(struct page *page)
                 bad_reason = "nonzero mapcount";
         if (unlikely(page->mapping != NULL))
                 bad_reason = "non-NULL mapping";
-       if (unlikely(atomic_read(&page->_count) != 0))
+       if (unlikely(page_ref_count(page) != 0))
                 bad_reason = "nonzero _count";
         if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
                 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
@@ -1460,7 +1469,7 @@ static inline int check_new_page(struct page *page)
                 bad_reason = "nonzero mapcount";
         if (unlikely(page->mapping != NULL))
                 bad_reason = "non-NULL mapping";
-       if (unlikely(atomic_read(&page->_count) != 0))
+       if (unlikely(page_ref_count(page) != 0))
                 bad_reason = "nonzero _count";
         if (unlikely(page->flags & __PG_HWPOISON)) {
                 bad_reason = "HWPoisoned (hardware-corrupted)";
@@ -2348,19 +2357,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                 list_del(&page->lru);
                 pcp->count--;
         } else {
-               if (unlikely(gfp_flags & __GFP_NOFAIL)) {
-                       /*
-                        * __GFP_NOFAIL is not to be used in new code.
-                        *
-                        * All __GFP_NOFAIL callers should be fixed so that they
-                        * properly detect and handle allocation failures.
-                        *
-                        * We most definitely don't want callers attempting to
-                        * allocate greater than order-1 page units with
-                        * __GFP_NOFAIL.
-                        */
-                       WARN_ON_ONCE(order > 1);
-               }
+               /*
+                * We most definitely don't want callers attempting to
+                * allocate greater than order-1 page units with __GFP_NOFAIL.
+                */
+               WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
                 spin_lock_irqsave(&zone->lock, flags);
  
                 page = NULL;
@@ -2857,8 +2858,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                          * XXX: Page reclaim didn't yield anything,
                          * and the OOM killer can't be invoked, but
                          * keep looping as per tradition.
+                        *
+                        * But do not keep looping if oom_killer_disable()
+                        * was already called, for the system is trying to
+                        * enter a quiescent state during suspend.
                          */
-                       *did_some_progress = 1;
+                       *did_some_progress = !oom_killer_disabled;
                         goto out;
                 }
                 if (pm_suspended_storage())
@@ -3117,14 +3122,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                 gfp_mask &= ~__GFP_ATOMIC;
  
-       /*
-        * If this allocation cannot block and it is for a specific node, then
-        * fail early.  There's no need to wakeup kswapd or retry for a
-        * speculative node-specific allocation.
-        */
-       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
-               goto nopage;
-
  retry:
         if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                 wake_all_kswapds(order, ac);
@@ -3481,7 +3478,7 @@ refill:
                 /* Even if we own the page, we do not use atomic_set().
                  * This would break get_page_unless_zero() users.
                  */
-               atomic_add(size - 1, &page->_count);
+               page_ref_add(page, size - 1);
  
                 /* reset page count bias and offset to start of new frag */
                 nc->pfmemalloc = page_is_pfmemalloc(page);
@@ -3493,7 +3490,7 @@ refill:
         if (unlikely(offset < 0)) {
                 page = virt_to_page(nc->va);
  
-               if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+               if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
                         goto refill;
  
  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -3501,7 +3498,7 @@ refill:
                 size = nc->size;
  #endif
                 /* OK, page count is 0, we can safely set it */
-               atomic_set(&page->_count, size);
+               set_page_count(page, size);
  
                 /* reset page count bias and offset to start of new frag */
                 nc->pagecnt_bias = size;
@@ -3712,6 +3709,49 @@ static inline void show_node(struct zone *zone)
                 printk("Node %d ", zone_to_nid(zone));
  }
  
+long si_mem_available(void)
+{
+       long available;
+       unsigned long pagecache;
+       unsigned long wmark_low = 0;
+       unsigned long pages[NR_LRU_LISTS];
+       struct zone *zone;
+       int lru;
+
+       for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+               pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+       for_each_zone(zone)
+               wmark_low += zone->watermark[WMARK_LOW];
+
+       /*
+        * Estimate the amount of memory available for userspace allocations,
+        * without causing swapping.
+        */
+       available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+       /*
+        * Not all the page cache can be freed, otherwise the system will
+        * start swapping. Assume at least half of the page cache, or the
+        * low watermark worth of cache, needs to stay.
+        */
+       pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+       pagecache -= min(pagecache / 2, wmark_low);
+       available += pagecache;
+
+       /*
+        * Part of the reclaimable slab consists of items that are in use,
+        * and cannot be freed. Cap this estimate at the low watermark.
+        */
+       available += global_page_state(NR_SLAB_RECLAIMABLE) -
+                    min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+       if (available < 0)
+               available = 0;
+       return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
  void si_meminfo(struct sysinfo *val)
  {
         val->totalram = totalram_pages;
@@ -4044,9 +4084,7 @@ static int __parse_numa_zonelist_order(char *s)
         } else if (*s == 'z' || *s == 'Z') {
                 user_zonelist_order = ZONELIST_ORDER_ZONE;
         } else {
-               printk(KERN_WARNING
-                       "Ignoring invalid numa_zonelist_order value:  "
-                       "%s\n", s);
+               pr_warn("Ignoring invalid numa_zonelist_order value:  %s\n", s);
                 return -EINVAL;
         }
         return 0;
@@ -4510,12 +4548,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
         else
                 page_group_by_mobility_disabled = 0;
  
-       pr_info("Built %i zonelists in %s order, mobility grouping %s.  "
-               "Total pages: %ld\n",
-                       nr_online_nodes,
-                       zonelist_order_name[current_zonelist_order],
-                       page_group_by_mobility_disabled ? "off" : "on",
-                       vm_total_pages);
+       pr_info("Built %i zonelists in %s order, mobility grouping %s.  Total pages: %ld\n",
+               nr_online_nodes,
+               zonelist_order_name[current_zonelist_order],
+               page_group_by_mobility_disabled ? "off" : "on",
+               vm_total_pages);
  #ifdef CONFIG_NUMA
         pr_info("Policy zone: %s\n", zone_names[policy_zone]);
  #endif
@@ -5404,6 +5441,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
  #endif
         init_waitqueue_head(&pgdat->kswapd_wait);
         init_waitqueue_head(&pgdat->pfmemalloc_wait);
+#ifdef CONFIG_COMPACTION
+       init_waitqueue_head(&pgdat->kcompactd_wait);
+#endif
         pgdat_page_ext_init(pgdat);
  
         for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -5428,8 +5468,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
                                                "  %s zone: %lu pages used for memmap\n",
                                                zone_names[j], memmap_pages);
                         } else
-                               printk(KERN_WARNING
-                                       "  %s zone: %lu pages exceeds freesize %lu\n",
+                               pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
                                         zone_names[j], memmap_pages, freesize);
                 }
  
@@ -5637,8 +5676,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
                 min_pfn = min(min_pfn, start_pfn);
  
         if (min_pfn == ULONG_MAX) {
-               printk(KERN_WARNING
-                       "Could not find start_pfn for node %d\n", nid);
+               pr_warn("Could not find start_pfn for node %d\n", nid);
                 return 0;
         }
  
@@ -6110,22 +6148,21 @@ void __init mem_init_print_info(const char *str)
  
  #undef adj_init_size
  
-       pr_info("Memory: %luK/%luK available "
-              "(%luK kernel code, %luK rwdata, %luK rodata, "
-              "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
+       pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
  #ifdef CONFIG_HIGHMEM
-              ", %luK highmem"
+               ", %luK highmem"
  #endif
-              "%s%s)\n",
-              nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
-              codesize >> 10, datasize >> 10, rosize >> 10,
-              (init_data_size + init_code_size) >> 10, bss_size >> 10,
-              (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
-              totalcma_pages << (PAGE_SHIFT-10),
+               "%s%s)\n",
+               nr_free_pages() << (PAGE_SHIFT - 10),
+               physpages << (PAGE_SHIFT - 10),
+               codesize >> 10, datasize >> 10, rosize >> 10,
+               (init_data_size + init_code_size) >> 10, bss_size >> 10,
+               (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+               totalcma_pages << (PAGE_SHIFT - 10),
  #ifdef CONFIG_HIGHMEM
-              totalhigh_pages << (PAGE_SHIFT-10),
+               totalhigh_pages << (PAGE_SHIFT - 10),
  #endif
-              str ? ", " : "", str ? str : "");
+               str ? ", " : "", str ? str : "");
  }
  
  /**
@@ -6300,8 +6337,17 @@ static void __setup_per_zone_wmarks(void)
                         zone->watermark[WMARK_MIN] = tmp;
                 }
  
-               zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
-               zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+               /*
+                * Set the kswapd watermarks distance according to the
+                * scale factor in proportion to available memory, but
+                * ensure a minimum size on small systems.
+                */
+               tmp = max_t(u64, tmp >> 2,
+                           mult_frac(zone->managed_pages,
+                                     watermark_scale_factor, 10000));
+
+               zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
+               zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
  
                 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
                         high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -6442,6 +6488,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
         return 0;
  }
  
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+       void __user *buffer, size_t *length, loff_t *ppos)
+{
+       int rc;
+
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (rc)
+               return rc;
+
+       if (write)
+               setup_per_zone_wmarks();
+
+       return 0;
+}
+
  #ifdef CONFIG_NUMA
  int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
         void __user *buffer, size_t *length, loff_t *ppos)
@@ -6633,11 +6694,8 @@ void *__init alloc_large_system_hash(const char *tablename,
         if (!table)
                 panic("Failed to allocate %s hash table\n", tablename);
  
-       printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
-              tablename,
-              (1UL << log2qty),
-              ilog2(size) - PAGE_SHIFT,
-              size);
+       pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
+               tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
  
         if (_hash_shift)
                 *_hash_shift = log2qty;
@@ -6788,7 +6846,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
                  * This check already skips compound tails of THP
                  * because their page->_count is zero at all time.
                  */
-               if (!atomic_read(&page->_count)) {
+               if (!page_ref_count(page)) {
                         if (PageBuddy(page))
                                 iter += (1 << page_order(page)) - 1;
                         continue;
@@ -7138,8 +7196,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
                 BUG_ON(!PageBuddy(page));
                 order = page_order(page);
  #ifdef CONFIG_DEBUG_VM
-               printk(KERN_INFO "remove from free list %lx %d %lx\n",
-                      pfn, 1 << order, end_pfn);
+               pr_info("remove from free list %lx %d %lx\n",
+                       pfn, 1 << order, end_pfn);
  #endif
                 list_del(&page->lru);
                 rmv_page_order(page);
@@ -7152,7 +7210,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
  }
  #endif
  
-#ifdef CONFIG_MEMORY_FAILURE
  bool is_free_buddy_page(struct page *page)
  {
         struct zone *zone = page_zone(page);
@@ -7171,4 +7228,3 @@ bool is_free_buddy_page(struct page *page)
  
         return order < MAX_ORDER;
  }
-#endif