Merge tag 'nfsd-4.8' of git://linux-nfs.org/~bfields/linux

[linux-2.6-block.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 759cfa8cbbeb4f1899799b21b705b08c68a38153..39a372a2a1d628a58eb5f02d3a27b3e0989b37f9 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3085,7 +3085,6 @@ out:
         return page;
  }
  
-
  /*
   * Maximum number of compaction retries wit a progress before OOM
   * killer is consider as the only way to move forward.
@@ -3097,17 +3096,16 @@ out:
  static struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                 unsigned int alloc_flags, const struct alloc_context *ac,
-               enum migrate_mode mode, enum compact_result *compact_result)
+               enum compact_priority prio, enum compact_result *compact_result)
  {
         struct page *page;
-       int contended_compaction;
  
         if (!order)
                 return NULL;
  
         current->flags |= PF_MEMALLOC;
         *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
-                                               mode, &contended_compaction);
+                                                                       prio);
         current->flags &= ~PF_MEMALLOC;
  
         if (*compact_result <= COMPACT_INACTIVE)
@@ -3119,8 +3117,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
          */
         count_vm_event(COMPACTSTALL);
  
-       page = get_page_from_freelist(gfp_mask, order,
-                                       alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
  
         if (page) {
                 struct zone *zone = page_zone(page);
@@ -3137,24 +3134,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
          */
         count_vm_event(COMPACTFAIL);
  
-       /*
-        * In all zones where compaction was attempted (and not
-        * deferred or skipped), lock contention has been detected.
-        * For THP allocation we do not want to disrupt the others
-        * so we fallback to base pages instead.
-        */
-       if (contended_compaction == COMPACT_CONTENDED_LOCK)
-               *compact_result = COMPACT_CONTENDED;
-
-       /*
-        * If compaction was aborted due to need_resched(), we do not
-        * want to further increase allocation latency, unless it is
-        * khugepaged trying to collapse.
-        */
-       if (contended_compaction == COMPACT_CONTENDED_SCHED
-               && !(current->flags & PF_KTHREAD))
-               *compact_result = COMPACT_CONTENDED;
-
         cond_resched();
  
         return NULL;
@@ -3162,7 +3141,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  
  static inline bool
  should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
-                    enum compact_result compact_result, enum migrate_mode *migrate_mode,
+                    enum compact_result compact_result,
+                    enum compact_priority *compact_priority,
                      int compaction_retries)
  {
         int max_retries = MAX_COMPACT_RETRIES;
@@ -3173,11 +3153,11 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
         /*
          * compaction considers all the zone as desperately out of memory
          * so it doesn't really make much sense to retry except when the
-        * failure could be caused by weak migration mode.
+        * failure could be caused by insufficient priority
          */
         if (compaction_failed(compact_result)) {
-               if (*migrate_mode == MIGRATE_ASYNC) {
-                       *migrate_mode = MIGRATE_SYNC_LIGHT;
+               if (*compact_priority > MIN_COMPACT_PRIORITY) {
+                       (*compact_priority)--;
                         return true;
                 }
                 return false;
@@ -3211,7 +3191,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
  static inline struct page *
  __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                 unsigned int alloc_flags, const struct alloc_context *ac,
-               enum migrate_mode mode, enum compact_result *compact_result)
+               enum compact_priority prio, enum compact_result *compact_result)
  {
         *compact_result = COMPACT_SKIPPED;
         return NULL;
@@ -3220,7 +3200,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
  static inline bool
  should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
                      enum compact_result compact_result,
-                    enum migrate_mode *migrate_mode,
+                    enum compact_priority *compact_priority,
                      int compaction_retries)
  {
         struct zone *zone;
@@ -3288,8 +3268,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
                 return NULL;
  
  retry:
-       page = get_page_from_freelist(gfp_mask, order,
-                                       alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
  
         /*
          * If an allocation failed after direct reclaim, it could be because
@@ -3351,16 +3330,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
         } else if (unlikely(rt_task(current)) && !in_interrupt())
                 alloc_flags |= ALLOC_HARDER;
  
-       if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
-               if (gfp_mask & __GFP_MEMALLOC)
-                       alloc_flags |= ALLOC_NO_WATERMARKS;
-               else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
-                       alloc_flags |= ALLOC_NO_WATERMARKS;
-               else if (!in_interrupt() &&
-                               ((current->flags & PF_MEMALLOC) ||
-                                unlikely(test_thread_flag(TIF_MEMDIE))))
-                       alloc_flags |= ALLOC_NO_WATERMARKS;
-       }
  #ifdef CONFIG_CMA
         if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                 alloc_flags |= ALLOC_CMA;
@@ -3370,12 +3339,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
  
  bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
  {
-       return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
-}
+       if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+               return false;
  
-static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
-{
-       return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+       if (gfp_mask & __GFP_MEMALLOC)
+               return true;
+       if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+               return true;
+       if (!in_interrupt() &&
+                       ((current->flags & PF_MEMALLOC) ||
+                        unlikely(test_thread_flag(TIF_MEMDIE))))
+               return true;
+
+       return false;
  }
  
  /*
@@ -3402,7 +3378,6 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
  {
         struct zone *zone;
         struct zoneref *z;
-       pg_data_t *current_pgdat = NULL;
  
         /*
          * Make sure we converge to OOM if we cannot make any progress
@@ -3411,15 +3386,6 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
         if (no_progress_loops > MAX_RECLAIM_RETRIES)
                 return false;
  
-       /*
-        * Blindly retry lowmem allocation requests that are often ignored by
-        * the OOM killer up to MAX_RECLAIM_RETRIES as we not have a reliable
-        * and fast means of calculating reclaimable, dirty and writeback pages
-        * in eligible zones.
-        */
-       if (ac->high_zoneidx < ZONE_NORMAL)
-               goto out;
-
         /*
          * Keep reclaiming pages while there is a chance this will lead
          * somewhere.  If none of the target zones can satisfy our allocation
@@ -3430,38 +3396,18 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                                         ac->nodemask) {
                 unsigned long available;
                 unsigned long reclaimable;
-               int zid;
-
-               if (current_pgdat == zone->zone_pgdat)
-                       continue;
  
-               current_pgdat = zone->zone_pgdat;
-               available = reclaimable = pgdat_reclaimable_pages(current_pgdat);
+               available = reclaimable = zone_reclaimable_pages(zone);
                 available -= DIV_ROUND_UP(no_progress_loops * available,
                                           MAX_RECLAIM_RETRIES);
-
-               /* Account for all free pages on eligible zones */
-               for (zid = 0; zid <= zone_idx(zone); zid++) {
-                       struct zone *acct_zone = &current_pgdat->node_zones[zid];
-
-                       available += zone_page_state_snapshot(acct_zone, NR_FREE_PAGES);
-               }
+               available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
  
                 /*
                  * Would the allocation succeed if we reclaimed the whole
-                * available? This is approximate because there is no
-                * accurate count of reclaimable pages per zone.
+                * available?
                  */
-               for (zid = 0; zid <= zone_idx(zone); zid++) {
-                       struct zone *check_zone = &current_pgdat->node_zones[zid];
-                       unsigned long estimate;
-
-                       estimate = min(check_zone->managed_pages, available);
-                       if (!__zone_watermark_ok(check_zone, order,
-                                       min_wmark_pages(check_zone), ac_classzone_idx(ac),
-                                       alloc_flags, estimate))
-                               continue;
-
+               if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+                               ac_classzone_idx(ac), alloc_flags, available)) {
                         /*
                          * If we didn't make any progress and have a lot of
                          * dirty + writeback pages then we should wait for
@@ -3471,16 +3417,15 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
                         if (!did_some_progress) {
                                 unsigned long write_pending;
  
-                               write_pending =
-                                       node_page_state(current_pgdat, NR_WRITEBACK) +
-                                       node_page_state(current_pgdat, NR_FILE_DIRTY);
+                               write_pending = zone_page_state_snapshot(zone,
+                                                       NR_ZONE_WRITE_PENDING);
  
                                 if (2 * write_pending > reclaimable) {
                                         congestion_wait(BLK_RW_ASYNC, HZ/10);
                                         return true;
                                 }
                         }
-out:
+
                         /*
                          * Memory allocation/reclaim might be called from a WQ
                          * context and the current implementation of the WQ
@@ -3510,7 +3455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         struct page *page = NULL;
         unsigned int alloc_flags;
         unsigned long did_some_progress;
-       enum migrate_mode migration_mode = MIGRATE_ASYNC;
+       enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
         enum compact_result compact_result;
         int compaction_retries = 0;
         int no_progress_loops = 0;
@@ -3534,42 +3479,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                 (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
                 gfp_mask &= ~__GFP_ATOMIC;
  
-retry:
+       /*
+        * The fast path uses conservative alloc_flags to succeed only until
+        * kswapd needs to be woken up, and to avoid the cost of setting up
+        * alloc_flags precisely. So we do that now.
+        */
+       alloc_flags = gfp_to_alloc_flags(gfp_mask);
+
         if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                 wake_all_kswapds(order, ac);
  
         /*
-        * OK, we're below the kswapd watermark and have kicked background
-        * reclaim. Now things get more complex, so set up alloc_flags according
-        * to how we want to proceed.
+        * The adjusted alloc_flags might result in immediate success, so try
+        * that first
          */
-       alloc_flags = gfp_to_alloc_flags(gfp_mask);
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+       if (page)
+               goto got_pg;
+
+       /*
+        * For costly allocations, try direct compaction first, as it's likely
+        * that we have enough base pages and don't need to reclaim. Don't try
+        * that for allocations that are allowed to ignore watermarks, as the
+        * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+        */
+       if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
+               !gfp_pfmemalloc_allowed(gfp_mask)) {
+               page = __alloc_pages_direct_compact(gfp_mask, order,
+                                               alloc_flags, ac,
+                                               INIT_COMPACT_PRIORITY,
+                                               &compact_result);
+               if (page)
+                       goto got_pg;
+
+               /*
+                * Checks for costly allocations with __GFP_NORETRY, which
+                * includes THP page fault allocations
+                */
+               if (gfp_mask & __GFP_NORETRY) {
+                       /*
+                        * If compaction is deferred for high-order allocations,
+                        * it is because sync compaction recently failed. If
+                        * this is the case and the caller requested a THP
+                        * allocation, we do not want to heavily disrupt the
+                        * system, so we fail the allocation instead of entering
+                        * direct reclaim.
+                        */
+                       if (compact_result == COMPACT_DEFERRED)
+                               goto nopage;
+
+                       /*
+                        * Looks like reclaim/compaction is worth trying, but
+                        * sync compaction could be very expensive, so keep
+                        * using async compaction.
+                        */
+                       compact_priority = INIT_COMPACT_PRIORITY;
+               }
+       }
+
+retry:
+       /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
+       if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+               wake_all_kswapds(order, ac);
+
+       if (gfp_pfmemalloc_allowed(gfp_mask))
+               alloc_flags = ALLOC_NO_WATERMARKS;
  
         /*
          * Reset the zonelist iterators if memory policies can be ignored.
          * These allocations are high priority and system rather than user
          * orientated.
          */
-       if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) {
+       if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
                 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
                 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                                         ac->high_zoneidx, ac->nodemask);
         }
  
-       /* This is the last chance, in general, before the goto nopage. */
-       page = get_page_from_freelist(gfp_mask, order,
-                               alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+       /* Attempt with potentially adjusted zonelist and alloc_flags */
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
         if (page)
                 goto got_pg;
  
-       /* Allocate without watermarks if the context allows */
-       if (alloc_flags & ALLOC_NO_WATERMARKS) {
-               page = get_page_from_freelist(gfp_mask, order,
-                                               ALLOC_NO_WATERMARKS, ac);
-               if (page)
-                       goto got_pg;
-       }
-
         /* Caller is not willing to reclaim, we can't balance anything */
         if (!can_direct_reclaim) {
                 /*
@@ -3599,38 +3590,6 @@ retry:
         if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
                 goto nopage;
  
-       /*
-        * Try direct compaction. The first pass is asynchronous. Subsequent
-        * attempts after direct reclaim are synchronous
-        */
-       page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
-                                       migration_mode,
-                                       &compact_result);
-       if (page)
-               goto got_pg;
-
-       /* Checks for THP-specific high-order allocations */
-       if (is_thp_gfp_mask(gfp_mask)) {
-               /*
-                * If compaction is deferred for high-order allocations, it is
-                * because sync compaction recently failed. If this is the case
-                * and the caller requested a THP allocation, we do not want
-                * to heavily disrupt the system, so we fail the allocation
-                * instead of entering direct reclaim.
-                */
-               if (compact_result == COMPACT_DEFERRED)
-                       goto nopage;
-
-               /*
-                * Compaction is contended so rather back off than cause
-                * excessive stalls.
-                */
-               if(compact_result == COMPACT_CONTENDED)
-                       goto nopage;
-       }
-
-       if (order && compaction_made_progress(compact_result))
-               compaction_retries++;
  
         /* Try direct reclaim and then allocating */
         page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3638,16 +3597,25 @@ retry:
         if (page)
                 goto got_pg;
  
+       /* Try direct compaction and then allocating */
+       page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
+                                       compact_priority, &compact_result);
+       if (page)
+               goto got_pg;
+
+       if (order && compaction_made_progress(compact_result))
+               compaction_retries++;
+
         /* Do not loop if specifically requested */
         if (gfp_mask & __GFP_NORETRY)
-               goto noretry;
+               goto nopage;
  
         /*
          * Do not retry costly high order allocations unless they are
          * __GFP_REPEAT
          */
         if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
-               goto noretry;
+               goto nopage;
  
         /*
          * Costly allocations might have made a progress but this doesn't mean
@@ -3671,7 +3639,7 @@ retry:
          */
         if (did_some_progress > 0 &&
                         should_compact_retry(ac, order, alloc_flags,
-                               compact_result, &migration_mode,
+                               compact_result, &compact_priority,
                                 compaction_retries))
                 goto retry;
  
@@ -3686,25 +3654,6 @@ retry:
                 goto retry;
         }
  
-noretry:
-       /*
-        * High-order allocations do not necessarily loop after direct reclaim
-        * and reclaim/compaction depends on compaction being called after
-        * reclaim so call directly if necessary.
-        * It can become very expensive to allocate transparent hugepages at
-        * fault, so use asynchronous memory compaction for THP unless it is
-        * khugepaged trying to collapse. All other requests should tolerate
-        * at least light sync migration.
-        */
-       if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
-               migration_mode = MIGRATE_ASYNC;
-       else
-               migration_mode = MIGRATE_SYNC_LIGHT;
-       page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
-                                           ac, migration_mode,
-                                           &compact_result);
-       if (page)
-               goto got_pg;
  nopage:
         warn_alloc_failed(gfp_mask, order, NULL);
  got_pg:
@@ -4361,6 +4310,7 @@ void show_free_areas(unsigned int filter)
                         " active_file:%lukB"
                         " inactive_file:%lukB"
                         " unevictable:%lukB"
+                       " writepending:%lukB"
                         " present:%lukB"
                         " managed:%lukB"
                         " mlocked:%lukB"
@@ -4383,13 +4333,13 @@ void show_free_areas(unsigned int filter)
                         K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
                         K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
                         K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+                       K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
                         K(zone->present_pages),
                         K(zone->managed_pages),
                         K(zone_page_state(zone, NR_MLOCK)),
                         K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
                         K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
-                       zone_page_state(zone, NR_KERNEL_STACK) *
-                               THREAD_SIZE / 1024,
+                       zone_page_state(zone, NR_KERNEL_STACK_KB),
                         K(zone_page_state(zone, NR_PAGETABLE)),
                         K(zone_page_state(zone, NR_BOUNCE)),
                         K(free_pcp),
@@ -5326,7 +5276,7 @@ void __init setup_per_cpu_pageset(void)
                 setup_zone_pageset(zone);
  }
  
-static noinline __init_refok
+static noinline __ref
  int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
  {
         int i;
@@ -5953,7 +5903,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
         }
  }
  
-static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
+static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
  {
         unsigned long __maybe_unused start = 0;
         unsigned long __maybe_unused offset = 0;