Merge tag 'nfsd-4.8' of git://linux-nfs.org/~bfields/linux

[linux-2.6-block.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 9f6e673efba7a95d8313037d7fcba9e6a6afe349..374d95d0417856b096d902d40ff7cc29d4021b2e 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -203,9 +203,11 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
  {
         unsigned long nr;
  
-       nr = zone_page_state_snapshot(zone, NR_ZONE_LRU_FILE);
+       nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
+               zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
         if (get_nr_swap_pages() > 0)
-               nr += zone_page_state_snapshot(zone, NR_ZONE_LRU_ANON);
+               nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+                       zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
  
         return nr;
  }
@@ -612,7 +614,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                         ClearPageReclaim(page);
                 }
                 trace_mm_vmscan_writepage(page);
-               inc_zone_page_state(page, NR_VMSCAN_WRITE);
+               inc_node_page_state(page, NR_VMSCAN_WRITE);
                 return PAGE_SUCCESS;
         }
  
@@ -1117,7 +1119,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                  * except we already have the page isolated
                                  * and know it's dirty
                                  */
-                               inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+                               inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
                                 SetPageReclaim(page);
  
                                 goto keep_locked;
@@ -1366,6 +1368,29 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
         return ret;
  }
  
+
+/*
+ * Update LRU sizes after isolating pages. The LRU size updates must
+ * be complete before mem_cgroup_update_lru_size due to a sanity check.
+ */
+static __always_inline void update_lru_sizes(struct lruvec *lruvec,
+                       enum lru_list lru, unsigned long *nr_zone_taken,
+                       unsigned long nr_taken)
+{
+       int zid;
+
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+               if (!nr_zone_taken[zid])
+                       continue;
+
+               __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
+       }
+
+#ifdef CONFIG_MEMCG
+       mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
+#endif
+}
+
  /*
   * zone_lru_lock is heavily contended.  Some of the functions that
   * shrink the lists perform better by taking out a batch of pages
@@ -1394,11 +1419,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
         struct list_head *src = &lruvec->lists[lru];
         unsigned long nr_taken = 0;
         unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+       unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
         unsigned long scan, nr_pages;
         LIST_HEAD(pages_skipped);
  
         for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
-                                       !list_empty(src); scan++) {
+                                       !list_empty(src);) {
                 struct page *page;
  
                 page = lru_to_page(src);
@@ -1408,9 +1434,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
  
                 if (page_zonenum(page) > sc->reclaim_idx) {
                         list_move(&page->lru, &pages_skipped);
+                       nr_skipped[page_zonenum(page)]++;
                         continue;
                 }
  
+               /*
+                * Account for scanned and skipped separately to avoid the pgdat
+                * being prematurely marked unreclaimable by pgdat_reclaimable.
+                */
+               scan++;
+
                 switch (__isolate_lru_page(page, mode)) {
                 case 0:
                         nr_pages = hpage_nr_pages(page);
@@ -1436,18 +1469,31 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
          * scanning would soon rescan the same pages to skip and put the
          * system at risk of premature OOM.
          */
-       if (!list_empty(&pages_skipped))
+       if (!list_empty(&pages_skipped)) {
+               int zid;
+               unsigned long total_skipped = 0;
+
+               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+                       if (!nr_skipped[zid])
+                               continue;
+
+                       __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
+                       total_skipped += nr_skipped[zid];
+               }
+
+               /*
+                * Account skipped pages as a partial scan as the pgdat may be
+                * close to unreclaimable. If the LRU list is empty, account
+                * skipped pages as a full scan.
+                */
+               scan += list_empty(src) ? total_skipped : total_skipped >> 2;
+
                 list_splice(&pages_skipped, src);
+       }
         *nr_scanned = scan;
-       trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
+       trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
                                     nr_taken, mode, is_file_lru(lru));
-       for (scan = 0; scan < MAX_NR_ZONES; scan++) {
-               nr_pages = nr_zone_taken[scan];
-               if (!nr_pages)
-                       continue;
-
-               update_lru_size(lruvec, lru, scan, -nr_pages);
-       }
+       update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken);
         return nr_taken;
  }
  
@@ -1606,6 +1652,30 @@ static int current_may_throttle(void)
                 bdi_write_congested(current->backing_dev_info);
  }
  
+static bool inactive_reclaimable_pages(struct lruvec *lruvec,
+                               struct scan_control *sc, enum lru_list lru)
+{
+       int zid;
+       struct zone *zone;
+       int file = is_file_lru(lru);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+       if (!global_reclaim(sc))
+               return true;
+
+       for (zid = sc->reclaim_idx; zid >= 0; zid--) {
+               zone = &pgdat->node_zones[zid];
+               if (!populated_zone(zone))
+                       continue;
+
+               if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
+                               LRU_FILE * file) >= SWAP_CLUSTER_MAX)
+                       return true;
+       }
+
+       return false;
+}
+
  /*
   * shrink_inactive_list() is a helper for shrink_node().  It returns the number
   * of reclaimed pages
@@ -1628,6 +1698,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         struct pglist_data *pgdat = lruvec_pgdat(lruvec);
         struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
  
+       if (!inactive_reclaimable_pages(lruvec, sc, lru))
+               return 0;
+
         while (unlikely(too_many_isolated(pgdat, file, sc))) {
                 congestion_wait(BLK_RW_ASYNC, HZ/10);
  
@@ -1934,12 +2007,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
   *    1TB     101        10GB
   *   10TB     320        32GB
   */
-static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+                                               struct scan_control *sc)
  {
         unsigned long inactive_ratio;
         unsigned long inactive;
         unsigned long active;
         unsigned long gb;
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       int zid;
  
         /*
          * If we don't have swap space, anonymous page deactivation
@@ -1951,6 +2027,27 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
         inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
         active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
  
+       /*
+        * For zone-constrained allocations, it is necessary to check if
+        * deactivations are required for lowmem to be reclaimed. This
+        * calculates the inactive/active pages available in eligible zones.
+        */
+       for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
+               struct zone *zone = &pgdat->node_zones[zid];
+               unsigned long inactive_zone, active_zone;
+
+               if (!populated_zone(zone))
+                       continue;
+
+               inactive_zone = zone_page_state(zone,
+                               NR_ZONE_LRU_BASE + (file * LRU_FILE));
+               active_zone = zone_page_state(zone,
+                               NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
+
+               inactive -= min(inactive, inactive_zone);
+               active -= min(active, active_zone);
+       }
+
         gb = (inactive + active) >> (30 - PAGE_SHIFT);
         if (gb)
                 inactive_ratio = int_sqrt(10 * gb);
@@ -1964,7 +2061,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                                  struct lruvec *lruvec, struct scan_control *sc)
  {
         if (is_active_lru(lru)) {
-               if (inactive_list_is_low(lruvec, is_file_lru(lru)))
+               if (inactive_list_is_low(lruvec, is_file_lru(lru), sc))
                         shrink_active_list(nr_to_scan, lruvec, sc, lru);
                 return 0;
         }
@@ -2095,7 +2192,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
          * lruvec even if it has plenty of old anonymous pages unless the
          * system is under heavy pressure.
          */
-       if (!inactive_list_is_low(lruvec, true) &&
+       if (!inactive_list_is_low(lruvec, true, sc) &&
             lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
                 scan_balance = SCAN_FILE;
                 goto out;
@@ -2337,7 +2434,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
          * Even if we did not try to evict anon pages at all, we want to
          * rebalance the anon lru active/inactive ratio.
          */
-       if (inactive_list_is_low(lruvec, false))
+       if (inactive_list_is_low(lruvec, false, sc))
                 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                    sc, LRU_ACTIVE_ANON);
  
@@ -2428,8 +2525,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
         return true;
  }
  
-static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
-                       enum zone_type classzone_idx)
+static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
  {
         struct reclaim_state *reclaim_state = current->reclaim_state;
         unsigned long nr_reclaimed, nr_scanned;
@@ -2465,7 +2561,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
                         shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
                         node_lru_pages += lru_pages;
  
-                       if (!global_reclaim(sc))
+                       if (memcg)
                                 shrink_slab(sc->gfp_mask, pgdat->node_id,
                                             memcg, sc->nr_scanned - scanned,
                                             lru_pages);
@@ -2524,7 +2620,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
   * Returns true if compaction should go ahead for a high-order request, or
   * the high-order allocation would succeed without compaction.
   */
-static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx)
+static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
  {
         unsigned long watermark;
         bool watermark_ok;
@@ -2535,21 +2631,21 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
          * there is a buffer of free pages available to give compaction
          * a reasonable chance of completing and allocating the page
          */
-       watermark = high_wmark_pages(zone) + (2UL << order);
-       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx);
+       watermark = high_wmark_pages(zone) + (2UL << sc->order);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
  
         /*
          * If compaction is deferred, reclaim up to a point where
          * compaction will have a chance of success when re-enabled
          */
-       if (compaction_deferred(zone, order))
+       if (compaction_deferred(zone, sc->order))
                 return watermark_ok;
  
         /*
          * If compaction is not ready to start and allocation is not likely
          * to succeed without it, then keep reclaiming.
          */
-       if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED)
+       if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED)
                 return false;
  
         return watermark_ok;
@@ -2570,7 +2666,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
         unsigned long nr_soft_reclaimed;
         unsigned long nr_soft_scanned;
         gfp_t orig_mask;
-       enum zone_type classzone_idx;
         pg_data_t *last_pgdat = NULL;
  
         /*
@@ -2581,25 +2676,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
         orig_mask = sc->gfp_mask;
         if (buffer_heads_over_limit) {
                 sc->gfp_mask |= __GFP_HIGHMEM;
-               sc->reclaim_idx = classzone_idx = gfp_zone(sc->gfp_mask);
+               sc->reclaim_idx = gfp_zone(sc->gfp_mask);
         }
  
         for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                         sc->reclaim_idx, sc->nodemask) {
-               if (!populated_zone(zone))
-                       continue;
-
-               /*
-                * Note that reclaim_idx does not change as it is the highest
-                * zone reclaimed from which for empty zones is a no-op but
-                * classzone_idx is used by shrink_node to test if the slabs
-                * should be shrunk on a given node.
-                */
-               classzone_idx = sc->reclaim_idx;
-               while (!populated_zone(zone->zone_pgdat->node_zones +
-                                                       classzone_idx))
-                       classzone_idx--;
-
                 /*
                  * Take care memory controller reclaiming has small influence
                  * to global LRU.
@@ -2624,8 +2705,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                          */
                         if (IS_ENABLED(CONFIG_COMPACTION) &&
                             sc->order > PAGE_ALLOC_COSTLY_ORDER &&
-                           zonelist_zone_idx(z) <= classzone_idx &&
-                           compaction_ready(zone, sc->order, classzone_idx)) {
+                           compaction_ready(zone, sc)) {
                                 sc->compaction_ready = true;
                                 continue;
                         }
@@ -2658,7 +2738,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                 if (zone->zone_pgdat == last_pgdat)
                         continue;
                 last_pgdat = zone->zone_pgdat;
-               shrink_node(zone->zone_pgdat, sc, classzone_idx);
+               shrink_node(zone->zone_pgdat, sc);
         }
  
         /*
@@ -2694,7 +2774,7 @@ retry:
         delayacct_freepages_start();
  
         if (global_reclaim(sc))
-               count_vm_event(ALLOCSTALL);
+               __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
  
         do {
                 vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
@@ -2903,7 +2983,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
  
         trace_mm_vmscan_direct_reclaim_begin(order,
                                 sc.may_writepage,
-                               gfp_mask);
+                               gfp_mask,
+                               sc.reclaim_idx);
  
         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
  
@@ -2934,7 +3015,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
  
         trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
                                                       sc.may_writepage,
-                                                     sc.gfp_mask);
+                                                     sc.gfp_mask,
+                                                     sc.reclaim_idx);
  
         /*
          * NOTE: Although we can get the priority field, using it
@@ -2982,7 +3064,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
  
         trace_mm_vmscan_memcg_reclaim_begin(0,
                                             sc.may_writepage,
-                                           sc.gfp_mask);
+                                           sc.gfp_mask,
+                                           sc.reclaim_idx);
  
         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
  
@@ -3004,7 +3087,7 @@ static void age_active_anon(struct pglist_data *pgdat,
         do {
                 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
  
-               if (inactive_list_is_low(lruvec, false))
+               if (inactive_list_is_low(lruvec, false, sc))
                         shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                            sc, LRU_ACTIVE_ANON);
  
@@ -3035,15 +3118,10 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
   *
   * Returns true if kswapd is ready to sleep
   */
-static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
-                                       int classzone_idx)
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  {
         int i;
  
-       /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
-       if (remaining)
-               return false;
-
         /*
          * The throttled processes are normally woken up in balance_pgdat() as
          * soon as pfmemalloc_watermark_ok() is true. But there is a potential
@@ -3082,7 +3160,6 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
   * This is used to determine if the scanning priority needs to be raised.
   */
  static bool kswapd_shrink_node(pg_data_t *pgdat,
-                              int classzone_idx,
                                struct scan_control *sc)
  {
         struct zone *zone;
@@ -3090,7 +3167,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
  
         /* Reclaim a number of pages proportional to the number of zones */
         sc->nr_to_reclaim = 0;
-       for (z = 0; z <= classzone_idx; z++) {
+       for (z = 0; z <= sc->reclaim_idx; z++) {
                 zone = pgdat->node_zones + z;
                 if (!populated_zone(zone))
                         continue;
@@ -3102,7 +3179,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
          * Historically care was taken to put equal pressure on all zones but
          * now pressure is applied based on node LRU order.
          */
-       shrink_node(pgdat, sc, classzone_idx);
+       shrink_node(pgdat, sc);
  
         /*
          * Fragmentation may mean that the system cannot be rebalanced for
@@ -3143,7 +3220,6 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 .may_writepage = !laptop_mode,
                 .may_unmap = 1,
                 .may_swap = 1,
-               .reclaim_idx = classzone_idx,
         };
         count_vm_event(PAGEOUTRUN);
  
@@ -3151,12 +3227,17 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 bool raise_priority = true;
  
                 sc.nr_reclaimed = 0;
+               sc.reclaim_idx = classzone_idx;
  
                 /*
-                * If the number of buffer_heads in the machine exceeds the
-                * maximum allowed level then reclaim from all zones. This is
-                * not specific to highmem as highmem may not exist but it is
-                * it is expected that buffer_heads are stripped in writeback.
+                * If the number of buffer_heads exceeds the maximum allowed
+                * then consider reclaiming from all zones. This has a dual
+                * purpose -- on 64-bit systems it is expected that
+                * buffer_heads are stripped during active rotation. On 32-bit
+                * systems, highmem pages can pin lowmem memory and shrinking
+                * buffers can relieve lowmem pressure. Reclaim may still not
+                * go ahead if all eligible zones for the original allocation
+                * request are balanced to avoid excessive reclaim from kswapd.
                  */
                 if (buffer_heads_over_limit) {
                         for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
@@ -3164,7 +3245,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                                 if (!populated_zone(zone))
                                         continue;
  
-                               classzone_idx = i;
+                               sc.reclaim_idx = i;
                                 break;
                         }
                 }
@@ -3175,7 +3256,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * Scanning from low to high zone would allow congestion to be
                  * cleared during a very small window when a small low
                  * zone was balanced even under extreme pressure when the
-                * overall node may be congested.
+                * overall node may be congested. Note that sc.reclaim_idx
+                * is not used as buffer_heads_over_limit may have adjusted
+                * it.
                  */
                 for (i = classzone_idx; i >= 0; i--) {
                         zone = pgdat->node_zones + i;
@@ -3213,7 +3296,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * enough pages are already being scanned that that high
                  * watermark would be met at 100% efficiency.
                  */
-               if (kswapd_shrink_node(pgdat, classzone_idx, &sc))
+               if (kswapd_shrink_node(pgdat, &sc))
                         raise_priority = false;
  
                 /*
@@ -3259,7 +3342,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
         prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
  
         /* Try to sleep for a short interval */
-       if (prepare_kswapd_sleep(pgdat, reclaim_order, remaining, classzone_idx)) {
+       if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                 /*
                  * Compaction records what page blocks it recently failed to
                  * isolate pages from and skips them in the future scanning.
@@ -3294,7 +3377,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
          * After a short sleep, check if it was a premature sleep. If not, then
          * go fully to sleep until explicitly woken up.
          */
-       if (prepare_kswapd_sleep(pgdat, reclaim_order, remaining, classzone_idx)) {
+       if (!remaining &&
+           prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
  
                 /*
@@ -3399,7 +3483,8 @@ kswapd_try_sleep:
                  * but kcompactd is woken to compact for the original
                  * request (alloc_order).
                  */
-               trace_mm_vmscan_kswapd_wake(pgdat->node_id, alloc_order);
+               trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
+                                               alloc_order);
                 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
                 if (reclaim_order < alloc_order)
                         goto kswapd_try_sleep;
@@ -3421,6 +3506,7 @@ kswapd_try_sleep:
  void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
  {
         pg_data_t *pgdat;
+       int z;
  
         if (!populated_zone(zone))
                 return;
@@ -3432,8 +3518,16 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         pgdat->kswapd_order = max(pgdat->kswapd_order, order);
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
-       if (zone_balanced(zone, order, 0))
-               return;
+
+       /* Only wake kswapd if all zones are unbalanced */
+       for (z = 0; z <= classzone_idx; z++) {
+               zone = pgdat->node_zones + z;
+               if (!populated_zone(zone))
+                       continue;
+
+               if (zone_balanced(zone, order, classzone_idx))
+                       return;
+       }
  
         trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
         wake_up_interruptible(&pgdat->kswapd_wait);
@@ -3556,12 +3650,12 @@ module_init(kswapd_init)
  
  #ifdef CONFIG_NUMA
  /*
- * Zone reclaim mode
+ * Node reclaim mode
   *
- * If non-zero call zone_reclaim when the number of free pages falls below
+ * If non-zero call node_reclaim when the number of free pages falls below
   * the watermarks.
   */
-int zone_reclaim_mode __read_mostly;
+int node_reclaim_mode __read_mostly;
  
  #define RECLAIM_OFF 0
  #define RECLAIM_ZONE (1<<0)    /* Run shrink_inactive_list on the zone */
@@ -3569,14 +3663,14 @@ int zone_reclaim_mode __read_mostly;
  #define RECLAIM_UNMAP (1<<2)   /* Unmap pages during reclaim */
  
  /*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
   * of a node considered for each zone_reclaim. 4 scans 1/16th of
   * a zone.
   */
-#define ZONE_RECLAIM_PRIORITY 4
+#define NODE_RECLAIM_PRIORITY 4
  
  /*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
   * occur.
   */
  int sysctl_min_unmapped_ratio = 1;
@@ -3587,11 +3681,11 @@ int sysctl_min_unmapped_ratio = 1;
   */
  int sysctl_min_slab_ratio = 5;
  
-static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
  {
-       unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
-       unsigned long file_lru = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
-               node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE);
+       unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
+       unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
+               node_page_state(pgdat, NR_ACTIVE_FILE);
  
         /*
          * It's possible for there to be more file mapped pages than
@@ -3602,7 +3696,7 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
  }
  
  /* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static unsigned long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
  {
         unsigned long nr_pagecache_reclaimable;
         unsigned long delta = 0;
@@ -3610,17 +3704,17 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
         /*
          * If RECLAIM_UNMAP is set, then all file pages are considered
          * potentially reclaimable. Otherwise, we have to worry about
-        * pages like swapcache and zone_unmapped_file_pages() provides
+        * pages like swapcache and node_unmapped_file_pages() provides
          * a better estimate
          */
-       if (zone_reclaim_mode & RECLAIM_UNMAP)
-               nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+       if (node_reclaim_mode & RECLAIM_UNMAP)
+               nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
         else
-               nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+               nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
  
         /* If we can't clean pages, remove dirty pages from consideration */
-       if (!(zone_reclaim_mode & RECLAIM_WRITE))
-               delta += zone_page_state(zone, NR_FILE_DIRTY);
+       if (!(node_reclaim_mode & RECLAIM_WRITE))
+               delta += node_page_state(pgdat, NR_FILE_DIRTY);
  
         /* Watch for any possible underflows due to delta */
         if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3630,23 +3724,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone)
  }
  
  /*
- * Try to free up some pages from this zone through reclaim.
+ * Try to free up some pages from this node through reclaim.
   */
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
  {
         /* Minimum pages needed in order to stay on node */
         const unsigned long nr_pages = 1 << order;
         struct task_struct *p = current;
         struct reclaim_state reclaim_state;
+       int classzone_idx = gfp_zone(gfp_mask);
         struct scan_control sc = {
                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                 .order = order,
-               .priority = ZONE_RECLAIM_PRIORITY,
-               .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-               .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
+               .priority = NODE_RECLAIM_PRIORITY,
+               .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+               .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
                 .may_swap = 1,
-               .reclaim_idx = zone_idx(zone),
+               .reclaim_idx = classzone_idx,
         };
  
         cond_resched();
@@ -3660,13 +3755,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
  
-       if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+       if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
                 /*
                  * Free memory by calling shrink zone with increasing
                  * priorities until we have enough memory freed.
                  */
                 do {
-                       shrink_node(zone->zone_pgdat, &sc, zone_idx(zone));
+                       shrink_node(pgdat, &sc);
                 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
         }
  
@@ -3676,49 +3771,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         return sc.nr_reclaimed >= nr_pages;
  }
  
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
  {
-       int node_id;
         int ret;
  
         /*
-        * Zone reclaim reclaims unmapped file backed pages and
+        * Node reclaim reclaims unmapped file backed pages and
          * slab pages if we are over the defined limits.
          *
          * A small portion of unmapped file backed pages is needed for
          * file I/O otherwise pages read by file I/O will be immediately
-        * thrown out if the zone is overallocated. So we do not reclaim
-        * if less than a specified percentage of the zone is used by
+        * thrown out if the node is overallocated. So we do not reclaim
+        * if less than a specified percentage of the node is used by
          * unmapped file backed pages.
          */
-       if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
-           zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
-               return ZONE_RECLAIM_FULL;
+       if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+           sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+               return NODE_RECLAIM_FULL;
  
-       if (!pgdat_reclaimable(zone->zone_pgdat))
-               return ZONE_RECLAIM_FULL;
+       if (!pgdat_reclaimable(pgdat))
+               return NODE_RECLAIM_FULL;
  
         /*
          * Do not scan if the allocation should not be delayed.
          */
         if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
-               return ZONE_RECLAIM_NOSCAN;
+               return NODE_RECLAIM_NOSCAN;
  
         /*
-        * Only run zone reclaim on the local zone or on zones that do not
+        * Only run node reclaim on the local node or on nodes that do not
          * have associated processors. This will favor the local processor
          * over remote processors and spread off node memory allocations
          * as wide as possible.
          */
-       node_id = zone_to_nid(zone);
-       if (node_state(node_id, N_CPU) && node_id != numa_node_id())
-               return ZONE_RECLAIM_NOSCAN;
+       if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+               return NODE_RECLAIM_NOSCAN;
  
-       if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
-               return ZONE_RECLAIM_NOSCAN;
+       if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+               return NODE_RECLAIM_NOSCAN;
  
-       ret = __zone_reclaim(zone, gfp_mask, order);
-       clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+       ret = __node_reclaim(pgdat, gfp_mask, order);
+       clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
  
         if (!ret)
                 count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
@@ -3757,24 +3850,23 @@ int page_evictable(struct page *page)
  void check_move_unevictable_pages(struct page **pages, int nr_pages)
  {
         struct lruvec *lruvec;
-       struct zone *zone = NULL;
+       struct pglist_data *pgdat = NULL;
         int pgscanned = 0;
         int pgrescued = 0;
         int i;
  
         for (i = 0; i < nr_pages; i++) {
                 struct page *page = pages[i];
-               struct zone *pagezone;
+               struct pglist_data *pagepgdat = page_pgdat(page);
  
                 pgscanned++;
-               pagezone = page_zone(page);
-               if (pagezone != zone) {
-                       if (zone)
-                               spin_unlock_irq(zone_lru_lock(zone));
-                       zone = pagezone;
-                       spin_lock_irq(zone_lru_lock(zone));
+               if (pagepgdat != pgdat) {
+                       if (pgdat)
+                               spin_unlock_irq(&pgdat->lru_lock);
+                       pgdat = pagepgdat;
+                       spin_lock_irq(&pgdat->lru_lock);
                 }
-               lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+               lruvec = mem_cgroup_page_lruvec(page, pgdat);
  
                 if (!PageLRU(page) || !PageUnevictable(page))
                         continue;
@@ -3790,10 +3882,10 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
                 }
         }
  
-       if (zone) {
+       if (pgdat) {
                 __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
                 __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
-               spin_unlock_irq(zone_lru_lock(zone));
+               spin_unlock_irq(&pgdat->lru_lock);
         }
  }
  #endif /* CONFIG_SHMEM */