diff --git a/mm/vmscan.c b/mm/vmscan.c
index b3f5b359280d9a200e23fd26a1c9a2c0d396e3f1..374d95d0417856b096d902d40ff7cc29d4021b2e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -194,6 +194,24 @@ static bool sane_reclaim(struct scan_control *sc)
 }
 #endif
 
+/*
+ * This misses isolated pages, which are not accounted for to save counters.
+ * As the data only determines if reclaim or compaction continues, it is
+ * not expected that isolated pages will be a dominating factor.
+ */
+unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+       unsigned long nr;
+
+       nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
+               zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
+       if (get_nr_swap_pages() > 0)
+               nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+                       zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
+
+       return nr;
+}
+
 unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
 {
        unsigned long nr;
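
The new zone_reclaimable_pages() is the zone-granular counterpart of pgdat_reclaimable_pages() below: file pages always count, anonymous pages only when swap is available, and isolated pages are deliberately ignored because the result only gates whether reclaim or compaction keeps going. A minimal sketch of how such a helper might be consumed; the caller name and threshold here are illustrative and not part of this patch:

        /*
         * Hypothetical caller sketch: treat a zone as worth another
         * reclaim/compaction pass only while it still holds a meaningful
         * number of reclaimable pages. Illustrative only.
         */
        static bool zone_worth_retrying(struct zone *zone)
        {
                /* snapshot-based counters; slightly stale values are fine */
                return zone_reclaimable_pages(zone) >= SWAP_CLUSTER_MAX;
        }
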
@@ -1359,23 +1377,14 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
                        enum lru_list lru, unsigned long *nr_zone_taken,
                        unsigned long nr_taken)
 {
-#ifdef CONFIG_HIGHMEM
        int zid;
 
-       /*
-        * Highmem has separate accounting for highmem pages so each zone
-        * is updated separately.
-        */
        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                if (!nr_zone_taken[zid])
                        continue;
 
                __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
        }
-#else
-       /* Zone ID does not matter on !HIGHMEM */
-       __update_lru_size(lruvec, lru, 0, -nr_taken);
-#endif
 
 #ifdef CONFIG_MEMCG
        mem_cgroup_update_lru_size(lruvec, lru, -nr_taken);
@@ -1415,7 +1424,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        LIST_HEAD(pages_skipped);
 
        for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
-                                       !list_empty(src); scan++) {
+                                       !list_empty(src);) {
                struct page *page;
 
                page = lru_to_page(src);
@@ -1429,6 +1438,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                        continue;
                }
 
+               /*
+                * Account for scanned and skipped separately to avoid the pgdat
+                * being prematurely marked unreclaimable by pgdat_reclaimable.
+                */
+               scan++;
+
                switch (__isolate_lru_page(page, mode)) {
                case 0:
                        nr_pages = hpage_nr_pages(page);
@@ -1456,14 +1471,24 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
         */
        if (!list_empty(&pages_skipped)) {
                int zid;
+               unsigned long total_skipped = 0;
 
-               list_splice(&pages_skipped, src);
                for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                        if (!nr_skipped[zid])
                                continue;
 
                        __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
+                       total_skipped += nr_skipped[zid];
                }
+
+               /*
+                * Account skipped pages as a partial scan because the pgdat
+                * may be close to unreclaimable. If the LRU list is empty,
+                * account skipped pages as a full scan.
+                */
+               scan += list_empty(src) ? total_skipped : total_skipped >> 2;
+
+               list_splice(&pages_skipped, src);
        }
        *nr_scanned = scan;
        trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan,
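
With skipped pages accounted separately, pages sitting in zones above sc->reclaim_idx no longer inflate the scan count directly; they are folded back into *nr_scanned with a quarter weight, or with full weight once the LRU has been emptied, so pgdat_reclaimable() is not tripped prematurely by a stream of ineligible pages. Illustrative numbers: if 32 pages pass the eligibility check (scan == 32) and 128 pages were moved to pages_skipped while src still has entries, *nr_scanned becomes 32 + (128 >> 2) = 64; had src been emptied, it would be 32 + 128 = 160.
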
@@ -1627,6 +1652,30 @@ static int current_may_throttle(void)
                bdi_write_congested(current->backing_dev_info);
 }
 
+static bool inactive_reclaimable_pages(struct lruvec *lruvec,
+                               struct scan_control *sc, enum lru_list lru)
+{
+       int zid;
+       struct zone *zone;
+       int file = is_file_lru(lru);
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+       if (!global_reclaim(sc))
+               return true;
+
+       for (zid = sc->reclaim_idx; zid >= 0; zid--) {
+               zone = &pgdat->node_zones[zid];
+               if (!populated_zone(zone))
+                       continue;
+
+               if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE +
+                               LRU_FILE * file) >= SWAP_CLUSTER_MAX)
+                       return true;
+       }
+
+       return false;
+}
+
 /*
  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
@@ -1649,6 +1698,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 
+       if (!inactive_reclaimable_pages(lruvec, sc, lru))
+               return 0;
+
        while (unlikely(too_many_isolated(pgdat, file, sc))) {
                congestion_wait(BLK_RW_ASYNC, HZ/10);
 
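
inactive_reclaimable_pages() lets global reclaim skip shrink_inactive_list() when no zone eligible for the current allocation (zid <= sc->reclaim_idx) holds at least SWAP_CLUSTER_MAX (32) pages on the inactive list being targeted. The index arithmetic relies on the zone stat items mirroring the LRU layout: assuming the usual LRU_FILE == 2 from include/linux/mmzone.h, NR_ZONE_LRU_BASE + LRU_FILE * file resolves to NR_ZONE_INACTIVE_FILE for a file LRU (file == 1) and to NR_ZONE_INACTIVE_ANON for an anon LRU (file == 0). For example, if only ZONE_DMA32 is eligible and it holds 12 inactive file pages, a file-LRU scan is skipped since 12 < 32.
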
@@ -1955,12 +2007,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *    1TB     101        10GB
  *   10TB     320        32GB
  */
-static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
+                                               struct scan_control *sc)
 {
        unsigned long inactive_ratio;
        unsigned long inactive;
        unsigned long active;
        unsigned long gb;
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       int zid;
 
        /*
         * If we don't have swap space, anonymous page deactivation
@@ -1972,6 +2027,27 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
        inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
        active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
 
+       /*
+        * For zone-constrained allocations, it is necessary to check if
+        * deactivations are required for lowmem to be reclaimed. This
+        * calculates the inactive/active pages available in eligible zones.
+        */
+       for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) {
+               struct zone *zone = &pgdat->node_zones[zid];
+               unsigned long inactive_zone, active_zone;
+
+               if (!populated_zone(zone))
+                       continue;
+
+               inactive_zone = zone_page_state(zone,
+                               NR_ZONE_LRU_BASE + (file * LRU_FILE));
+               active_zone = zone_page_state(zone,
+                               NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE);
+
+               inactive -= min(inactive, inactive_zone);
+               active -= min(active, active_zone);
+       }
+
        gb = (inactive + active) >> (30 - PAGE_SHIFT);
        if (gb)
                inactive_ratio = int_sqrt(10 * gb);
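
The ratio calculation itself is unchanged: gb = (inactive + active) >> (30 - PAGE_SHIFT) converts the LRU size from pages to gigabytes (a shift by 18 with 4KB pages), and inactive_ratio = int_sqrt(10 * gb). Working the table above through: a 1TB LRU gives gb = 1024 and int_sqrt(10240) = 101, so roughly 1/102 of the list (about 10GB) is kept inactive; 10TB gives int_sqrt(102400) = 320 (about 32GB). What is new is that pages in zones above sc->reclaim_idx are subtracted first, so a lowmem-constrained allocation sizes its inactive target against memory it can actually use.
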
@@ -1985,7 +2061,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
                                 struct lruvec *lruvec, struct scan_control *sc)
 {
        if (is_active_lru(lru)) {
-               if (inactive_list_is_low(lruvec, is_file_lru(lru)))
+               if (inactive_list_is_low(lruvec, is_file_lru(lru), sc))
                        shrink_active_list(nr_to_scan, lruvec, sc, lru);
                return 0;
        }
@@ -2116,7 +2192,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * lruvec even if it has plenty of old anonymous pages unless the
         * system is under heavy pressure.
         */
-       if (!inactive_list_is_low(lruvec, true) &&
+       if (!inactive_list_is_low(lruvec, true, sc) &&
            lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
                scan_balance = SCAN_FILE;
                goto out;
@@ -2358,7 +2434,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (inactive_list_is_low(lruvec, false))
+       if (inactive_list_is_low(lruvec, false, sc))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 
@@ -2485,7 +2561,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                        shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
                        node_lru_pages += lru_pages;
 
-                       if (!global_reclaim(sc))
+                       if (memcg)
                                shrink_slab(sc->gfp_mask, pgdat->node_id,
                                            memcg, sc->nr_scanned - scanned,
                                            lru_pages);
@@ -2605,9 +2681,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                        sc->reclaim_idx, sc->nodemask) {
-               if (!populated_zone(zone))
-                       continue;
-
                /*
                 * Take care: memory controller reclaiming has only a small
                 * influence on the global LRU.
@@ -3014,7 +3087,7 @@ static void age_active_anon(struct pglist_data *pgdat,
        do {
                struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
-               if (inactive_list_is_low(lruvec, false))
+               if (inactive_list_is_low(lruvec, false, sc))
                        shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                           sc, LRU_ACTIVE_ANON);
 
@@ -3777,24 +3850,23 @@ int page_evictable(struct page *page)
 void check_move_unevictable_pages(struct page **pages, int nr_pages)
 {
        struct lruvec *lruvec;
-       struct zone *zone = NULL;
+       struct pglist_data *pgdat = NULL;
        int pgscanned = 0;
        int pgrescued = 0;
        int i;
 
        for (i = 0; i < nr_pages; i++) {
                struct page *page = pages[i];
-               struct zone *pagezone;
+               struct pglist_data *pagepgdat = page_pgdat(page);
 
                pgscanned++;
-               pagezone = page_zone(page);
-               if (pagezone != zone) {
-                       if (zone)
-                               spin_unlock_irq(zone_lru_lock(zone));
-                       zone = pagezone;
-                       spin_lock_irq(zone_lru_lock(zone));
+               if (pagepgdat != pgdat) {
+                       if (pgdat)
+                               spin_unlock_irq(&pgdat->lru_lock);
+                       pgdat = pagepgdat;
+                       spin_lock_irq(&pgdat->lru_lock);
                }
-               lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+               lruvec = mem_cgroup_page_lruvec(page, pgdat);
 
                if (!PageLRU(page) || !PageUnevictable(page))
                        continue;
@@ -3810,10 +3882,10 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
                }
        }
 
-       if (zone) {
+       if (pgdat) {
                __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
                __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
-               spin_unlock_irq(zone_lru_lock(zone));
+               spin_unlock_irq(&pgdat->lru_lock);
        }
 }
 #endif /* CONFIG_SHMEM */
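
check_move_unevictable_pages() now batches work under the per-node lru_lock rather than the per-zone wrapper. Since zone_lru_lock(zone) in this tree already resolves to &zone->zone_pgdat->lru_lock, comparing page_pgdat() instead of page_zone() only removes pointless lock churn: a run of pages alternating between, say, ZONE_NORMAL and ZONE_DMA32 of the same node previously dropped and retook the very same spinlock at every zone transition, whereas it now holds the lock once for the whole run.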