/* Scan (total_size >> priority) pages at once */
int priority;
+ /* The highest zone to isolate pages for reclaim from */
+ enum zone_type reclaim_idx;
+
unsigned int may_writepage:1;
/* Can mapped pages be reclaimed? */
}
#endif
+/*
+ * This misses isolated pages, which are deliberately not accounted for here
+ * to avoid extra counters. As the data only determines whether reclaim or
+ * compaction continues, it is
+ * not expected that isolated pages will be a dominating factor.
+ */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
unsigned long nr;
- nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
- zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
- zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
+ nr = zone_page_state_snapshot(zone, NR_ZONE_LRU_FILE);
+ if (get_nr_swap_pages() > 0)
+ nr += zone_page_state_snapshot(zone, NR_ZONE_LRU_ANON);
+
+ return nr;
+}
+
+unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
+{
+ unsigned long nr;
+
+ nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
+ node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
+ node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
if (get_nr_swap_pages() > 0)
- nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
- zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
- zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
+ nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
+ node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
+ node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
return nr;
}
-bool zone_reclaimable(struct zone *zone)
+bool pgdat_reclaimable(struct pglist_data *pgdat)
{
- return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
- zone_reclaimable_pages(zone) * 6;
+ return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
+ pgdat_reclaimable_pages(pgdat) * 6;
}
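
For orientation, a minimal sketch of how the six-to-one scanned/reclaimable heuristic above is consumed; the wrapper name below is hypothetical, and the real call sites later in this patch simply test pgdat_reclaimable() inline next to sc->priority:

/* Hypothetical wrapper mirroring the checks in shrink_zones()/balance_pgdat(). */
static bool node_worth_scanning(struct pglist_data *pgdat, struct scan_control *sc)
{
	/* At DEF_PRIORITY every node still gets at least one scan. */
	if (sc->priority == DEF_PRIORITY)
		return true;

	/* Back off once six times the reclaimable pages have been scanned. */
	return pgdat_reclaimable(pgdat);
}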
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);
- return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
+ return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
}
/*
* shrink_page_list() returns the number of reclaimed pages
*/
static unsigned long shrink_page_list(struct list_head *page_list,
- struct zone *zone,
+ struct pglist_data *pgdat,
struct scan_control *sc,
enum ttu_flags ttu_flags,
unsigned long *ret_nr_dirty,
goto keep;
VM_BUG_ON_PAGE(PageActive(page), page);
- VM_BUG_ON_PAGE(page_zone(page) != zone, page);
sc->nr_scanned++;
/* Case 1 above */
if (current_is_kswapd() &&
PageReclaim(page) &&
- test_bit(ZONE_WRITEBACK, &zone->flags)) {
+ test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
nr_immediate++;
goto keep_locked;
/* Adding to swap updated mapping */
mapping = page_mapping(page);
+ } else if (unlikely(PageTransHuge(page))) {
+ /* Split file THP */
+ if (split_huge_page_to_list(page, page_list))
+ goto keep_locked;
}
+ VM_BUG_ON_PAGE(PageTransHuge(page), page);
+
/*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
if (page_is_file_cache(page) &&
(!current_is_kswapd() ||
- !test_bit(ZONE_DIRTY, &zone->flags))) {
+ !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
* Similar in principle to deactivate_page()
list_for_each_entry_safe(page, next, page_list, lru) {
if (page_is_file_cache(page) && !PageDirty(page) &&
- !isolated_balloon_page(page)) {
+ !__PageMovable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
}
- ret = shrink_page_list(&clean_pages, zone, &sc,
+ ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
TTU_UNMAP|TTU_IGNORE_ACCESS,
&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
list_splice(&clean_pages, page_list);
- mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
}
}
/*
- * zone->lru_lock is heavily contended. Some of the functions that
+ * zone_lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
{
struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
- unsigned long scan;
+ unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
+ unsigned long scan, nr_pages;
+ LIST_HEAD(pages_skipped);
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
!list_empty(src); scan++) {
VM_BUG_ON_PAGE(!PageLRU(page), page);
+ if (page_zonenum(page) > sc->reclaim_idx) {
+ list_move(&page->lru, &pages_skipped);
+ continue;
+ }
+
switch (__isolate_lru_page(page, mode)) {
case 0:
- nr_taken += hpage_nr_pages(page);
+ nr_pages = hpage_nr_pages(page);
+ nr_taken += nr_pages;
+ nr_zone_taken[page_zonenum(page)] += nr_pages;
list_move(&page->lru, dst);
break;
}
}
+ /*
+ * Splice any skipped pages to the start of the LRU list. Note that
+ * this disrupts the LRU order when reclaiming for lower zones but
+ * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
+ * scanning would soon rescan the same pages to skip and put the
+ * system at risk of premature OOM.
+ */
+ if (!list_empty(&pages_skipped))
+ list_splice(&pages_skipped, src);
*nr_scanned = scan;
trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
nr_taken, mode, is_file_lru(lru));
+ for (scan = 0; scan < MAX_NR_ZONES; scan++) {
+ nr_pages = nr_zone_taken[scan];
+ if (!nr_pages)
+ continue;
+
+ update_lru_size(lruvec, lru, scan, -nr_pages);
+ }
return nr_taken;
}
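
The nr_zone_taken[] totals gathered above feed the zone-aware update_lru_size() calls just above. The helper itself lives in a header outside this hunk; as a minimal sketch, assuming the NR_ZONE_LRU_ANON/NR_ZONE_LRU_FILE counters introduced earlier in this series and ignoring the memcg side, it amounts to:

static inline void update_lru_size_sketch(struct lruvec *lruvec, enum lru_list lru,
					  enum zone_type zid, int nr_pages)
{
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);

	/* The node-wide LRU counter is what reclaim decisions now use. */
	__mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
	/* Per-zone anon/file totals are kept for zone-level checks. */
	__mod_zone_page_state(&pgdat->node_zones[zid],
			      is_file_lru(lru) ? NR_ZONE_LRU_FILE : NR_ZONE_LRU_ANON,
			      nr_pages);
}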
struct zone *zone = page_zone(page);
struct lruvec *lruvec;
- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ spin_lock_irq(zone_lru_lock(zone));
+ lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
if (PageLRU(page)) {
int lru = page_lru(page);
get_page(page);
del_page_from_lru_list(page, lruvec, lru);
ret = 0;
}
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
}
return ret;
}
* the LRU list will go small and be scanned faster than necessary, leading to
* unnecessary swapping, thrashing and OOM.
*/
-static int too_many_isolated(struct zone *zone, int file,
+static int too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
return 0;
if (file) {
- inactive = zone_page_state(zone, NR_INACTIVE_FILE);
- isolated = zone_page_state(zone, NR_ISOLATED_FILE);
+ inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
+ isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
} else {
- inactive = zone_page_state(zone, NR_INACTIVE_ANON);
- isolated = zone_page_state(zone, NR_ISOLATED_ANON);
+ inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
+ isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
}
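
The tail of too_many_isolated() is elided from this hunk. For context only (this is not part of the diff), the existing logic these counts feed is along these lines: GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages so they are not throttled behind normal direct reclaim, and the function then compares the two counts:

	if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
		inactive >>= 3;	/* throttle full-capability callers sooner */

	return isolated > inactive;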
/*
putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
{
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- struct zone *zone = lruvec_zone(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
LIST_HEAD(pages_to_free);
/*
VM_BUG_ON_PAGE(PageLRU(page), page);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
putback_lru_page(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
continue;
}
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
SetPageLRU(page);
lru = page_lru(page);
del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
}
}
/*
- * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
*/
static noinline_for_stack unsigned long
unsigned long nr_immediate = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
- struct zone *zone = lruvec_zone(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
- while (unlikely(too_many_isolated(zone, file, sc))) {
+ while (unlikely(too_many_isolated(pgdat, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
/* We are about to die and free our memory. Return now. */
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, lru);
- update_lru_size(lruvec, lru, -nr_taken);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
+ __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
if (current_is_kswapd())
- __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
+ __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
else
- __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
+ __count_vm_events(PGSCAN_DIRECT, nr_scanned);
}
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+ nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
&nr_dirty, &nr_unqueued_dirty, &nr_congested,
&nr_writeback, &nr_immediate,
false);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
if (global_reclaim(sc)) {
if (current_is_kswapd())
- __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
- nr_reclaimed);
+ __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
else
- __count_zone_vm_events(PGSTEAL_DIRECT, zone,
- nr_reclaimed);
+ __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
}
putback_inactive_pages(lruvec, &page_list);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&page_list);
free_hot_cold_page_list(&page_list, true);
* are encountered in the nr_immediate check below.
*/
if (nr_writeback && nr_writeback == nr_taken)
- set_bit(ZONE_WRITEBACK, &zone->flags);
+ set_bit(PGDAT_WRITEBACK, &pgdat->flags);
/*
* Legacy memcg will stall in page writeback so avoid forcibly
* backed by a congested BDI and wait_iff_congested will stall.
*/
if (nr_dirty && nr_dirty == nr_congested)
- set_bit(ZONE_CONGESTED, &zone->flags);
+ set_bit(PGDAT_CONGESTED, &pgdat->flags);
/*
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not keeping up. In this case, flag
- * the zone ZONE_DIRTY and kswapd will start writing pages from
+ * the pgdat PGDAT_DIRTY and kswapd will start writing pages from
* reclaim context.
*/
if (nr_unqueued_dirty == nr_taken)
- set_bit(ZONE_DIRTY, &zone->flags);
+ set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
* If kswapd scans pages marked for immediate
*/
if (!sc->hibernation_mode && !current_is_kswapd() &&
current_may_throttle())
- wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+ wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
- trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed,
+ trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
+ nr_scanned, nr_reclaimed,
sc->priority, file);
return nr_reclaimed;
}
* processes, from rmap.
*
* If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone->lru_lock across the whole operation. But if
+ * appropriate to hold zone_lru_lock across the whole operation. But if
* the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone->lru_lock around each page. It's impossible to balance
+ * should drop zone_lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
* nobody will play with that bit on a non-LRU page.
struct list_head *pages_to_free,
enum lru_list lru)
{
- struct zone *zone = lruvec_zone(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
unsigned long pgmoved = 0;
struct page *page;
int nr_pages;
while (!list_empty(list)) {
page = lru_to_page(list);
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
nr_pages = hpage_nr_pages(page);
- update_lru_size(lruvec, lru, nr_pages);
+ update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
pgmoved += nr_pages;
del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, pages_to_free);
}
unsigned long nr_rotated = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
- struct zone *zone = lruvec_zone(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
- update_lru_size(lruvec, lru, -nr_taken);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc))
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
- __count_zone_vm_events(PGREFILL, zone, nr_scanned);
+ __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
+ __count_vm_events(PGREFILL, nr_scanned);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
while (!list_empty(&l_hold)) {
cond_resched();
/*
* Move pages back to the lru list.
*/
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
/*
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated. This
move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
- __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&zone->lru_lock);
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&l_hold);
free_hot_cold_page_list(&l_hold, true);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
- struct zone *zone = lruvec_zone(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
unsigned long anon, file;
* well.
*/
if (current_is_kswapd()) {
- if (!zone_reclaimable(zone))
+ if (!pgdat_reclaimable(pgdat))
force_scan = true;
if (!mem_cgroup_online(memcg))
force_scan = true;
* anon pages. Try to detect this based on file LRU size.
*/
if (global_reclaim(sc)) {
- unsigned long zonefile;
- unsigned long zonefree;
+ unsigned long pgdatfile;
+ unsigned long pgdatfree;
+ int z;
+ unsigned long total_high_wmark = 0;
+
+ pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
- zonefree = zone_page_state(zone, NR_FREE_PAGES);
- zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE);
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+ if (!populated_zone(zone))
+ continue;
- if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
scan_balance = SCAN_ANON;
goto out;
}
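
As a worked illustration of the check above (the numbers are purely illustrative): on a node with pgdatfree = 2048 and pgdatfile = 1024 pages whose populated zones' high watermarks sum to total_high_wmark = 4096 pages, 1024 + 2048 = 3072 <= 4096, so even reclaiming every file page could not bring the node above its watermarks and the scan is forced to SCAN_ANON.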
file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(&pgdat->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
reclaim_stat->recent_scanned[0] /= 2;
reclaim_stat->recent_rotated[0] /= 2;
fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(&pgdat->lru_lock);
fraction[0] = ap;
fraction[1] = fp;
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
- inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+ inactive_lru_pages = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE);
if (get_nr_swap_pages() > 0)
- inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
+ inactive_lru_pages += node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
if (sc->nr_reclaimed < pages_for_compaction &&
inactive_lru_pages > pages_for_compaction)
return true;
}
}
-static bool shrink_zone(struct zone *zone, struct scan_control *sc,
- bool is_classzone)
+static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc,
+ enum zone_type classzone_idx)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
+ struct zone *zone = &pgdat->node_zones[classzone_idx];
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
shrink_zone_memcg(zone, memcg, sc, &lru_pages);
zone_lru_pages += lru_pages;
- if (memcg && is_classzone)
+ if (!global_reclaim(sc))
shrink_slab(sc->gfp_mask, zone_to_nid(zone),
memcg, sc->nr_scanned - scanned,
lru_pages);
* Shrink the slab caches in the same proportion that
* the eligible LRU pages were scanned.
*/
- if (global_reclaim(sc) && is_classzone)
+ if (global_reclaim(sc))
shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
sc->nr_scanned - nr_scanned,
zone_lru_pages);
*/
static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx)
{
- unsigned long balance_gap, watermark;
+ unsigned long watermark;
bool watermark_ok;
/*
* there is a buffer of free pages available to give compaction
* a reasonable chance of completing and allocating the page
*/
- balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
- zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
- watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
+ watermark = high_wmark_pages(zone) + (2UL << order);
watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx);
/*
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
gfp_t orig_mask;
- enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
+ enum zone_type classzone_idx;
/*
* If the number of buffer_heads in the machine exceeds the maximum
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
orig_mask = sc->gfp_mask;
- if (buffer_heads_over_limit)
+ if (buffer_heads_over_limit) {
sc->gfp_mask |= __GFP_HIGHMEM;
+ sc->reclaim_idx = classzone_idx = gfp_zone(sc->gfp_mask);
+ }
for_each_zone_zonelist_nodemask(zone, z, zonelist,
- gfp_zone(sc->gfp_mask), sc->nodemask) {
- enum zone_type classzone_idx;
-
+ sc->reclaim_idx, sc->nodemask) {
if (!populated_zone(zone))
continue;
- classzone_idx = requested_highidx;
+ /*
+		 * Note that reclaim_idx does not change: it is the highest
+		 * zone being reclaimed from, and reclaiming from an empty
+		 * zone is simply a no-op. classzone_idx, by contrast, is
+		 * used by shrink_node to test if the slabs should be shrunk
+		 * on a given node.
+ */
+ classzone_idx = sc->reclaim_idx;
while (!populated_zone(zone->zone_pgdat->node_zones +
classzone_idx))
classzone_idx--;
continue;
if (sc->priority != DEF_PRIORITY &&
- !zone_reclaimable(zone))
+ !pgdat_reclaimable(zone->zone_pgdat))
continue; /* Let kswapd poll it */
/*
*/
if (IS_ENABLED(CONFIG_COMPACTION) &&
sc->order > PAGE_ALLOC_COSTLY_ORDER &&
- zonelist_zone_idx(z) <= requested_highidx &&
- compaction_ready(zone, sc->order, requested_highidx)) {
+ zonelist_zone_idx(z) <= classzone_idx &&
+ compaction_ready(zone, sc->order, classzone_idx)) {
sc->compaction_ready = true;
continue;
}
/* need some check to avoid more shrink_zone() */
}
- shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+ shrink_node(zone->zone_pgdat, sc, classzone_idx);
}
/*
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
if (!populated_zone(zone) ||
- zone_reclaimable_pages(zone) == 0)
+ pgdat_reclaimable_pages(pgdat) == 0)
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
.priority = DEF_PRIORITY,
.target_mem_cgroup = memcg,
.may_writepage = !laptop_mode,
.may_unmap = 1,
+ .reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
};
unsigned long lru_pages;
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
+ .reclaim_idx = MAX_NR_ZONES - 1,
.target_mem_cgroup = memcg,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
}
#endif
-static void age_active_anon(struct zone *zone, struct scan_control *sc)
+static void age_active_anon(struct pglist_data *pgdat,
+ struct zone *zone, struct scan_control *sc)
{
struct mem_cgroup *memcg;
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order, bool highorder,
- unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
{
- unsigned long mark = high_wmark_pages(zone) + balance_gap;
-
- /*
- * When checking from pgdat_balanced(), kswapd should stop and sleep
- * when it reaches the high order-0 watermark and let kcompactd take
- * over. Other callers such as wakeup_kswapd() want to determine the
- * true high-order watermark.
- */
- if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
- mark += (1UL << order);
- order = 0;
- }
+ unsigned long mark = high_wmark_pages(zone);
return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
}
-/*
- * pgdat_balanced() is used when checking if a node is balanced.
- *
- * For order-0, all zones must be balanced!
- *
- * For high-order allocations only zones that meet watermarks and are in a
- * zone allowed by the callers classzone_idx are added to balanced_pages. The
- * total of balanced pages must be at least 25% of the zones allowed by
- * classzone_idx for the node to be considered balanced. Forcing all zones to
- * be balanced for high orders can cause excessive reclaim when there are
- * imbalanced zones.
- * The choice of 25% is due to
- * o a 16M DMA zone that is balanced will not balance a zone on any
- * reasonable sized machine
- * o On all other machines, the top zone must be at least a reasonable
- * percentage of the middle zones. For example, on 32-bit x86, highmem
- * would need to be at least 256M for it to be balance a whole node.
- * Similarly, on x86-64 the Normal zone would need to be at least 1G
- * to balance a node on its own. These seemed like reasonable ratios.
- */
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
-{
- unsigned long managed_pages = 0;
- unsigned long balanced_pages = 0;
- int i;
-
- /* Check the watermark levels */
- for (i = 0; i <= classzone_idx; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- managed_pages += zone->managed_pages;
-
- /*
- * A special case here:
- *
- * balance_pgdat() skips over all_unreclaimable after
- * DEF_PRIORITY. Effectively, it considers them balanced so
- * they must be considered balanced here as well!
- */
- if (!zone_reclaimable(zone)) {
- balanced_pages += zone->managed_pages;
- continue;
- }
-
- if (zone_balanced(zone, order, false, 0, i))
- balanced_pages += zone->managed_pages;
- else if (!order)
- return false;
- }
-
- if (order)
- return balanced_pages >= (managed_pages >> 2);
- else
- return true;
-}
-
/*
* Prepare kswapd for sleeping. This verifies that there are no processes
* waiting in throttle_direct_reclaim() and that watermarks have been met.
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
int classzone_idx)
{
+ int i;
+
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
return false;
if (waitqueue_active(&pgdat->pfmemalloc_wait))
wake_up_all(&pgdat->pfmemalloc_wait);
- return pgdat_balanced(pgdat, order, classzone_idx);
+ for (i = 0; i <= classzone_idx; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_balanced(zone, order, classzone_idx))
+ return true;
+ }
+
+ return false;
}
/*
- * kswapd shrinks the zone by the number of pages required to reach
- * the high watermark.
+ * kswapd shrinks a node by reclaiming pages that are at or below the highest
+ * usable zone that is currently unbalanced.
*
* Returns true if kswapd scanned at least the requested number of pages to
* reclaim or if the lack of progress was due to pages under writeback.
* This is used to determine if the scanning priority needs to be raised.
*/
-static bool kswapd_shrink_zone(struct zone *zone,
+static bool kswapd_shrink_node(pg_data_t *pgdat,
int classzone_idx,
struct scan_control *sc)
{
- unsigned long balance_gap;
- bool lowmem_pressure;
+ struct zone *zone;
+ int z;
- /* Reclaim above the high watermark. */
- sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+ /* Reclaim a number of pages proportional to the number of zones */
+ sc->nr_to_reclaim = 0;
+ for (z = 0; z <= classzone_idx; z++) {
+ zone = pgdat->node_zones + z;
+ if (!populated_zone(zone))
+ continue;
- /*
- * We put equal pressure on every zone, unless one zone has way too
- * many pages free already. The "too many pages" is defined as the
- * high wmark plus a "gap" where the gap is either the low
- * watermark or 1% of the zone, whichever is smaller.
- */
- balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
- zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
+ sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
+ }
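
As a rough worked example (the watermark values are illustrative): with classzone_idx == ZONE_NORMAL on a node whose DMA32 high watermark is 4096 pages and whose Normal high watermark is 16384 pages, the loop above sets sc->nr_to_reclaim = max(4096, SWAP_CLUSTER_MAX) + max(16384, SWAP_CLUSTER_MAX) = 20480 pages for this pass, rather than reclaiming each zone to its own watermark independently as the old kswapd_shrink_zone() did.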
/*
- * If there is no low memory pressure or the zone is balanced then no
- * reclaim is necessary
+ * Historically care was taken to put equal pressure on all zones but
+ * now pressure is applied based on node LRU order.
*/
- lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
- if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
- balance_gap, classzone_idx))
- return true;
-
- shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
-
- clear_bit(ZONE_WRITEBACK, &zone->flags);
+ shrink_node(pgdat, sc, classzone_idx);
/*
- * If a zone reaches its high watermark, consider it to be no longer
- * congested. It's possible there are dirty pages backed by congested
- * BDIs but as pressure is relieved, speculatively avoid congestion
- * waits.
+ * Fragmentation may mean that the system cannot be rebalanced for
+ * high-order allocations. If twice the allocation size has been
+ * reclaimed then recheck watermarks only at order-0 to prevent
+	 * excessive reclaim. Assume that a process requesting a high-order
+	 * allocation can itself direct reclaim/compact.
*/
- if (zone_reclaimable(zone) &&
- zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
- clear_bit(ZONE_CONGESTED, &zone->flags);
- clear_bit(ZONE_DIRTY, &zone->flags);
- }
+ if (sc->order && sc->nr_reclaimed >= 2UL << sc->order)
+ sc->order = 0;
return sc->nr_scanned >= sc->nr_to_reclaim;
}
/*
- * For kswapd, balance_pgdat() will work across all this node's zones until
- * they are all at high_wmark_pages(zone).
+ * For kswapd, balance_pgdat() will reclaim pages across a node from zones
+ * that are eligible for use by the caller until at least one zone is
+ * balanced.
*
- * Returns the highest zone idx kswapd was reclaiming at
- *
- * There is special handling here for zones which are full of pinned pages.
- * This can happen if the pages are all mlocked, or if they are all used by
- * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
- * What we do is to detect the case where all pages in the zone have been
- * scanned twice and there has been zero successful reclaim. Mark the zone as
- * dead and from now on, only perform a short scan. Basically we're polling
- * the zone for when the problem goes away.
+ * Returns the order kswapd finished reclaiming at.
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
- * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
- * lower zones regardless of the number of free pages in the lower zones. This
- * interoperates with the page allocator fallback scheme to ensure that aging
- * of pages is balanced across the zones.
+ * found to have free_pages <= high_wmark_pages(zone), any page in that zone
+ * or lower is eligible for reclaim until at least one usable zone is
+ * balanced.
*/
static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
- int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = 1,
+ .reclaim_idx = classzone_idx,
};
count_vm_event(PAGEOUTRUN);
sc.nr_reclaimed = 0;
- /*
- * Scan in the highmem->dma direction for the highest
- * zone which needs scanning
- */
- for (i = pgdat->nr_zones - 1; i >= 0; i--) {
- struct zone *zone = pgdat->node_zones + i;
-
+ /* Scan from the highest requested zone to dma */
+ for (i = classzone_idx; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
- if (sc.priority != DEF_PRIORITY &&
- !zone_reclaimable(zone))
- continue;
-
- /*
- * Do some background aging of the anon list, to give
- * pages a chance to be referenced before reclaiming.
- */
- age_active_anon(zone, &sc);
-
/*
* If the number of buffer_heads in the machine
* exceeds the maximum allowed level and this node
* it to relieve lowmem pressure.
*/
if (buffer_heads_over_limit && is_highmem_idx(i)) {
- end_zone = i;
+ classzone_idx = i;
break;
}
- if (!zone_balanced(zone, order, false, 0, 0)) {
- end_zone = i;
+ if (!zone_balanced(zone, order, 0)) {
+ classzone_idx = i;
break;
} else {
/*
- * If balanced, clear the dirty and congested
- * flags
+ * If any eligible zone is balanced then the
+ * node is not considered congested or dirty.
*/
- clear_bit(ZONE_CONGESTED, &zone->flags);
- clear_bit(ZONE_DIRTY, &zone->flags);
+ clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
+ clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
}
}
if (i < 0)
goto out;
+ /*
+ * Do some background aging of the anon list, to give
+ * pages a chance to be referenced before reclaiming. All
+ * pages are rotated regardless of classzone as this is
+ * about consistent aging.
+ */
+ age_active_anon(pgdat, &pgdat->node_zones[MAX_NR_ZONES - 1], &sc);
+
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
*/
- if (sc.priority < DEF_PRIORITY - 2)
+ if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
sc.may_writepage = 1;
+ /* Call soft limit reclaim before calling shrink_node. */
+ sc.nr_scanned = 0;
+ nr_soft_scanned = 0;
+ nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, sc.order,
+ sc.gfp_mask, &nr_soft_scanned);
+ sc.nr_reclaimed += nr_soft_reclaimed;
+
/*
- * Now scan the zone in the dma->highmem direction, stopping
- * at the last zone which needs scanning.
- *
- * We do this because the page allocator works in the opposite
- * direction. This prevents the page allocator from allocating
- * pages behind kswapd's direction of progress, which would
- * cause too much scanning of the lower zones.
+ * There should be no need to raise the scanning priority if
+ * enough pages are already being scanned that that high
+ * watermark would be met at 100% efficiency.
*/
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- if (sc.priority != DEF_PRIORITY &&
- !zone_reclaimable(zone))
- continue;
-
- sc.nr_scanned = 0;
-
- nr_soft_scanned = 0;
- /*
- * Call soft limit reclaim before calling shrink_zone.
- */
- nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
- order, sc.gfp_mask,
- &nr_soft_scanned);
- sc.nr_reclaimed += nr_soft_reclaimed;
-
- /*
- * There should be no need to raise the scanning
- * priority if enough pages are already being scanned
- * that that high watermark would be met at 100%
- * efficiency.
- */
- if (kswapd_shrink_zone(zone, end_zone, &sc))
- raise_priority = false;
- }
+ if (kswapd_shrink_node(pgdat, classzone_idx, &sc))
+ raise_priority = false;
/*
* If the low watermark is met there is no need for processes
if (try_to_freeze() || kthread_should_stop())
break;
+ /*
+ * Stop reclaiming if any eligible zone is balanced and clear
+ * node writeback or congested.
+ */
+ for (i = 0; i <= classzone_idx; i++) {
+ zone = pgdat->node_zones + i;
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_balanced(zone, sc.order, classzone_idx)) {
+ clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+ clear_bit(PGDAT_DIRTY, &pgdat->flags);
+ goto out;
+ }
+ }
+
/*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
- } while (sc.priority >= 1 &&
- !pgdat_balanced(pgdat, order, classzone_idx));
+ } while (sc.priority >= 1);
out:
/*
- * Return the highest zone idx we were reclaiming at so
- * prepare_kswapd_sleep() makes the same decisions as here.
+ * Return the order kswapd stopped reclaiming at as
+ * prepare_kswapd_sleep() takes it into account. If another caller
+ * entered the allocator slow path while kswapd was awake, order will
+ * remain at the higher level.
*/
- return end_zone;
+ return sc.order;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balanced_classzone_idx = balance_pgdat(pgdat, order,
- classzone_idx);
+
+ /* return value ignored until next patch */
+ balance_pgdat(pgdat, order, classzone_idx);
}
}
}
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- if (zone_balanced(zone, order, true, 0, 0))
+ if (zone_balanced(zone, order, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
struct scan_control sc = {
.nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
+ .reclaim_idx = MAX_NR_ZONES - 1,
.priority = DEF_PRIORITY,
.may_writepage = 1,
.may_unmap = 1,
static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
{
unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
- unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
- zone_page_state(zone, NR_ACTIVE_FILE);
+ unsigned long file_lru = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
+ node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE);
/*
* It's possible for there to be more file mapped pages than
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
+ .reclaim_idx = zone_idx(zone),
};
cond_resched();
* priorities until we have enough memory freed.
*/
do {
- shrink_zone(zone, &sc, true);
+ shrink_node(zone->zone_pgdat, &sc, zone_idx(zone));
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
return ZONE_RECLAIM_FULL;
- if (!zone_reclaimable(zone))
+ if (!pgdat_reclaimable(zone->zone_pgdat))
return ZONE_RECLAIM_FULL;
/*
pagezone = page_zone(page);
if (pagezone != zone) {
if (zone)
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
+ spin_lock_irq(zone_lru_lock(zone));
}
- lruvec = mem_cgroup_page_lruvec(page, zone);
+ lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
if (!PageLRU(page) || !PageUnevictable(page))
continue;
if (zone) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- spin_unlock_irq(&zone->lru_lock);
+ spin_unlock_irq(zone_lru_lock(zone));
}
}
#endif /* CONFIG_SHMEM */