else
page = list_first_entry(list, struct page, lru);
- __dec_zone_state(zone, NR_ALLOC_BATCH);
list_del(&page->lru);
pcp->count--;
spin_unlock(&zone->lock);
if (!page)
goto failed;
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
}
- if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
- !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
- set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
}
#ifdef CONFIG_NUMA
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
- return local_zone->node == zone->node;
-}
-
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
RECLAIM_DISTANCE;
}
#else /* CONFIG_NUMA */
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
- return true;
-}
-
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
}
#endif /* CONFIG_NUMA */
-static void reset_alloc_batches(struct zone *preferred_zone)
-{
- struct zone *zone = preferred_zone->zone_pgdat->node_zones;
-
- do {
- mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
- } while (zone++ != preferred_zone);
-}
-
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
{
struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
- bool fair_skipped = false;
- bool apply_fair = (alloc_flags & ALLOC_FAIR);
+ struct pglist_data *last_pgdat_dirty_limit = NULL;
-zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
(alloc_flags & ALLOC_CPUSET) &&
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
- /*
- * Distribute pages in proportion to the individual
- * zone size to ensure fair page aging. The zone a
- * page was allocated in should have no effect on the
- * time the page has in memory before being reclaimed.
- */
- if (apply_fair) {
- if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
- fair_skipped = true;
- continue;
- }
- if (!zone_local(ac->preferred_zoneref->zone, zone)) {
- if (fair_skipped)
- goto reset_fair;
- apply_fair = false;
- }
- }
/*
* When allocating a page cache page for writing, we
* want to get it from a node that is within its dirty
* will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
- if (ac->spread_dirty_pages && !node_dirty_ok(zone->zone_pgdat))
- continue;
+ if (ac->spread_dirty_pages) {
+ if (last_pgdat_dirty_limit == zone->zone_pgdat)
+ continue;
+
+ if (!node_dirty_ok(zone->zone_pgdat)) {
+ last_pgdat_dirty_limit = zone->zone_pgdat;
+ continue;
+ }
+ }
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_fast(zone, order, mark,
}
}
- /*
- * The first pass makes sure allocations are spread fairly within the
- * local node. However, the local node might have free pages left
- * after the fairness batches are exhausted, and remote zones haven't
- * even been considered yet. Try once more without fairness, and
- * include remote zones now, before entering the slowpath and waking
- * kswapd: prefer spilling to a remote zone over swapping locally.
- */
- if (fair_skipped) {
-reset_fair:
- apply_fair = false;
- fair_skipped = false;
- reset_alloc_batches(ac->preferred_zoneref->zone);
- z = ac->preferred_zoneref;
- goto zonelist_scan;
- }
-
return NULL;
}
return page;
}
-
/*
* Maximum number of compaction retries wit a progress before OOM
* killer is consider as the only way to move forward.
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, enum compact_result *compact_result)
+ enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
- int contended_compaction;
if (!order)
return NULL;
current->flags |= PF_MEMALLOC;
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- mode, &contended_compaction);
+ prio);
current->flags &= ~PF_MEMALLOC;
if (*compact_result <= COMPACT_INACTIVE)
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
*/
count_vm_event(COMPACTFAIL);
- /*
- * In all zones where compaction was attempted (and not
- * deferred or skipped), lock contention has been detected.
- * For THP allocation we do not want to disrupt the others
- * so we fallback to base pages instead.
- */
- if (contended_compaction == COMPACT_CONTENDED_LOCK)
- *compact_result = COMPACT_CONTENDED;
-
- /*
- * If compaction was aborted due to need_resched(), we do not
- * want to further increase allocation latency, unless it is
- * khugepaged trying to collapse.
- */
- if (contended_compaction == COMPACT_CONTENDED_SCHED
- && !(current->flags & PF_KTHREAD))
- *compact_result = COMPACT_CONTENDED;
-
cond_resched();
return NULL;
static inline bool
should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
- enum compact_result compact_result, enum migrate_mode *migrate_mode,
+ enum compact_result compact_result,
+ enum compact_priority *compact_priority,
int compaction_retries)
{
int max_retries = MAX_COMPACT_RETRIES;
/*
* compaction considers all the zone as desperately out of memory
* so it doesn't really make much sense to retry except when the
- * failure could be caused by weak migration mode.
+ * failure could be caused by insufficient priority
*/
if (compaction_failed(compact_result)) {
- if (*migrate_mode == MIGRATE_ASYNC) {
- *migrate_mode = MIGRATE_SYNC_LIGHT;
+ if (*compact_priority > MIN_COMPACT_PRIORITY) {
+ (*compact_priority)--;
return true;
}
return false;
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, enum compact_result *compact_result)
+ enum compact_priority prio, enum compact_result *compact_result)
{
*compact_result = COMPACT_SKIPPED;
return NULL;
static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result,
- enum migrate_mode *migrate_mode,
+ enum compact_priority *compact_priority,
int compaction_retries)
{
struct zone *zone;
return NULL;
retry:
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
/*
* If an allocation failed after direct reclaim, it could be because
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
- if (gfp_mask & __GFP_MEMALLOC)
- alloc_flags |= ALLOC_NO_WATERMARKS;
- else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
- alloc_flags |= ALLOC_NO_WATERMARKS;
- else if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
- alloc_flags |= ALLOC_NO_WATERMARKS;
- }
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
- return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
-}
+ if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ return false;
-static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
-{
- return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+ if (gfp_mask & __GFP_MEMALLOC)
+ return true;
+ if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+ return true;
+ if (!in_interrupt() &&
+ ((current->flags & PF_MEMALLOC) ||
+ unlikely(test_thread_flag(TIF_MEMDIE))))
+ return true;
+
+ return false;
}
/*
return false;
/*
- * Keep reclaiming pages while there is a chance this will lead somewhere.
- * If none of the target zones can satisfy our allocation request even
- * if all reclaimable pages are considered then we are screwed and have
- * to go OOM.
+ * Keep reclaiming pages while there is a chance this will lead
+ * somewhere. If none of the target zones can satisfy our allocation
+ * request even if all reclaimable pages are considered then we are
+ * screwed and have to go OOM.
*/
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
- enum migrate_mode migration_mode = MIGRATE_ASYNC;
+ enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
enum compact_result compact_result;
int compaction_retries = 0;
int no_progress_loops = 0;
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry:
+ /*
+ * The fast path uses conservative alloc_flags to succeed only until
+ * kswapd needs to be woken up, and to avoid the cost of setting up
+ * alloc_flags precisely. So we do that now.
+ */
+ alloc_flags = gfp_to_alloc_flags(gfp_mask);
+
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
/*
- * OK, we're below the kswapd watermark and have kicked background
- * reclaim. Now things get more complex, so set up alloc_flags according
- * to how we want to proceed.
+ * The adjusted alloc_flags might result in immediate success, so try
+ * that first
*/
- alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ if (page)
+ goto got_pg;
+
+ /*
+ * For costly allocations, try direct compaction first, as it's likely
+ * that we have enough base pages and don't need to reclaim. Don't try
+ * that for allocations that are allowed to ignore watermarks, as the
+ * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+ */
+ if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
+ !gfp_pfmemalloc_allowed(gfp_mask)) {
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ alloc_flags, ac,
+ INIT_COMPACT_PRIORITY,
+ &compact_result);
+ if (page)
+ goto got_pg;
+
+ /*
+ * Checks for costly allocations with __GFP_NORETRY, which
+ * includes THP page fault allocations
+ */
+ if (gfp_mask & __GFP_NORETRY) {
+ /*
+ * If compaction is deferred for high-order allocations,
+ * it is because sync compaction recently failed. If
+ * this is the case and the caller requested a THP
+ * allocation, we do not want to heavily disrupt the
+ * system, so we fail the allocation instead of entering
+ * direct reclaim.
+ */
+ if (compact_result == COMPACT_DEFERRED)
+ goto nopage;
+
+ /*
+ * Looks like reclaim/compaction is worth trying, but
+ * sync compaction could be very expensive, so keep
+ * using async compaction.
+ */
+ compact_priority = INIT_COMPACT_PRIORITY;
+ }
+ }
+
+retry:
+ /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ wake_all_kswapds(order, ac);
+
+ if (gfp_pfmemalloc_allowed(gfp_mask))
+ alloc_flags = ALLOC_NO_WATERMARKS;
/*
* Reset the zonelist iterators if memory policies can be ignored.
* These allocations are high priority and system rather than user
* orientated.
*/
- if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) {
+ if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
- /* This is the last chance, in general, before the goto nopage. */
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ /* Attempt with potentially adjusted zonelist and alloc_flags */
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
- /* Allocate without watermarks if the context allows */
- if (alloc_flags & ALLOC_NO_WATERMARKS) {
- page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS, ac);
- if (page)
- goto got_pg;
- }
-
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim) {
/*
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
- /*
- * Try direct compaction. The first pass is asynchronous. Subsequent
- * attempts after direct reclaim are synchronous
- */
- page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
- migration_mode,
- &compact_result);
- if (page)
- goto got_pg;
-
- /* Checks for THP-specific high-order allocations */
- if (is_thp_gfp_mask(gfp_mask)) {
- /*
- * If compaction is deferred for high-order allocations, it is
- * because sync compaction recently failed. If this is the case
- * and the caller requested a THP allocation, we do not want
- * to heavily disrupt the system, so we fail the allocation
- * instead of entering direct reclaim.
- */
- if (compact_result == COMPACT_DEFERRED)
- goto nopage;
-
- /*
- * Compaction is contended so rather back off than cause
- * excessive stalls.
- */
- if(compact_result == COMPACT_CONTENDED)
- goto nopage;
- }
-
- if (order && compaction_made_progress(compact_result))
- compaction_retries++;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
if (page)
goto got_pg;
+ /* Try direct compaction and then allocating */
+ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
+ compact_priority, &compact_result);
+ if (page)
+ goto got_pg;
+
+ if (order && compaction_made_progress(compact_result))
+ compaction_retries++;
+
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
- goto noretry;
+ goto nopage;
/*
* Do not retry costly high order allocations unless they are
* __GFP_REPEAT
*/
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
- goto noretry;
+ goto nopage;
/*
* Costly allocations might have made a progress but this doesn't mean
*/
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
- compact_result, &migration_mode,
+ compact_result, &compact_priority,
compaction_retries))
goto retry;
goto retry;
}
-noretry:
- /*
- * High-order allocations do not necessarily loop after direct reclaim
- * and reclaim/compaction depends on compaction being called after
- * reclaim so call directly if necessary.
- * It can become very expensive to allocate transparent hugepages at
- * fault, so use asynchronous memory compaction for THP unless it is
- * khugepaged trying to collapse. All other requests should tolerate
- * at least light sync migration.
- */
- if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
- migration_mode = MIGRATE_ASYNC;
- else
- migration_mode = MIGRATE_SYNC_LIGHT;
- page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
- ac, migration_mode,
- &compact_result);
- if (page)
- goto got_pg;
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
got_pg:
{
struct page *page;
unsigned int cpuset_mems_cookie;
- unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = {
.high_zoneidx = gfp_zone(gfp_mask),
#endif
" writeback_tmp:%lukB"
" unstable:%lukB"
+ " pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
K(node_page_state(pgdat, NR_SHMEM)),
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+ node_page_state(pgdat, NR_PAGES_SCANNED),
!pgdat_reclaimable(pgdat) ? "yes" : "no");
}
" min:%lukB"
" low:%lukB"
" high:%lukB"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " writepending:%lukB"
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
" free_pcp:%lukB"
" local_pcp:%ukB"
" free_cma:%lukB"
- " node_pages_scanned:%lu"
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone->present_pages),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
- zone_page_state(zone, NR_KERNEL_STACK) *
- THREAD_SIZE / 1024,
+ zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
K(this_cpu_read(zone->pageset->pcp.count)),
- K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
- K(node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED)));
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
printk(" %ld", zone->lowmem_reserve[i]);
setup_zone_pageset(zone);
}
-static noinline __init_refok
+static noinline __ref
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
zone_seqlock_init(zone);
zone_pcp_init(zone);
- /* For bootup, initialized properly in watermark setup */
- mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
-
if (!size)
continue;
}
}
-static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
+static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
unsigned long __maybe_unused start = 0;
unsigned long __maybe_unused offset = 0;
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
- __mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-
spin_unlock_irqrestore(&zone->lock, flags);
}