mm: vmscan: restore high-cpu watermark safety in kswapd
author Johannes Weiner <hannes@cmpxchg.org>
Wed, 16 Apr 2025 13:45:39 +0000 (09:45 -0400)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 18 Apr 2025 03:10:09 +0000 (20:10 -0700)
Vlastimil points out that commit a211c6550efc ("mm: page_alloc:
defrag_mode kswapd/kcompactd watermarks") switched kswapd from
zone_watermark_ok_safe() to the standard, percpu-cached version of reading
free pages, thus dropping the watermark safety precautions for systems
with high CPU counts (e.g.  >212 CPUs on 64G of memory).  Restore them.

Since zone_watermark_ok_safe() is no longer the right interface, and this
was the last caller of the function anyway, open-code the
zone_page_state_snapshot() conditional and delete the function.
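
For illustration, the open-coded check follows the pattern below.  This
is a minimal sketch only: watermark_ok_drift_safe() is a hypothetical
helper name used for exposition; the patch inlines this logic directly
in pgdat_balanced().

	/*
	 * Trust the cheap per-cpu cached counter unless it has dropped
	 * below the zone's percpu_drift_mark; past that point the
	 * cumulative per-cpu error could mask a watermark breach, so
	 * take an exact snapshot that folds in pending vmstat deltas.
	 */
	static bool watermark_ok_drift_safe(struct zone *z, unsigned int order,
					    unsigned long mark, int highest_zoneidx,
					    enum zone_stat_item item)
	{
		long free_pages = zone_page_state(z, item);

		if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
			free_pages = zone_page_state_snapshot(z, item);

		return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
					   free_pages);
	}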

Link: https://lkml.kernel.org/r/20250416135142.778933-2-hannes@cmpxchg.org
Fixes: a211c6550efc ("mm: page_alloc: defrag_mode kswapd/kcompactd watermarks")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Brendan Jackman <jackmanb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/mmzone.h
mm/page_alloc.c
mm/vmscan.c

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4c95fcc9e9df0cafafa3970ff6231b3f20a82bb9..6ccec1bf2896ff74fc75d484e51c64072da0f3c6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1502,8 +1502,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 bool zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx,
                unsigned int alloc_flags);
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
-               unsigned long mark, int highest_zoneidx);
 /*
  * Memory initialization context, use to differentiate memory added by
  * the platform statically or via memory hotplug interface.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e506e365d6f18b865915e5c809eae100374580ad..5669baf2a6fea75c17b2be426443a6cf29051f52 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3470,18 +3470,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
        return false;
 }
 
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
-                       unsigned long mark, int highest_zoneidx)
-{
-       long free_pages = zone_page_state(z, NR_FREE_PAGES);
-
-       if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
-               free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
-       return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
-                                                               free_pages);
-}
-
 #ifdef CONFIG_NUMA
 int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b620d74b0f66e33ce821260366ec1afde2abcaac..cc422ad830d6357bcaffab2e349690c1367cb158 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6736,6 +6736,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
         * meet watermarks.
         */
        for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+               enum zone_stat_item item;
                unsigned long free_pages;
 
                if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
@@ -6748,9 +6749,25 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
                 * blocks to avoid polluting allocator fallbacks.
                 */
                if (defrag_mode)
-                       free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
+                       item = NR_FREE_PAGES_BLOCKS;
                else
-                       free_pages = zone_page_state(zone, NR_FREE_PAGES);
+                       item = NR_FREE_PAGES;
+
+               /*
+                * When there is a high number of CPUs in the system,
+                * the cumulative error from the vmstat per-cpu cache
+                * can blur the line between the watermarks. In that
+                * case, be safe and get an accurate snapshot.
+                *
+                * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
+                * pageblock_nr_pages, while the vmstat pcp threshold
+                * is limited to 125. On many configurations that
+                * counter won't actually be per-cpu cached. But keep
+                * things simple for now; revisit when somebody cares.
+                */
+               free_pages = zone_page_state(zone, item);
+               if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
+                       free_pages = zone_page_state_snapshot(zone, item);
 
                if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
                                        0, free_pages))