On Tue, 20 Jan 2026 10:43:47 +0800 Jiayuan Chen <[email protected]> wrote:

> == Problem ==
> 
> We observed an issue in production on a multi-NUMA system where kswapd
> runs endlessly, causing sustained heavy IO READ pressure across the
> entire system.
> 
> The root cause is that direct reclaim triggered by cgroup memory.high
> keeps resetting kswapd_failures to 0, even when the node cannot be
> balanced. This prevents kswapd from ever stopping after reaching
> MAX_RECLAIM_RETRIES.
> 

Updated, thanks.
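
For anyone skimming the thread, the crux of the fix: the failure-counter
reset is now gated on pgdat_balanced().  A rough before/after sketch
(paraphrased from the series, not quoted from the tree):

	/* before: any reclaim progress revived kswapd */
	if (reclaimable)
		atomic_set(&pgdat->kswapd_failures, 0);

	/* v4: only a balanced node clears the counter */
	if (reclaimable)
		kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx);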

> v3 -> v4: 
> https://lore.kernel.org/linux-mm/[email protected]/
>   - Add Acked-by tags
>   - Apply modifications suggested by Johannes Weiner

Here's how v4 altered mm.git:


 include/linux/mmzone.h        |   26 ++++++++-----
 include/trace/events/vmscan.h |   24 ++++++------
 mm/memory-tiers.c             |    2 -
 mm/page_alloc.c               |    4 +-
 mm/show_mem.c                 |    3 -
 mm/vmscan.c                   |   60 +++++++++++++++++---------------
 mm/vmstat.c                   |    2 -
 7 files changed, 64 insertions(+), 57 deletions(-)
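
At a glance, the renames (my summary, not part of the patch):

	pgdat_reset_kswapd_failures()            -> kswapd_clear_hopeless()
	pgdat_try_reset_kswapd_failures()        -> kswapd_try_clear_hopeless()
	open-coded ">= MAX_RECLAIM_RETRIES" test -> kswapd_test_hopeless()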

--- a/include/linux/mmzone.h~b
+++ a/include/linux/mmzone.h
@@ -1531,26 +1531,30 @@ static inline unsigned long pgdat_end_pf
        return pgdat->node_start_pfn + pgdat->node_spanned_pages;
 }
 
-enum reset_kswapd_failures_reason {
-       RESET_KSWAPD_FAILURES_OTHER = 0,
-       RESET_KSWAPD_FAILURES_KSWAPD,
-       RESET_KSWAPD_FAILURES_DIRECT,
-       RESET_KSWAPD_FAILURES_PCP,
-};
-
-void pgdat_reset_kswapd_failures(pg_data_t *pgdat, enum reset_kswapd_failures_reason reason);
-
 #include <linux/memory_hotplug.h>
 
 void build_all_zonelists(pg_data_t *pgdat);
-void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
-                  enum zone_type highest_zoneidx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                         int highest_zoneidx, unsigned int alloc_flags,
                         long free_pages);
 bool zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx,
                unsigned int alloc_flags);
+
+enum kswapd_clear_hopeless_reason {
+       KSWAPD_CLEAR_HOPELESS_OTHER = 0,
+       KSWAPD_CLEAR_HOPELESS_KSWAPD,
+       KSWAPD_CLEAR_HOPELESS_DIRECT,
+       KSWAPD_CLEAR_HOPELESS_PCP,
+};
+
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
+                  enum zone_type highest_zoneidx);
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+                              unsigned int order, int highest_zoneidx);
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason);
+bool kswapd_test_hopeless(pg_data_t *pgdat);
+
 /*
  * Memory initialization context, use to differentiate memory added by
  * the platform statically or via memory hotplug interface.
--- a/include/trace/events/vmscan.h~b
+++ a/include/trace/events/vmscan.h
@@ -40,16 +40,16 @@
                {_VMSCAN_THROTTLE_CONGESTED,    "VMSCAN_THROTTLE_CONGESTED"}    \
                ) : "VMSCAN_THROTTLE_NONE"
 
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_OTHER);
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_KSWAPD);
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_DIRECT);
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_PCP);
-
-#define reset_kswapd_src                               \
-       {RESET_KSWAPD_FAILURES_KSWAPD,  "KSWAPD"},      \
-       {RESET_KSWAPD_FAILURES_DIRECT,  "DIRECT"},      \
-       {RESET_KSWAPD_FAILURES_PCP,     "PCP"},         \
-       {RESET_KSWAPD_FAILURES_OTHER,   "OTHER"}
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_OTHER);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_KSWAPD);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_DIRECT);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_PCP);
+
+#define kswapd_clear_hopeless_reason_ops               \
+       {KSWAPD_CLEAR_HOPELESS_KSWAPD,  "KSWAPD"},      \
+       {KSWAPD_CLEAR_HOPELESS_DIRECT,  "DIRECT"},      \
+       {KSWAPD_CLEAR_HOPELESS_PCP,     "PCP"},         \
+       {KSWAPD_CLEAR_HOPELESS_OTHER,   "OTHER"}
 
 #define trace_reclaim_flags(file) ( \
        (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
@@ -566,7 +566,7 @@ TRACE_EVENT(mm_vmscan_kswapd_reclaim_fai
                __entry->nid, __entry->failures)
 );
 
-TRACE_EVENT(mm_vmscan_reset_kswapd_failures,
+TRACE_EVENT(mm_vmscan_kswapd_clear_hopeless,
 
        TP_PROTO(int nid, int reason),
 
@@ -584,7 +584,7 @@ TRACE_EVENT(mm_vmscan_reset_kswapd_failu
 
        TP_printk("nid=%d reason=%s",
                __entry->nid,
-               __print_symbolic(__entry->reason, reset_kswapd_src))
+               __print_symbolic(__entry->reason, kswapd_clear_hopeless_reason_ops))
 );
 #endif /* _TRACE_VMSCAN_H */
 
--- a/mm/memory-tiers.c~b
+++ a/mm/memory-tiers.c
@@ -955,7 +955,7 @@ static ssize_t demotion_enabled_store(st
                struct pglist_data *pgdat;
 
                for_each_online_pgdat(pgdat)
-                       pgdat_reset_kswapd_failures(pgdat, RESET_KSWAPD_FAILURES_OTHER);
+                       kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER);
        }
 
        return count;
--- a/mm/page_alloc.c~b
+++ a/mm/page_alloc.c
@@ -2945,9 +2945,9 @@ static bool free_frozen_page_commit(stru
                 * 'hopeless node' to stay in that state for a while.  Let
                 * kswapd work again by resetting kswapd_failures.
                 */
-               if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
+               if (kswapd_test_hopeless(pgdat) &&
                    next_memory_node(pgdat->node_id) < MAX_NUMNODES)
-                       pgdat_reset_kswapd_failures(pgdat, RESET_KSWAPD_FAILURES_PCP);
+                       kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP);
        }
        return ret;
 }
--- a/mm/show_mem.c~b
+++ a/mm/show_mem.c
@@ -278,8 +278,7 @@ static void show_free_areas(unsigned int
 #endif
                        K(node_page_state(pgdat, NR_PAGETABLE)),
                        K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
-                       str_yes_no(atomic_read(&pgdat->kswapd_failures) >=
-                                  MAX_RECLAIM_RETRIES),
+                       str_yes_no(kswapd_test_hopeless(pgdat)),
                        K(node_page_state(pgdat, NR_BALLOON_PAGES)));
        }
 
--- a/mm/vmscan.c~b
+++ a/mm/vmscan.c
@@ -506,7 +506,7 @@ static bool skip_throttle_noprogress(pg_
         * If kswapd is disabled, reschedule if necessary but do not
         * throttle as the system is likely near OOM.
         */
-       if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+       if (kswapd_test_hopeless(pgdat))
                return true;
 
        /*
@@ -2647,28 +2647,6 @@ static bool can_age_anon_pages(struct lr
                          lruvec_memcg(lruvec));
 }
 
-void pgdat_reset_kswapd_failures(pg_data_t *pgdat, enum reset_kswapd_failures_reason reason)
-{
-       /* Only trace actual resets, not redundant zero-to-zero */
-       if (atomic_xchg(&pgdat->kswapd_failures, 0))
-               trace_mm_vmscan_reset_kswapd_failures(pgdat->node_id, reason);
-}
-
-/*
- * Reset kswapd_failures only when the node is balanced. Without this
- * check, successful direct reclaim (e.g., from cgroup memory.high
- * throttling) can keep resetting kswapd_failures even when the node
- * cannot be balanced, causing kswapd to run endlessly.
- */
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx);
-static inline void pgdat_try_reset_kswapd_failures(struct pglist_data *pgdat,
-                                                  struct scan_control *sc)
-{
-       if (pgdat_balanced(pgdat, sc->order, sc->reclaim_idx))
-               pgdat_reset_kswapd_failures(pgdat, current_is_kswapd() ?
-                       RESET_KSWAPD_FAILURES_KSWAPD : RESET_KSWAPD_FAILURES_DIRECT);
-}
-
 #ifdef CONFIG_LRU_GEN
 
 #ifdef CONFIG_LRU_GEN_ENABLED
@@ -5086,7 +5064,7 @@ static void lru_gen_shrink_node(struct p
        blk_finish_plug(&plug);
 done:
        if (sc->nr_reclaimed > reclaimed)
-               pgdat_try_reset_kswapd_failures(pgdat, sc);
+               kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx);
 }
 
 /******************************************************************************
@@ -6153,7 +6131,7 @@ again:
         * successful direct reclaim run will revive a dormant kswapd.
         */
        if (reclaimable)
-               pgdat_try_reset_kswapd_failures(pgdat, sc);
+               kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx);
        else if (sc->cache_trim_mode)
                sc->cache_trim_mode_failed = 1;
 }
@@ -6458,7 +6436,7 @@ static bool allow_direct_reclaim(pg_data
        int i;
        bool wmark_ok;
 
-       if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+       if (kswapd_test_hopeless(pgdat))
                return true;
 
        for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
@@ -6867,7 +6845,7 @@ static bool prepare_kswapd_sleep(pg_data
                wake_up_all(&pgdat->pfmemalloc_wait);
 
        /* Hopeless node, leave it to direct reclaim */
-       if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+       if (kswapd_test_hopeless(pgdat))
                return true;
 
        if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
@@ -7395,7 +7373,7 @@ void wakeup_kswapd(struct zone *zone, gf
                return;
 
        /* Hopeless node, leave it to direct reclaim if possible */
-       if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES ||
+       if (kswapd_test_hopeless(pgdat) ||
            (pgdat_balanced(pgdat, order, highest_zoneidx) &&
             !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
                /*
@@ -7415,6 +7393,32 @@ void wakeup_kswapd(struct zone *zone, gf
        wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason)
+{
+       /* Only trace actual resets, not redundant zero-to-zero */
+       if (atomic_xchg(&pgdat->kswapd_failures, 0))
+               trace_mm_vmscan_kswapd_clear_hopeless(pgdat->node_id, reason);
+}
+
+/*
+ * Reset kswapd_failures only when the node is balanced. Without this
+ * check, successful direct reclaim (e.g., from cgroup memory.high
+ * throttling) can keep resetting kswapd_failures even when the node
+ * cannot be balanced, causing kswapd to run endlessly.
+ */
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+                              unsigned int order, int highest_zoneidx)
+{
+       if (pgdat_balanced(pgdat, order, highest_zoneidx))
+               kswapd_clear_hopeless(pgdat, current_is_kswapd() ?
+                       KSWAPD_CLEAR_HOPELESS_KSWAPD : KSWAPD_CLEAR_HOPELESS_DIRECT);
+}
+
+bool kswapd_test_hopeless(pg_data_t *pgdat)
+{
+       return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES;
+}
+
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
--- a/mm/vmstat.c~b
+++ a/mm/vmstat.c
@@ -1840,7 +1840,7 @@ static void zoneinfo_show_print(struct s
                   "\n  start_pfn:           %lu"
                   "\n  reserved_highatomic: %lu"
                   "\n  free_highatomic:     %lu",
-                  atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
+                  kswapd_test_hopeless(pgdat),
                   zone->zone_start_pfn,
                   zone->nr_reserved_highatomic,
                   zone->nr_free_highatomic);
_
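
As a sanity check on the new semantics, a toy userspace model (not
kernel code; the struct and helpers below are stand-ins mirroring the
diff) shows why an unbalanced node now stays "hopeless" even when
direct reclaim makes progress:

	#include <stdio.h>
	#include <stdbool.h>

	#define MAX_RECLAIM_RETRIES 16

	/* toy stand-in for pg_data_t: just the bits that matter here */
	struct toy_node {
		int kswapd_failures;
		bool balanced;
	};

	static bool kswapd_test_hopeless(struct toy_node *n)
	{
		return n->kswapd_failures >= MAX_RECLAIM_RETRIES;
	}

	/* v4 semantics: only a balanced node clears the counter */
	static void kswapd_try_clear_hopeless(struct toy_node *n)
	{
		if (n->balanced)
			n->kswapd_failures = 0;
	}

	int main(void)
	{
		struct toy_node n = {
			.kswapd_failures = MAX_RECLAIM_RETRIES,
			.balanced = false,	/* node cannot be balanced */
		};

		/* memory.high direct reclaim succeeds; node still unbalanced */
		kswapd_try_clear_hopeless(&n);

		/* stays hopeless, so kswapd is not revived into a busy loop */
		printf("hopeless: %s\n",
		       kswapd_test_hopeless(&n) ? "yes" : "no");
		return 0;
	}

With the old behaviour (unconditional reset on any reclaim progress),
the same sequence would print "hopeless: no" and kswapd would be woken
again and again.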

