A reaccess is an access to a page, detected via refault or access
bit harvesting, that happens after the page's initial access. Similar
to the working set histogram, the reaccess histogram breaks down
reaccesses into user-defined bins.

Reaccesses are tracked from MGLRU walks, where a page moving from an
older generation to the youngest generation counts as a reaccess.
Refaults of swapped out pages are tracked with the generation number
encoded in the shadow entry (see mm/workingset.c). For memory cgroups
that have reaccess reporting enabled, the timestamps of an additional
4 swapped out generations are recorded so that refaults from those
generations can be binned as well.

Memcg interfaces:
/sys/fs/cgroup/.../memory.workingset.reaccess
        The format is identical to memory.workingset.page_age, but the
        content breaks down reaccesses into pre-defined intervals.
        e.g.
        N0
        1000 anon=6330 file=0
        2000 anon=72 file=0
        4000 anon=0 file=0
        18446744073709551615 anon=0 file=0
        N1
        18446744073709551615 anon=0 file=0

/sys/fs/cgroup/.../memory.workingset.reaccess_intervals
        Defines the per-node intervals for memory.workingset.reaccess.
        e.g.
        echo N0=120000,240000,480000 > memory.workingset.reaccess_intervals

Signed-off-by: Yuanchu Xie <[email protected]>
---
 include/linux/workingset_report.h |  20 +++
 mm/internal.h                     |  28 ++++
 mm/memcontrol.c                   | 112 ++++++++++++++
 mm/vmscan.c                       |   8 +-
 mm/workingset.c                   |   9 +-
 mm/workingset_report.c            | 249 ++++++++++++++++++++++++++++++
 6 files changed, 419 insertions(+), 7 deletions(-)

diff --git a/include/linux/workingset_report.h b/include/linux/workingset_report.h
index 502542c812b3..e908c5678b1e 100644
--- a/include/linux/workingset_report.h
+++ b/include/linux/workingset_report.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 #include <linux/mutex.h>
+#include <linux/rcupdate.h>
 
 struct mem_cgroup;
 struct pglist_data;
@@ -19,6 +20,12 @@ struct cgroup_file;
 #define WORKINGSET_INTERVAL_MAX ((unsigned long)-1)
 #define ANON_AND_FILE 2
 
+/*
+ * MAX_NR_EVICTED_GENS is set to 4 so we can track the same number of
+ * generations as MGLRU has resident.
+ */
+#define MAX_NR_EVICTED_GENS 4
+
 struct wsr_report_bin {
        unsigned long idle_age;
        unsigned long nr_pages[ANON_AND_FILE];
@@ -35,6 +42,18 @@ struct wsr_page_age_histo {
        struct wsr_report_bins bins;
 };
 
+struct wsr_evicted_gen {
+       unsigned long timestamp;        /* lrugen timestamp saved at eviction */
+       int seq;                        /* seq of the evicted generation */
+};
+
+struct wsr_reaccess_histo {
+       struct rcu_head rcu;            /* used by kfree_rcu() on replacement */
+       /* evicted gens start from min_seq[LRU_GEN_ANON] - 1 */
+       struct wsr_evicted_gen gens[MAX_NR_EVICTED_GENS];
+       struct wsr_report_bins bins;    /* reaccessed pages per interval */
+};
+
 struct wsr_state {
        unsigned long report_threshold;
        unsigned long refresh_interval;
@@ -47,6 +66,7 @@ struct wsr_state {
        /* breakdown of workingset by page age */
        struct mutex page_age_lock;
        struct wsr_page_age_histo *page_age;
+       struct wsr_reaccess_histo __rcu *reaccess;
 };
 
 void wsr_init(struct lruvec *lruvec);
diff --git a/mm/internal.h b/mm/internal.h
index 3730c8399ad4..077340b526e8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -205,16 +205,44 @@ void putback_lru_page(struct page *page);
 void folio_putback_lru(struct folio *folio);
 extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
 
+/*
+ * in mm/workingset.c
+ */
+#define WORKINGSET_SHIFT 1
+#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) +  \
+                        WORKINGSET_SHIFT + NODES_SHIFT + \
+                        MEM_CGROUP_ID_SHIFT)
+#define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
+
 #ifdef CONFIG_WORKINGSET_REPORT
 /*
  * in mm/wsr.c
  */
+void report_lru_gen_eviction(struct lruvec *lruvec, int type, int min_seq);
+void lru_gen_report_reaccess(struct lruvec *lruvec,
+                            struct lru_gen_mm_walk *walk);
+void report_reaccess_refault(struct lruvec *lruvec, unsigned long token,
+                            int type, int nr_pages);
 void notify_workingset(struct mem_cgroup *memcg, struct pglist_data *pgdat);
 /* Requires wsr->page_age_lock held */
 void wsr_refresh_scan(struct lruvec *lruvec, unsigned long refresh_interval);
 int workingset_report_intervals_parse(char *src,
                                      struct wsr_report_bins *bins);
 #else
+struct lru_gen_mm_walk;
+static inline void report_lru_gen_eviction(struct lruvec *lruvec, int type,
+                                          int min_seq)
+{
+}
+static inline void lru_gen_report_reaccess(struct lruvec *lruvec,
+                                          struct lru_gen_mm_walk *walk)
+{
+}
+static inline void report_reaccess_refault(struct lruvec *lruvec,
+                                          unsigned long token, int type,
+                                          int nr_pages)
+{
+}
 static inline void notify_workingset(struct mem_cgroup *memcg,
                                     struct pglist_data *pgdat)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75bda5f7994d..2a39a4445bb7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7108,6 +7108,71 @@ static ssize_t memory_ws_page_age_intervals_write(struct kernfs_open_file *of,
        return err;
 }
 
+static int memory_ws_reaccess_intervals_show(struct seq_file *m, void *v)
+{
+       int nid;
+       struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+       for_each_node_state(nid, N_MEMORY) {
+               struct wsr_state *wsr;
+               struct wsr_reaccess_histo *reaccess;
+               int i, nr_bins;
+
+               wsr = &mem_cgroup_lruvec(memcg, NODE_DATA(nid))->wsr;
+               rcu_read_lock();
+               reaccess = rcu_dereference(wsr->reaccess);
+               if (!reaccess)
+                       goto unlock;
+               seq_printf(m, "N%d=", nid);
+               nr_bins = reaccess->bins.nr_bins;
+               for (i = 0; i < nr_bins; ++i) {
+                       struct wsr_report_bin *bin = &reaccess->bins.bins[i];
+
+                       seq_printf(m, "%u", jiffies_to_msecs(bin->idle_age));
+                       if (i + 1 < nr_bins)
+                               seq_putc(m, ',');
+               }
+               seq_putc(m, ' ');
+unlock:
+               rcu_read_unlock();
+       }
+       seq_putc(m, '\n');
+
+       return 0;
+}
+
+static ssize_t memory_ws_reaccess_intervals_write(struct kernfs_open_file *of,
+                                                 char *buf, size_t nbytes,
+                                                 loff_t off)
+{
+       unsigned int nid;
+       int err;
+       struct wsr_state *wsr;
+       struct wsr_reaccess_histo *reaccess = NULL, *old;
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+       reaccess = kzalloc(sizeof(struct wsr_reaccess_histo), GFP_KERNEL);
+       if (!reaccess)
+               return -ENOMEM;
+
+       err = memory_wsr_interval_parse(of, buf, nbytes, &nid, &reaccess->bins);
+       if (err < 0)
+               goto failed;
+
+       if (err == 0) {
+               kfree(reaccess);
+               reaccess = NULL;
+       }
+
+       wsr = &mem_cgroup_lruvec(memcg, NODE_DATA(nid))->wsr;
+       old = xchg(&wsr->reaccess, reaccess);
+       kfree_rcu(old, rcu);
+       return nbytes;
+failed:
+       kfree(reaccess);
+       return err;
+}
+
 static int memory_ws_refresh_interval_show(struct seq_file *m, void *v)
 {
        int nid;
@@ -7242,6 +7307,42 @@ static int memory_ws_page_age_show(struct seq_file *m, void *v)
 
        return 0;
 }
+
+static int memory_ws_reaccess_histogram_show(struct seq_file *m, void *v)
+{
+       int nid;
+       struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+       for_each_node_state(nid, N_MEMORY) {
+               struct wsr_state *wsr =
+                       &mem_cgroup_lruvec(memcg, NODE_DATA(nid))->wsr;
+               struct wsr_reaccess_histo *reaccess;
+               struct wsr_report_bin *bin;
+
+               rcu_read_lock();
+               reaccess = rcu_dereference(wsr->reaccess);
+
+               if (!reaccess)
+                       goto unlock;
+               /* NOTE(review): in RCU read section; confirm this cannot sleep */
+               wsr_refresh_report(wsr, memcg, NODE_DATA(nid));
+
+               seq_printf(m, "N%d\n", nid);
+               for (bin = reaccess->bins.bins;
+                    bin->idle_age != WORKINGSET_INTERVAL_MAX; bin++)
+                       seq_printf(m, "%u anon=%lu file=%lu\n",
+                                  jiffies_to_msecs(bin->idle_age),
+                                  bin->nr_pages[0], bin->nr_pages[1]);
+
+               seq_printf(m, "%lu anon=%lu file=%lu\n", WORKINGSET_INTERVAL_MAX,
+                          bin->nr_pages[0], bin->nr_pages[1]);
+
+unlock:
+               rcu_read_unlock();
+       }
+
+       return 0;
+}
 #endif
 
 static struct cftype memory_files[] = {
@@ -7337,6 +7438,17 @@ static struct cftype memory_files[] = {
                 .file_offset = offsetof(struct mem_cgroup, workingset_page_age_file),
                .seq_show = memory_ws_page_age_show,
        },
+       {
+               .name = "workingset.reaccess_intervals",
+               .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+               .seq_show = memory_ws_reaccess_intervals_show,
+               .write = memory_ws_reaccess_intervals_write,
+       },
+       {
+               .name = "workingset.reaccess",
+               .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+               .seq_show = memory_ws_reaccess_histogram_show,
+       },
 #endif
        {} /* terminate */
 };
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c6acd5265b3f..4d9245e2c0d1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3637,6 +3637,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
                mem_cgroup_unlock_pages();
 
                if (walk->batched) {
+                       lru_gen_report_reaccess(lruvec, walk);
                        spin_lock_irq(&lruvec->lru_lock);
                        reset_batch_size(lruvec, walk);
                        spin_unlock_irq(&lruvec->lru_lock);
@@ -3709,6 +3710,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
        }
 done:
        reset_ctrl_pos(lruvec, type, true);
+       report_lru_gen_eviction(lruvec, type, lrugen->min_seq[type] + 1);
        WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
 
        return true;
@@ -3750,6 +3752,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
                        continue;
 
                reset_ctrl_pos(lruvec, type, true);
+               report_lru_gen_eviction(lruvec, type, min_seq[type]);
                WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
                success = true;
        }
@@ -4565,11 +4568,14 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
                sc->nr_scanned -= folio_nr_pages(folio);
        }
 
+       walk = current->reclaim_state->mm_walk;
+       if (walk && walk->batched)
+               lru_gen_report_reaccess(lruvec, walk);
+
        spin_lock_irq(&lruvec->lru_lock);
 
        move_folios_to_lru(lruvec, &list);
 
-       walk = current->reclaim_state->mm_walk;
        if (walk && walk->batched)
                reset_batch_size(lruvec, walk);
 
diff --git a/mm/workingset.c b/mm/workingset.c
index 226012974328..057fbedd91ea 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -17,6 +17,8 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 
+#include "internal.h"
+
 /*
  *             Double CLOCK lists
  *
@@ -179,12 +181,6 @@
  * refault distance will immediately activate the refaulting page.
  */
 
-#define WORKINGSET_SHIFT 1
-#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) +  \
-                        WORKINGSET_SHIFT + NODES_SHIFT + \
-                        MEM_CGROUP_ID_SHIFT)
-#define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
-
 /*
  * Eviction timestamps need to be able to cover the full range of
  * actionable refaults. However, bits are tight in the xarray
@@ -294,6 +290,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow)
                goto unlock;
 
        mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
+       report_reaccess_refault(lruvec, token, type, delta);
 
        if (!recent)
                goto unlock;
diff --git a/mm/workingset_report.c b/mm/workingset_report.c
index b00ffbfebcab..504d840bbe6a 100644
--- a/mm/workingset_report.c
+++ b/mm/workingset_report.c
@@ -34,6 +34,7 @@ void wsr_destroy(struct lruvec *lruvec)
 
        mutex_destroy(&wsr->page_age_lock);
        kfree(wsr->page_age);
+       kfree_rcu(wsr->reaccess, rcu);
        memset(wsr, 0, sizeof(*wsr));
 }
 
@@ -259,6 +260,254 @@ bool wsr_refresh_report(struct wsr_state *wsr, struct 
mem_cgroup *root,
 }
 EXPORT_SYMBOL_GPL(wsr_refresh_report);
 
+static void lru_gen_collect_reaccess_refault(struct wsr_report_bins *bins,
+                                            unsigned long timestamp, int type,
+                                            int nr_pages)
+{
+       unsigned long curr_timestamp = jiffies;
+       struct wsr_report_bin *bin = &bins->bins[0];
+
+       while (bin->idle_age != WORKINGSET_INTERVAL_MAX &&
+              time_before(timestamp + bin->idle_age, curr_timestamp))
+               bin++;
+
+       bin->nr_pages[type] += nr_pages;
+}
+
+static void collect_reaccess_type(struct lru_gen_mm_walk *walk,
+                                 const struct lru_gen_folio *lrugen,
+                                 struct wsr_report_bin *bin,
+                                 unsigned long max_seq, unsigned long min_seq,
+                                 unsigned long curr_timestamp, int type)
+{
+       unsigned long seq;
+
+       /* Skip max_seq because a reaccess moves a page from another seq
+        * to max_seq. We use the negative change in page count from
+        * other seqs to track the number of reaccesses.
+        */
+       for (seq = max_seq - 1; seq + 1 > min_seq; seq--) {
+               int younger_gen, gen, zone;
+               unsigned long gen_end, gen_start;
+               long delta = 0;
+
+               gen = lru_gen_from_seq(seq);
+
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       long nr_pages = walk->nr_pages[gen][type][zone];
+
+                       if (nr_pages < 0)
+                               delta += -nr_pages;
+               }
+
+               gen_end = READ_ONCE(lrugen->timestamps[gen]);
+               younger_gen = lru_gen_from_seq(seq + 1);
+               gen_start = READ_ONCE(lrugen->timestamps[younger_gen]);
+
+               /* ensure gen_start is within idle_age of bin */
+               while (bin->idle_age != WORKINGSET_INTERVAL_MAX &&
+                      time_before(gen_start + bin->idle_age, curr_timestamp))
+                       bin++;
+               /* split delta across the bins this gen straddles, by time share */
+               while (bin->idle_age != WORKINGSET_INTERVAL_MAX &&
+                      time_before(gen_end + bin->idle_age, curr_timestamp)) {
+                       unsigned long proportion = (long)gen_start -
+                                                  (long)curr_timestamp +
+                                                  (long)bin->idle_age;
+                       unsigned long gen_len = (long)gen_start - (long)gen_end;
+
+                       if (!gen_len)
+                               break;
+                       if (proportion) {
+                               /* multiply first; delta / gen_len alone truncates */
+                               unsigned long split_bin = delta * proportion / gen_len;
+                               bin->nr_pages[type] += split_bin;
+                               delta -= split_bin;
+                       }
+                       gen_start = curr_timestamp - bin->idle_age;
+                       bin++;
+               }
+               bin->nr_pages[type] += delta;
+       }
+}
+
+/*
+ * Reaccesses are propagated up the memcg hierarchy during scanning/refault.
+ * Collect the reaccess information from a multi-gen LRU walk.
+ */
+static void lru_gen_collect_reaccess(struct wsr_report_bins *bins,
+                                    struct lru_gen_folio *lrugen,
+                                    struct lru_gen_mm_walk *walk)
+{
+       int type;
+       unsigned long curr_timestamp = jiffies;
+       unsigned long max_seq = READ_ONCE(walk->max_seq);
+       unsigned long min_seq[ANON_AND_FILE] = {
+               READ_ONCE(lrugen->min_seq[LRU_GEN_ANON]),
+               READ_ONCE(lrugen->min_seq[LRU_GEN_FILE]),
+       };
+
+       for (type = 0; type < ANON_AND_FILE; type++) {
+               struct wsr_report_bin *bin = &bins->bins[0];
+
+               collect_reaccess_type(walk, lrugen, bin, max_seq,
+                                     min_seq[type], curr_timestamp, type);
+       }
+}
+
+void lru_gen_report_reaccess(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+{
+       struct lru_gen_folio *lrugen = &lruvec->lrugen;
+       struct mem_cgroup *memcg;
+
+       for (memcg = lruvec_memcg(lruvec); memcg;
+            memcg = parent_mem_cgroup(memcg)) {
+               struct lruvec *memcg_lruvec =
+                       mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
+               struct wsr_state *wsr = &memcg_lruvec->wsr;
+               struct wsr_reaccess_histo *reaccess;
+
+               rcu_read_lock();
+               reaccess = rcu_dereference(wsr->reaccess);
+               if (!reaccess) {
+                       rcu_read_unlock();
+                       continue;
+               }
+               lru_gen_collect_reaccess(&reaccess->bins, lrugen, walk);
+               rcu_read_unlock();
+       }
+}
+
+static inline int evicted_gen_from_seq(unsigned long seq)
+{
+       return seq % MAX_NR_EVICTED_GENS;
+}
+
+void report_lru_gen_eviction(struct lruvec *lruvec, int type, int min_seq)
+{
+       int seq;
+       struct wsr_reaccess_histo *reaccess = NULL;
+       struct lru_gen_folio *lrugen = &lruvec->lrugen;
+       struct wsr_state *wsr = &lruvec->wsr;
+
+       /*
+        * Since file can go ahead of anon, min_seq[file] >= min_seq[anon]
+        * only record evictions when anon moves forward.
+        */
+       if (type != LRU_GEN_ANON)
+               return;
+
+       /*
+        * lru_lock is held during eviction, so reaccess accounting
+        * can be serialized.
+        */
+       lockdep_assert_held(&lruvec->lru_lock);
+
+       rcu_read_lock();
+       reaccess = rcu_dereference(wsr->reaccess);
+       if (!reaccess)
+               goto unlock;
+
+       for (seq = READ_ONCE(lrugen->min_seq[LRU_GEN_ANON]); seq < min_seq;
+            ++seq) {
+               int evicted_gen = evicted_gen_from_seq(seq);
+               int gen = lru_gen_from_seq(seq);
+
+               WRITE_ONCE(reaccess->gens[evicted_gen].seq, seq);
+               WRITE_ONCE(reaccess->gens[evicted_gen].timestamp,
+                          READ_ONCE(lrugen->timestamps[gen]));
+       }
+
+unlock:
+       rcu_read_unlock();
+}
+
+/*
+ * Find the timestamp of the gen encoded in @token; returns 0 or -EEXIST.
+ * It may be incorrect if the token collides with a recently evicted generation.
+ */
+static int timestamp_from_workingset_token(struct lruvec *lruvec,
+                                          unsigned long token,
+                                          unsigned long *timestamp)
+{
+       int type, err = -EEXIST;
+       unsigned long seq, evicted_min_seq;
+       struct wsr_reaccess_histo *reaccess = NULL;
+       struct lru_gen_folio *lrugen = &lruvec->lrugen;
+       struct wsr_state *wsr = &lruvec->wsr;
+       unsigned long min_seq[ANON_AND_FILE] = {
+               READ_ONCE(lrugen->min_seq[LRU_GEN_ANON]),
+               READ_ONCE(lrugen->min_seq[LRU_GEN_FILE])
+       };
+
+       token >>= LRU_REFS_WIDTH;
+
+       /* recent eviction */
+       for (type = 0; type < ANON_AND_FILE; ++type) {
+               if (token ==
+                   (min_seq[type] & (EVICTION_MASK >> LRU_REFS_WIDTH))) {
+                       int gen = lru_gen_from_seq(min_seq[type]);
+
+                       *timestamp = READ_ONCE(lrugen->timestamps[gen]);
+                       return 0;
+               }
+       }
+
+       rcu_read_lock();
+       reaccess = rcu_dereference(wsr->reaccess);
+       if (!reaccess)
+               goto unlock;
+
+       /* look up in evicted gen buffer */
+       evicted_min_seq = min_seq[LRU_GEN_ANON] - MAX_NR_EVICTED_GENS;
+       if (min_seq[LRU_GEN_ANON] < MAX_NR_EVICTED_GENS)
+               evicted_min_seq = 0;
+       for (seq = min_seq[LRU_GEN_ANON]; seq > evicted_min_seq; --seq) {
+               int gen = evicted_gen_from_seq(seq - 1);
+
+               if (token == (reaccess->gens[gen].seq &
+                             (EVICTION_MASK >> LRU_REFS_WIDTH))) {
+                       *timestamp = reaccess->gens[gen].timestamp;
+                       err = 0;        /* hit: without this the caller drops it */
+                       goto unlock;
+               }
+       }
+
+unlock:
+       rcu_read_unlock();
+       return err;
+}
+
+void report_reaccess_refault(struct lruvec *lruvec, unsigned long token,
+                            int type, int nr_pages)
+{
+       unsigned long timestamp;
+       int err;
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+       err = timestamp_from_workingset_token(lruvec, token, &timestamp);
+       if (err)
+               return;
+
+       for (memcg = lruvec_memcg(lruvec); memcg;
+            memcg = parent_mem_cgroup(memcg)) {
+               struct lruvec *memcg_lruvec =
+                       mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
+               struct wsr_state *wsr = &memcg_lruvec->wsr;
+               struct wsr_reaccess_histo *reaccess = NULL;
+
+               rcu_read_lock();
+               reaccess = rcu_dereference(wsr->reaccess);
+               if (!reaccess) {
+                       rcu_read_unlock();
+                       continue;
+               }
+               lru_gen_collect_reaccess_refault(&reaccess->bins, timestamp,
+                                                type, nr_pages);
+               rcu_read_unlock();
+       }
+}
+
 static struct pglist_data *kobj_to_pgdat(struct kobject *kobj)
 {
        int nid = IS_ENABLED(CONFIG_NUMA) ? kobj_to_dev(kobj)->id :
-- 
2.44.0.396.g6e790dbe36-goog


Reply via email to