Hi David,

thanks for pursuing this! A couple of comments below.

On Mon, Jul 22, 2024 at 07:55:53PM -0400, David Finkel wrote:
> @@ -1322,11 +1322,16 @@ PAGE_SIZE multiple when read back.
>       reclaim induced by memory.reclaim.
>  
>    memory.peak
> -     A read-only single value file which exists on non-root
> -     cgroups.
> +     A read-write single value file which exists on non-root cgroups.
> +
> +     The max memory usage recorded for the cgroup and its descendants since
> +     either the creation of the cgroup or the most recent reset for that FD.
>  
> -     The max memory usage recorded for the cgroup and its
> -     descendants since the creation of the cgroup.
> +     A write of the string "reset" to this file resets it to the
> +     current memory usage for subsequent reads through the same
> +     file descriptor.
> +     Attempts to write any other non-empty string will return EINVAL
> +     (modulo leading and trailing whitespace).

Why not allow any write to reset? This makes it harder to use, and I'm
not sure accidental writes are a likely mistake to make.

> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 2150ca60394b..7001ed74e339 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -12,6 +12,7 @@
>  #include <linux/sched.h>
>  #include <linux/cpumask.h>
>  #include <linux/nodemask.h>
> +#include <linux/list.h>
>  #include <linux/rculist.h>
>  #include <linux/cgroupstats.h>
>  #include <linux/fs.h>
> @@ -855,4 +856,11 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
>  
>  struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);
>  
> +struct memcg_peak_mem_ctx {
> +     long                            local_watermark;
> +     struct list_head                peers;
> +};

Since this is generic cgroup code, and can be conceivably used by
other controllers, let's keep the naming generic as well. How about:

struct cgroup_of_peak {
        long                    value;
        struct list_head        list;
};

cgroup-defs.h would be a better place for it.

> +struct memcg_peak_mem_ctx *memcg_extract_peak_mem_ctx(struct 
> kernfs_open_file *of);

of_peak()

> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 030d34e9d117..cbc390234605 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -198,6 +198,11 @@ struct mem_cgroup {
>       struct page_counter kmem;               /* v1 only */
>       struct page_counter tcpmem;             /* v1 only */
>  
> +     /* lists of memcg peak watching contexts on swap and memory */
> +     struct list_head peak_memory_local_watermark_watchers;
> +     struct list_head peak_swap_local_watermark_watchers;
> +     spinlock_t swap_memory_peak_watchers_lock;

These names are too long. How about:

        /* Registered local usage peak watchers */
        struct list_head        memory_peaks;
        struct list_head        swap_peaks;
        spinlock_t              peaks_lock;

> diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
> index 8cd858d912c4..06bb84218960 100644
> --- a/include/linux/page_counter.h
> +++ b/include/linux/page_counter.h
> @@ -26,6 +26,7 @@ struct page_counter {
>       atomic_long_t children_low_usage;
>  
>       unsigned long watermark;
> +     unsigned long local_watermark; /* track min of fd-local resets */
>       unsigned long failcnt;
>  
>       /* Keep all the read most fields in a separete cacheline. */
> @@ -78,7 +79,15 @@ int page_counter_memparse(const char *buf, const char *max,
>  
>  static inline void page_counter_reset_watermark(struct page_counter *counter)
>  {
> -     counter->watermark = page_counter_read(counter);
> +     unsigned long cur = page_counter_read(counter);

cur -> usage

> @@ -6907,12 +6912,109 @@ static u64 memory_current_read(struct 
> cgroup_subsys_state *css,
>       return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
>  }
>  
> -static u64 memory_peak_read(struct cgroup_subsys_state *css,
> -                         struct cftype *cft)
> +inline int swap_memory_peak_show(
> +     struct seq_file *sf, void *v, bool swap_cg)
>  {

Leave inlining to the compiler. Just static int.

The name can be simply peak_show().

Customary coding style is to line wrap at the last parameter that
fits. Don't wrap if the line fits within 80 cols.

static int peak_show(struct seq_file *sf, void *v, ...,
                     ...)
{
        ...
}

> +     struct cgroup_subsys_state *css = seq_css(sf);
>       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> +     struct page_counter *pc;
> +     struct kernfs_open_file *of = sf->private;
> +     struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
> +     s64 fd_peak = ctx->local_watermark;
>  
> -     return (u64)memcg->memory.watermark * PAGE_SIZE;
> +     if (swap_cg)
> +             pc = &memcg->swap;
> +     else
> +             pc = &memcg->memory;
> +
> +     if (fd_peak == -1) {
> +             seq_printf(sf, "%llu\n", (u64)pc->watermark * PAGE_SIZE);
> +             return 0;
> +     }
> +
> +     s64 pc_peak = pc->local_watermark;
> +     s64 wm = fd_peak > pc_peak ? fd_peak : pc_peak;
> +
> +     seq_printf(sf, "%lld\n", wm * PAGE_SIZE);
> +     return 0;
> +}

As per Roman's feedback, don't mix decls and code.

You can simplify it by extracting css and memcg in the callers, then
pass the right struct page counter *pc directly.

That should eliminate most local variables as well.

static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc)
{
        struct cgroup_of_peak *ofp = of_peak(sf->private);
        u64 peak;

        /* User wants global or local peak? */
        if (ofp->value == -1)
                peak = pc->watermark;
        else
                peak = max(ofp->value, pc->local_watermark);

        seq_printf(sf, "%lld\n", peak * PAGE_SIZE);
}

> +static int memory_peak_show(struct seq_file *sf, void *v)
> +{
> +     return swap_memory_peak_show(sf, v, false);

And then do:

        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));

        return peak_show(sf, v, &memcg->memory);

Then do the same with ... &memcg->swap.

> +inline ssize_t swap_memory_peak_write(
> +     struct kernfs_open_file *of,
> +     char *buf, size_t nbytes, loff_t off, bool swap_cg)
> +{

Same feedback as above. Please don't inline explicitly (unless it
really is measurably a performance improvement in a critical path),
and stick to surrounding coding style.

Here too, pass page_counter directly and save the branches.

> +     unsigned long cur;
> +     struct memcg_peak_mem_ctx *peer_ctx;
> +     struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> +     struct memcg_peak_mem_ctx *ctx = memcg_extract_peak_mem_ctx(of);
> +     struct page_counter *pc;
> +     struct list_head *watchers, *pos;
> +
> +     buf = strstrip(buf);
> +     /* Only allow "reset" to keep the API clear */
> +     if (strcmp(buf, "reset"))
> +             return -EINVAL;
> +
> +     if (swap_cg) {
> +             pc = &memcg->swap;
> +             watchers = &memcg->peak_swap_local_watermark_watchers;
> +     } else {
> +             pc = &memcg->memory;
> +             watchers = &memcg->peak_memory_local_watermark_watchers;
> +     }
> +
> +     spin_lock(&memcg->swap_memory_peak_watchers_lock);
> +
> +     page_counter_reset_local_watermark(pc);
> +     cur = pc->local_watermark;
> +
> +     list_for_each(pos, watchers) {

        list_for_each_entry()

> +             peer_ctx = list_entry(pos, typeof(*ctx), peers);
> +             if (cur > peer_ctx->local_watermark)
> +                     peer_ctx->local_watermark = cur;
> +     }

I don't think this is quite right. local_peak could be higher than the
current usage when a new watcher shows up. The other watchers should
retain the higher local_peak, not the current usage.

> +
> +     if (ctx->local_watermark == -1)
> +             /* only append to the list if we're not already there */
> +             list_add_tail(&ctx->peers, watchers);
> +
> +     ctx->local_watermark = cur;

This makes me think that page_counter_reset_local_watermark() is not a
good helper. It obscures what's going on. Try without it.

AFAICS the list ordering doesn't matter, so keep it simple and use a
plain list_add().

        /*
         * A new local peak is being tracked in pc->local_watermark.
         * Save current local peak in all watchers.
         */
        list_for_each_entry(pos, ...)
                if (pc->local_watermark > pos->value)
                        pos->value = pc->local_watermark;

        pc->local_watermark = page_counter_read(pc);

        /* Initital write, register watcher */
        if (ofp->value == -1)
                list_add()

        ofp->value = pc->local_watermark;

> diff --git a/mm/page_counter.c b/mm/page_counter.c
> index db20d6452b71..724d31508664 100644
> --- a/mm/page_counter.c
> +++ b/mm/page_counter.c
> @@ -79,9 +79,22 @@ void page_counter_charge(struct page_counter *counter, 
> unsigned long nr_pages)
>               /*
>                * This is indeed racy, but we can live with some
>                * inaccuracy in the watermark.
> +              *
> +              * Notably, we have two watermarks to allow for both a globally
> +              * visible peak and one that can be reset at a smaller scope.
> +              *
> +              * Since we reset both watermarks when the global reset occurs,
> +              * we can guarantee that watermark >= local_watermark, so we
> +              * don't need to do both comparisons every time.
> +              *
> +              * On systems with branch predictors, the inner condition should
> +              * be almost free.
>                */
> -             if (new > READ_ONCE(c->watermark))
> -                     WRITE_ONCE(c->watermark, new);
> +             if (new > READ_ONCE(c->local_watermark)) {
> +                     WRITE_ONCE(c->local_watermark, new);
> +                     if (new > READ_ONCE(c->watermark))
> +                             WRITE_ONCE(c->watermark, new);
> +             }
>       }
>  }
>  
> @@ -131,10 +144,23 @@ bool page_counter_try_charge(struct page_counter 
> *counter,
>               propagate_protected_usage(c, new);
>               /*
>                * Just like with failcnt, we can live with some
> -              * inaccuracy in the watermark.
> +              * inaccuracy in the watermarks.
> +              *
> +              * Notably, we have two watermarks to allow for both a globally
> +              * visible peak and one that can be reset at a smaller scope.
> +              *
> +              * Since we reset both watermarks when the global reset occurs,
> +              * we can guarantee that watermark >= local_watermark, so we
> +              * don't need to do both comparisons every time.
> +              *
> +              * On systems with branch predictors, the inner condition should
> +              * be almost free.

                /* See comment in page_counter_charge() */

> diff --git a/tools/testing/selftests/cgroup/cgroup_util.c 
> b/tools/testing/selftests/cgroup/cgroup_util.c
> index 432db923bced..1e2d46636a0c 100644
> --- a/tools/testing/selftests/cgroup/cgroup_util.c
> +++ b/tools/testing/selftests/cgroup/cgroup_util.c
> @@ -141,6 +141,16 @@ long cg_read_long(const char *cgroup, const char 
> *control)
>       return atol(buf);
>  }

This should be in patch #2.

Reply via email to