Re: [External] Re: [RFC PATCH v2 06/18] mm: memcontrol: move the objcg infrastructure out of CONFIG_MEMCG_KMEM

2021-04-11 Thread Muchun Song
On Sat, Apr 10, 2021 at 12:56 AM Johannes Weiner  wrote:
>
> On Fri, Apr 09, 2021 at 08:29:47PM +0800, Muchun Song wrote:
> > Because memory allocations pinning memcgs for a long time - it exists
> > at a larger scale and is causing recurring problems in the real world:
> > page cache doesn't get reclaimed for a long time, or is used by the
> > second, third, fourth, ... instance of the same job that was restarted
> > into a new cgroup every time. Unreclaimable dying cgroups pile up,
> > waste memory, and make page reclaim very inefficient.
> >
> > We can convert LRU pages and most other raw memcg pins to the objcg
> > direction to fix this problem, and then the page->memcg will always
> > point to an object cgroup pointer.
> >
> > Therefore, the infrastructure of objcg no longer only serves
> > CONFIG_MEMCG_KMEM. In this patch, we move the infrastructure of the
> > objcg out of the scope of the CONFIG_MEMCG_KMEM so that the LRU pages
> > can reuse it to charge pages.
>
> Just an observation on this:
>
> We actually may want to remove CONFIG_MEMCG_KMEM altogether at this
> point. It used to be an optional feature, but nowadays it's not
> configurable anymore, and always on unless slob is configured.
>
> We've also added more than just slab accounting to it, like kernel
> stack pages, and it all gets disabled on slob configs just because it
> doesn't support slab object tracking.
>
> We could probably replace CONFIG_MEMCG_KMEM with CONFIG_MEMCG in most
> places, and add a couple of !CONFIG_SLOB checks in the slab callbacks.
>
> But that's beyond the scope of your patch series, so I'm also okay
> with this patch here.
>
> > We know that the LRU pages are not accounted at the root level. But the
> > page->memcg_data points to the root_mem_cgroup. So the page->memcg_data
> > of the LRU pages always points to a valid pointer. But the root_mem_cgroup
> > does not have an object cgroup. If we use obj_cgroup APIs to charge the
> > LRU pages, we should set the page->memcg_data to a root object cgroup. So
> > we also allocate an object cgroup for the root_mem_cgroup and introduce
> > root_obj_cgroup to cache its value just like root_mem_cgroup.
> >
> > Signed-off-by: Muchun Song 
>
> Overall, the patch makes sense to me. A few comments below:
>
> > @@ -252,9 +253,14 @@ struct cgroup_subsys_state *vmpressure_to_css(struct 
> > vmpressure *vmpr)
> >   return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
> >  }
> >
> > -#ifdef CONFIG_MEMCG_KMEM
> >  extern spinlock_t css_set_lock;
> >
> > +static inline bool obj_cgroup_is_root(struct obj_cgroup *objcg)
> > +{
> > +   return objcg == root_obj_cgroup;
> > +}
>
> This function, and by extension root_obj_cgroup, aren't used by this
> patch. Please move them to the patch that adds users for them.

OK. Will do.

>
> > @@ -298,6 +304,20 @@ static void obj_cgroup_release(struct percpu_ref *ref)
> >   percpu_ref_exit(ref);
> >   kfree_rcu(objcg, rcu);
> >  }
> > +#else
> > +static void obj_cgroup_release(struct percpu_ref *ref)
> > +{
> > + struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, 
> > refcnt);
> > + unsigned long flags;
> > +
> > + spin_lock_irqsave(&css_set_lock, flags);
> > + list_del(&objcg->list);
> > + spin_unlock_irqrestore(&css_set_lock, flags);
> > +
> > + percpu_ref_exit(ref);
> > + kfree_rcu(objcg, rcu);
> > +}
> > +#endif
>
> Having two separate functions for if and else is good when the else
> branch is a completely empty dummy function. In this case you end up
> duplicating code, so it's better to have just one function and put the
> ifdef around the nr_charged_bytes handling in it.

Make sense. I will rework the code here.

>
> > @@ -318,10 +338,14 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
> >   return objcg;
> >  }
> >
> > -static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
> > -   struct mem_cgroup *parent)
> > +static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
> >  {
> >   struct obj_cgroup *objcg, *iter;
> > + struct mem_cgroup *parent;
> > +
> > + parent = parent_mem_cgroup(memcg);
> > + if (!parent)
> > + parent = root_mem_cgroup;
> >
> >   objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
> >
> > @@ -342,6 +366,27 @@ static void memcg_reparent_objcgs(struct mem_cgroup 
> > *memcg,
> >   percpu_ref_kill(&objcg->refcnt);
> >  }
> >
> > +static int memcg_obj_cgroup_alloc(struct mem_cgroup *memcg)
> > +{
> > + struct obj_cgroup *objcg;
> > +
> > + objcg = obj_cgroup_alloc();
> > + if (!objcg)
> > + return -ENOMEM;
> > +
> > + objcg->memcg = memcg;
> > + rcu_assign_pointer(memcg->objcg, objcg);
> > +
> > + return 0;
> > +}
> > +
> > +static void memcg_obj_cgroup_free(struct mem_cgroup *memcg)
> > +{
> > + if (unlikely(memcg->objcg))
> > + memcg_reparent_objcgs(memcg);
> > +}
>
> It's confusing to have a 'free' function not free the object it's
> called on.
>
> But 

Re: [RFC PATCH v2 06/18] mm: memcontrol: move the objcg infrastructure out of CONFIG_MEMCG_KMEM

2021-04-09 Thread Johannes Weiner
On Fri, Apr 09, 2021 at 08:29:47PM +0800, Muchun Song wrote:
> Because memory allocations pinning memcgs for a long time - it exists
> at a larger scale and is causing recurring problems in the real world:
> page cache doesn't get reclaimed for a long time, or is used by the
> second, third, fourth, ... instance of the same job that was restarted
> into a new cgroup every time. Unreclaimable dying cgroups pile up,
> waste memory, and make page reclaim very inefficient.
> 
> We can convert LRU pages and most other raw memcg pins to the objcg
> direction to fix this problem, and then the page->memcg will always
> point to an object cgroup pointer.
> 
> Therefore, the infrastructure of objcg no longer only serves
> CONFIG_MEMCG_KMEM. In this patch, we move the infrastructure of the
> objcg out of the scope of the CONFIG_MEMCG_KMEM so that the LRU pages
> can reuse it to charge pages.

Just an observation on this:

We actually may want to remove CONFIG_MEMCG_KMEM altogether at this
point. It used to be an optional feature, but nowadays it's not
configurable anymore, and always on unless slob is configured.

We've also added more than just slab accounting to it, like kernel
stack pages, and it all gets disabled on slob configs just because it
doesn't support slab object tracking.

We could probably replace CONFIG_MEMCG_KMEM with CONFIG_MEMCG in most
places, and add a couple of !CONFIG_SLOB checks in the slab callbacks.

But that's beyond the scope of your patch series, so I'm also okay
with this patch here.

> We know that the LRU pages are not accounted at the root level. But the
> page->memcg_data points to the root_mem_cgroup. So the page->memcg_data
> of the LRU pages always points to a valid pointer. But the root_mem_cgroup
> does not have an object cgroup. If we use obj_cgroup APIs to charge the
> LRU pages, we should set the page->memcg_data to a root object cgroup. So
> we also allocate an object cgroup for the root_mem_cgroup and introduce
> root_obj_cgroup to cache its value just like root_mem_cgroup.
> 
> Signed-off-by: Muchun Song 

Overall, the patch makes sense to me. A few comments below:

> @@ -252,9 +253,14 @@ struct cgroup_subsys_state *vmpressure_to_css(struct 
> vmpressure *vmpr)
>   return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
>  }
>  
> -#ifdef CONFIG_MEMCG_KMEM
>  extern spinlock_t css_set_lock;
>  
> +static inline bool obj_cgroup_is_root(struct obj_cgroup *objcg)
> +{
> +   return objcg == root_obj_cgroup;
> +}

This function, and by extension root_obj_cgroup, aren't used by this
patch. Please move them to the patch that adds users for them.

> @@ -298,6 +304,20 @@ static void obj_cgroup_release(struct percpu_ref *ref)
>   percpu_ref_exit(ref);
>   kfree_rcu(objcg, rcu);
>  }
> +#else
> +static void obj_cgroup_release(struct percpu_ref *ref)
> +{
> + struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
> + unsigned long flags;
> +
> + spin_lock_irqsave(&css_set_lock, flags);
> + list_del(&objcg->list);
> + spin_unlock_irqrestore(&css_set_lock, flags);
> +
> + percpu_ref_exit(ref);
> + kfree_rcu(objcg, rcu);
> +}
> +#endif

Having two separate functions for if and else is good when the else
branch is a completely empty dummy function. In this case you end up
duplicating code, so it's better to have just one function and put the
ifdef around the nr_charged_bytes handling in it.

> @@ -318,10 +338,14 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
>   return objcg;
>  }
>  
> -static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
> -   struct mem_cgroup *parent)
> +static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
>  {
>   struct obj_cgroup *objcg, *iter;
> + struct mem_cgroup *parent;
> +
> + parent = parent_mem_cgroup(memcg);
> + if (!parent)
> + parent = root_mem_cgroup;
>  
>   objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
>  
> @@ -342,6 +366,27 @@ static void memcg_reparent_objcgs(struct mem_cgroup 
> *memcg,
>   percpu_ref_kill(&objcg->refcnt);
>  }
>  
> +static int memcg_obj_cgroup_alloc(struct mem_cgroup *memcg)
> +{
> + struct obj_cgroup *objcg;
> +
> + objcg = obj_cgroup_alloc();
> + if (!objcg)
> + return -ENOMEM;
> +
> + objcg->memcg = memcg;
> + rcu_assign_pointer(memcg->objcg, objcg);
> +
> + return 0;
> +}
> +
> +static void memcg_obj_cgroup_free(struct mem_cgroup *memcg)
> +{
> + if (unlikely(memcg->objcg))
> + memcg_reparent_objcgs(memcg);
> +}

It's confusing to have a 'free' function not free the object it's
called on.

But rather than search for a fitting name, I think it might be better
to just fold both of these short functions into their only callsites.

Also, since memcg->objcg is reparented, and the pointer cleared, on
offlining, when could this ever be non-NULL? This deserves a comment.

> @@ -3444,7 +3489,6 @@ static u64 mem_cgroup_read_u64(struct 
> 

[RFC PATCH v2 06/18] mm: memcontrol: move the objcg infrastructure out of CONFIG_MEMCG_KMEM

2021-04-09 Thread Muchun Song
Because memory allocations pinning memcgs for a long time - it exists
at a larger scale and is causing recurring problems in the real world:
page cache doesn't get reclaimed for a long time, or is used by the
second, third, fourth, ... instance of the same job that was restarted
into a new cgroup every time. Unreclaimable dying cgroups pile up,
waste memory, and make page reclaim very inefficient.

We can convert LRU pages and most other raw memcg pins to the objcg
direction to fix this problem, and then the page->memcg will always
point to an object cgroup pointer.

Therefore, the infrastructure of objcg no longer only serves
CONFIG_MEMCG_KMEM. In this patch, we move the infrastructure of the
objcg out of the scope of the CONFIG_MEMCG_KMEM so that the LRU pages
can reuse it to charge pages.

We know that the LRU pages are not accounted at the root level. But the
page->memcg_data points to the root_mem_cgroup. So the page->memcg_data
of the LRU pages always points to a valid pointer. But the root_mem_cgroup
does not have an object cgroup. If we use obj_cgroup APIs to charge the
LRU pages, we should set the page->memcg_data to a root object cgroup. So
we also allocate an object cgroup for the root_mem_cgroup and introduce
root_obj_cgroup to cache its value just like root_mem_cgroup.

Signed-off-by: Muchun Song 
---
 include/linux/memcontrol.h |  4 ++-
 mm/memcontrol.c| 71 +-
 2 files changed, 60 insertions(+), 15 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 38b8d3fb24ff..ab948eb5f62e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -223,7 +223,9 @@ struct memcg_cgwb_frn {
 struct obj_cgroup {
struct percpu_ref refcnt;
struct mem_cgroup *memcg;
+#ifdef CONFIG_MEMCG_KMEM
atomic_t nr_charged_bytes;
+#endif
union {
struct list_head list;
struct rcu_head rcu;
@@ -321,9 +323,9 @@ struct mem_cgroup {
 #ifdef CONFIG_MEMCG_KMEM
int kmemcg_id;
enum memcg_kmem_state kmem_state;
+#endif
struct obj_cgroup __rcu *objcg;
struct list_head objcg_list; /* list of inherited objcgs */
-#endif
 
MEMCG_PADDING(_pad2_);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90c1ac58c64c..27caf24bb0c1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -75,6 +75,7 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly;
 EXPORT_SYMBOL(memory_cgrp_subsys);
 
 struct mem_cgroup *root_mem_cgroup __read_mostly;
+static struct obj_cgroup *root_obj_cgroup __read_mostly;
 
 /* Active memory cgroup to use from an interrupt context */
 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
@@ -252,9 +253,14 @@ struct cgroup_subsys_state *vmpressure_to_css(struct 
vmpressure *vmpr)
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
 extern spinlock_t css_set_lock;
 
+static inline bool obj_cgroup_is_root(struct obj_cgroup *objcg)
+{
+   return objcg == root_obj_cgroup;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
  unsigned int nr_pages);
 
@@ -298,6 +304,20 @@ static void obj_cgroup_release(struct percpu_ref *ref)
percpu_ref_exit(ref);
kfree_rcu(objcg, rcu);
 }
+#else
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+   struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+   unsigned long flags;
+
+   spin_lock_irqsave(&css_set_lock, flags);
+   list_del(&objcg->list);
+   spin_unlock_irqrestore(&css_set_lock, flags);
+
+   percpu_ref_exit(ref);
+   kfree_rcu(objcg, rcu);
+}
+#endif
 
 static struct obj_cgroup *obj_cgroup_alloc(void)
 {
@@ -318,10 +338,14 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
return objcg;
 }
 
-static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
 {
struct obj_cgroup *objcg, *iter;
+   struct mem_cgroup *parent;
+
+   parent = parent_mem_cgroup(memcg);
+   if (!parent)
+   parent = root_mem_cgroup;
 
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
 
@@ -342,6 +366,27 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
	percpu_ref_kill(&objcg->refcnt);
 }
 
+static int memcg_obj_cgroup_alloc(struct mem_cgroup *memcg)
+{
+   struct obj_cgroup *objcg;
+
+   objcg = obj_cgroup_alloc();
+   if (!objcg)
+   return -ENOMEM;
+
+   objcg->memcg = memcg;
+   rcu_assign_pointer(memcg->objcg, objcg);
+
+   return 0;
+}
+
+static void memcg_obj_cgroup_free(struct mem_cgroup *memcg)
+{
+   if (unlikely(memcg->objcg))
+   memcg_reparent_objcgs(memcg);
+}
+
+#ifdef CONFIG_MEMCG_KMEM
 /*
  * This will be used as a shrinker list's index.
  * The main reason for