Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-19 Thread Michal Hocko
On Tue 13-11-12 16:30:36, Michal Hocko wrote:
[...]
> + /*
> +  * Root is not visited by cgroup iterators so it needs a special
> +  * treatment.
> +  */
> + if (!last_visited) {
> + css = >css;
> + } else {
> + struct cgroup *next_cgroup;
> +
> + next_cgroup = cgroup_next_descendant_pre(
> + last_visited->css.cgroup,
> + root->css.cgroup);
> + if (next_cgroup)
> + css = cgroup_subsys_state(next_cgroup,
> + mem_cgroup_subsys_id);

This is not correct because cgroup_next_descendant_pre expects pos to be
NULL for the first iteration but the way we do iterate (visit the root
first) means that the second iteration will have last_visited != NULL
and if root doesn't have any children the iteration would go unleashed
to to the endless loop. We need something like:
struct cgroup *prev_cgroup = (last_visited == root) ? NULL 
: last_visited->css.cgroup;
next_cgroup = cgroup_next_descendant_pre(prev_cgroup,
root->css.gtoup);

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-19 Thread Michal Hocko
On Wed 14-11-12 11:10:52, Michal Hocko wrote:
> On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote:
> > (2012/11/14 0:30), Michal Hocko wrote:
> [...]
> > > @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct 
> > > mem_cgroup *root,
> > >   mz = mem_cgroup_zoneinfo(root, nid, zid);
> > >   iter = >reclaim_iter[reclaim->priority];
> > >   spin_lock(>iter_lock);
> > > + last_visited = iter->last_visited;
> > >   if (prev && reclaim->generation != 
> > > iter->generation) {
> > > + if (last_visited) {
> > > + mem_cgroup_put(last_visited);
> > > + iter->last_visited = NULL;
> > > + }
> > >   spin_unlock(>iter_lock);
> > >   return NULL;
> > >   }
> > > - id = iter->position;
> > >   }
> > >   
> > >   rcu_read_lock();
> > > - css = css_get_next(_cgroup_subsys, id + 1, >css, );
> > > - if (css) {
> > > - if (css == >css || css_tryget(css))
> > > - memcg = mem_cgroup_from_css(css);
> > > - } else
> > > - id = 0;
> > > - rcu_read_unlock();
> > > + /*
> > > +  * Root is not visited by cgroup iterators so it needs a special
> > > +  * treatment.
> > > +  */
> > > + if (!last_visited) {
> > > + css = >css;
> > > + } else {
> > > + struct cgroup *next_cgroup;
> > > +
> > > + next_cgroup = cgroup_next_descendant_pre(
> > > + last_visited->css.cgroup,
> > > + root->css.cgroup);
> > 
> > Maybe I miss something but last_visited is holded by memcg's refcnt.
> > The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed
> > before memcg is freed and last_visited->css.cgroup is out of RCU cycle.
> > Is this safe ?
> 
> Good spotted. You are right. What I need to do is to check that the
> last_visited is alive and restart from the root if not. Something like
> the bellow (incremental patch on top of this one) should help, right?
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 30efd7e..c0a91a3 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
> *root,
>   spin_unlock(>iter_lock);
>   return NULL;
>   }
> + /*
> +  * memcg is still valid because we hold a reference but
> +  * its cgroup might have vanished in the meantime so
> +  * we have to double check it is alive and restart the
> +  * tree walk otherwise.
> +  */
> + if (last_visited && !css_tryget(_visited->css)) {
> + mem_cgroup_put(last_visited);
> + last_visited = NULL;
> + }
>   }
>  
>   rcu_read_lock();
> @@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
> *root,
>   if (reclaim) {
>   struct mem_cgroup *curr = memcg;
>  
> - if (last_visited)
> + if (last_visited) {
> + css_put(_visited->css);
>   mem_cgroup_put(last_visited);
> + }
>  
>   if (css && !memcg)
>   curr = mem_cgroup_from_css(css);

Now that I think about it again it seems that this is more complicated
than necessary. It should be sufficient to hold css' reference for the
iter->last_visited because this makes sure that the cgroup won't go
away same as mem_cgroup.
Memcg reference counting + css_tryget just makes the situation more
complicated because it forces us to retry the iteration on css_tryget
failure as the cgroup is gone already and we have no point to continue
other than start all over again. Which is, ehmm, _really_ ugly.

I will repost the updated version sometime this week after it passes
some testing.
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-19 Thread Michal Hocko
On Wed 14-11-12 11:10:52, Michal Hocko wrote:
 On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote:
  (2012/11/14 0:30), Michal Hocko wrote:
 [...]
   @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct 
   mem_cgroup *root,
 mz = mem_cgroup_zoneinfo(root, nid, zid);
 iter = mz-reclaim_iter[reclaim-priority];
 spin_lock(iter-iter_lock);
   + last_visited = iter-last_visited;
 if (prev  reclaim-generation != 
   iter-generation) {
   + if (last_visited) {
   + mem_cgroup_put(last_visited);
   + iter-last_visited = NULL;
   + }
 spin_unlock(iter-iter_lock);
 return NULL;
 }
   - id = iter-position;
 }
 
 rcu_read_lock();
   - css = css_get_next(mem_cgroup_subsys, id + 1, root-css, id);
   - if (css) {
   - if (css == root-css || css_tryget(css))
   - memcg = mem_cgroup_from_css(css);
   - } else
   - id = 0;
   - rcu_read_unlock();
   + /*
   +  * Root is not visited by cgroup iterators so it needs a special
   +  * treatment.
   +  */
   + if (!last_visited) {
   + css = root-css;
   + } else {
   + struct cgroup *next_cgroup;
   +
   + next_cgroup = cgroup_next_descendant_pre(
   + last_visited-css.cgroup,
   + root-css.cgroup);
  
  Maybe I miss something but last_visited is holded by memcg's refcnt.
  The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed
  before memcg is freed and last_visited-css.cgroup is out of RCU cycle.
  Is this safe ?
 
 Good spotted. You are right. What I need to do is to check that the
 last_visited is alive and restart from the root if not. Something like
 the bellow (incremental patch on top of this one) should help, right?
 
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 30efd7e..c0a91a3 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
 *root,
   spin_unlock(iter-iter_lock);
   return NULL;
   }
 + /*
 +  * memcg is still valid because we hold a reference but
 +  * its cgroup might have vanished in the meantime so
 +  * we have to double check it is alive and restart the
 +  * tree walk otherwise.
 +  */
 + if (last_visited  !css_tryget(last_visited-css)) {
 + mem_cgroup_put(last_visited);
 + last_visited = NULL;
 + }
   }
  
   rcu_read_lock();
 @@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
 *root,
   if (reclaim) {
   struct mem_cgroup *curr = memcg;
  
 - if (last_visited)
 + if (last_visited) {
 + css_put(last_visited-css);
   mem_cgroup_put(last_visited);
 + }
  
   if (css  !memcg)
   curr = mem_cgroup_from_css(css);

Now that I think about it again it seems that this is more complicated
than necessary. It should be sufficient to hold css' reference for the
iter-last_visited because this makes sure that the cgroup won't go
away same as mem_cgroup.
Memcg reference counting + css_tryget just makes the situation more
complicated because it forces us to retry the iteration on css_tryget
failure as the cgroup is gone already and we have no point to continue
other than start all over again. Which is, ehmm, _really_ ugly.

I will repost the updated version sometime this week after it passes
some testing.
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-19 Thread Michal Hocko
On Tue 13-11-12 16:30:36, Michal Hocko wrote:
[...]
 + /*
 +  * Root is not visited by cgroup iterators so it needs a special
 +  * treatment.
 +  */
 + if (!last_visited) {
 + css = root-css;
 + } else {
 + struct cgroup *next_cgroup;
 +
 + next_cgroup = cgroup_next_descendant_pre(
 + last_visited-css.cgroup,
 + root-css.cgroup);
 + if (next_cgroup)
 + css = cgroup_subsys_state(next_cgroup,
 + mem_cgroup_subsys_id);

This is not correct because cgroup_next_descendant_pre expects pos to be
NULL for the first iteration but the way we do iterate (visit the root
first) means that the second iteration will have last_visited != NULL
and if root doesn't have any children the iteration would go unleashed
to to the endless loop. We need something like:
struct cgroup *prev_cgroup = (last_visited == root) ? NULL 
: last_visited-css.cgroup;
next_cgroup = cgroup_next_descendant_pre(prev_cgroup,
root-css.gtoup);

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Michal Hocko
On Thu 15-11-12 07:31:24, Tejun Heo wrote:
> Hello, Michal.
> 
> On Thu, Nov 15, 2012 at 04:12:55PM +0100, Michal Hocko wrote:
> > > Because I'd like to consider the next functions as implementation
> > > detail, and having interations structred as loops tend to read better
> > > and less error-prone.  e.g. when you use next functions directly, it's
> > > way easier to circumvent locking requirements in a way which isn't
> > > very obvious. 
> > 
> > The whole point behind mem_cgroup_iter is to hide all the complexity
> > behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree
> > for !reclaim case and mem_cgroup_iter otherwise.
> > 
> > > So, unless it messes up the code too much (and I can't see why it
> > > would), I'd much prefer if memcg used for_each_*() macros.
> > 
> > As I said this would mean that the current mem_cgroup_iter code would
> > have to be inverted which doesn't simplify the code much. I'd rather
> > hide all the grossy details inside the memcg iterator.
> > Or am I still missing your suggestion?
> 
> One way or the other, I don't think the code complexity would change
> much.  Again, I'd much *prefer* if memcg used what other controllers
> would be using, but that's a preference and if necessary we can keep
> the next functions as exposed APIs. 

Yes please.

> I think the issue I have is that I can't see much technical
> justification for that. If the code becomes much simpler by choosing
> one over the other, sure, but is that the case here?

Yes and I've tried to say that already. Memcg needs hierarchy, css
ref counting and concurrent reclaim (per-zone per-priority) aware
iteration. All of that is hidden in mem_cgroup_iter currently so the
caller doesn't have to care about it at all. Which makes shrink_zone
not care about memcg that much.

cgroup_for_each_descendant_pre is not suitable at least because it
doesn't provide a way to start a walk at a selected node (which is
shared per-zone per-priority in memcg case).
Even if cgroup_for_each_descendant_pre had start parameter there
is still a lot of house keeping that callers would have to handle
(css_tryget to start with, update of the cached possible not mentioning
use_hierarchy thingy or mem_cgroup_disabled).
We also try to not pollute mm/vmscan.c as much as possible so we
definitely do not want to bring all this into shrink_zone.

This all sounds like too much of a hassle if it is exposed so I would
really like to stay with mem_cgroup_iter and slowly simplify it until it
can go away (if that is possible at all).

> Isn't it mostly just about where to put the same things?

Unfortunately no. We wouldn't grow own iterator in such a case.

> If so, what would be the rationale for requiring a different
> interface?

Does the above explain it?

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Tejun Heo
Hello, Michal.

On Thu, Nov 15, 2012 at 04:12:55PM +0100, Michal Hocko wrote:
> > Because I'd like to consider the next functions as implementation
> > detail, and having interations structred as loops tend to read better
> > and less error-prone.  e.g. when you use next functions directly, it's
> > way easier to circumvent locking requirements in a way which isn't
> > very obvious. 
> 
> The whole point behind mem_cgroup_iter is to hide all the complexity
> behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree
> for !reclaim case and mem_cgroup_iter otherwise.
> 
> > So, unless it messes up the code too much (and I can't see why it
> > would), I'd much prefer if memcg used for_each_*() macros.
> 
> As I said this would mean that the current mem_cgroup_iter code would
> have to be inverted which doesn't simplify the code much. I'd rather
> hide all the grossy details inside the memcg iterator.
> Or am I still missing your suggestion?

One way or the other, I don't think the code complexity would change
much.  Again, I'd much *prefer* if memcg used what other controllers
would be using, but that's a preference and if necessary we can keep
the next functions as exposed APIs.  I think the issue I have is that
I can't see much technical justification for that.  If the code
becomes much simpler by choosing one over the other, sure, but is that
the case here?  Isn't it mostly just about where to put the same
things?  If so, what would be the rationale for requiring a different
interface?

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Michal Hocko
On Thu 15-11-12 06:47:32, Tejun Heo wrote:
> Hello, Michal.
> 
> On Thu, Nov 15, 2012 at 10:51:03AM +0100, Michal Hocko wrote:
> > > I'm a bit confused.  Why would that make any difference?  Shouldn't it
> > > be just able to test the condition and continue?
> > 
> > Ohh, I misunderstood your proposal. So what you are suggesting is
> > to put all the logic we have in mem_cgroup_iter inside what you call
> > reclaim here + mem_cgroup_iter_break inside the loop, right?
> > 
> > I do not see how this would help us much. mem_cgroup_iter is not the
> > nicest piece of code but it handles quite a complex requirements that we
> > have currently (css reference count, multiple reclaimers racing). So I
> > would rather keep it this way. Further simplifications are welcome of
> > course.
> > 
> > Is there any reason why you are not happy about direct using of
> > cgroup_next_descendant_pre?
> 
> Because I'd like to consider the next functions as implementation
> detail, and having interations structred as loops tend to read better
> and less error-prone.  e.g. when you use next functions directly, it's
> way easier to circumvent locking requirements in a way which isn't
> very obvious. 

The whole point behind mem_cgroup_iter is to hide all the complexity
behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree
for !reclaim case and mem_cgroup_iter otherwise.

> So, unless it messes up the code too much (and I can't see why it
> would), I'd much prefer if memcg used for_each_*() macros.

As I said this would mean that the current mem_cgroup_iter code would
have to be inverted which doesn't simplify the code much. I'd rather
hide all the grossy details inside the memcg iterator.
Or am I still missing your suggestion?
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Tejun Heo
Hello, Michal.

On Thu, Nov 15, 2012 at 10:51:03AM +0100, Michal Hocko wrote:
> > I'm a bit confused.  Why would that make any difference?  Shouldn't it
> > be just able to test the condition and continue?
> 
> Ohh, I misunderstood your proposal. So what you are suggesting is
> to put all the logic we have in mem_cgroup_iter inside what you call
> reclaim here + mem_cgroup_iter_break inside the loop, right?
> 
> I do not see how this would help us much. mem_cgroup_iter is not the
> nicest piece of code but it handles quite a complex requirements that we
> have currently (css reference count, multiple reclaimers racing). So I
> would rather keep it this way. Further simplifications are welcome of
> course.
> 
> Is there any reason why you are not happy about direct using of
> cgroup_next_descendant_pre?

Because I'd like to consider the next functions as implementation
detail, and having interations structred as loops tend to read better
and less error-prone.  e.g. when you use next functions directly, it's
way easier to circumvent locking requirements in a way which isn't
very obvious.  So, unless it messes up the code too much (and I can't
see why it would), I'd much prefer if memcg used for_each_*() macros.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Michal Hocko
On Thu 15-11-12 13:12:38, KAMEZAWA Hiroyuki wrote:
> (2012/11/14 19:10), Michal Hocko wrote:
> >On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote:
> >>(2012/11/14 0:30), Michal Hocko wrote:
> >[...]
> >>>@@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct 
> >>>mem_cgroup *root,
> >>>   mz = mem_cgroup_zoneinfo(root, nid, zid);
> >>>   iter = >reclaim_iter[reclaim->priority];
> >>>   spin_lock(>iter_lock);
> >>>+  last_visited = iter->last_visited;
> >>>   if (prev && reclaim->generation != 
> >>> iter->generation) {
> >>>+  if (last_visited) {
> >>>+  mem_cgroup_put(last_visited);
> >>>+  iter->last_visited = NULL;
> >>>+  }
> >>>   spin_unlock(>iter_lock);
> >>>   return NULL;
> >>>   }
> >>>-  id = iter->position;
> >>>   }
> >>>
> >>>   rcu_read_lock();
> >>>-  css = css_get_next(_cgroup_subsys, id + 1, >css, );
> >>>-  if (css) {
> >>>-  if (css == >css || css_tryget(css))
> >>>-  memcg = mem_cgroup_from_css(css);
> >>>-  } else
> >>>-  id = 0;
> >>>-  rcu_read_unlock();
> >>>+  /*
> >>>+   * Root is not visited by cgroup iterators so it needs a special
> >>>+   * treatment.
> >>>+   */
> >>>+  if (!last_visited) {
> >>>+  css = >css;
> >>>+  } else {
> >>>+  struct cgroup *next_cgroup;
> >>>+
> >>>+  next_cgroup = cgroup_next_descendant_pre(
> >>>+  last_visited->css.cgroup,
> >>>+  root->css.cgroup);
> >>
> >>Maybe I miss something but last_visited is holded by memcg's refcnt.
> >>The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed
> >>before memcg is freed and last_visited->css.cgroup is out of RCU cycle.
> >>Is this safe ?
> >
> >Good spotted. You are right. What I need to do is to check that the
> >last_visited is alive and restart from the root if not. Something like
> >the bellow (incremental patch on top of this one) should help, right?
> >
> >diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> >index 30efd7e..c0a91a3 100644
> >--- a/mm/memcontrol.c
> >+++ b/mm/memcontrol.c
> >@@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
> >*root,
> > spin_unlock(>iter_lock);
> > return NULL;
> > }
> >+/*
> >+ * memcg is still valid because we hold a reference but
> >+ * its cgroup might have vanished in the meantime so
> >+ * we have to double check it is alive and restart the
> >+ * tree walk otherwise.
> >+ */
> >+if (last_visited && !css_tryget(_visited->css)) {
> >+mem_cgroup_put(last_visited);
> >+last_visited = NULL;
> >+}
> > }
> >
> > rcu_read_lock();
> >@@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
> >*root,
> > if (reclaim) {
> > struct mem_cgroup *curr = memcg;
> >
> >-if (last_visited)
> >+if (last_visited) {
> >+css_put(_visited->css);
> > mem_cgroup_put(last_visited);
> >+}
> >
> > if (css && !memcg)
> > curr = mem_cgroup_from_css(css);
> >
> 
> I think this will work.

Thanks for double checking. The updated patch:
---
>From 3811de23a0306c229ce44dc655878431a4fdf449 Mon Sep 17 00:00:00 2001
From: Michal Hocko 
Date: Tue, 13 Nov 2012 11:40:35 +0100
Subject: [PATCH] memcg: rework mem_cgroup_iter to use cgroup iterators

mem_cgroup_iter curently relies on css->id when walking down a group
hierarchy tree. This is really awkward because the tree walk depends on
the groups creation ordering. The only guarantee is that a parent node
is visited before its children.
Example
 1) mkdir -p a a/d a/b/c
 2) mkdir -a a/b/c a/d
Will create the same trees but the tree walks will be different:
 1) a, d, b, c
 2) a, b, c, d

574bd9f7 (cgroup: implement generic child / descendant walk macros) has
introduced generic cgroup tree walkers which provide either pre-order
or post-order tree walk. This patch converts css->id based iteration
to pre-order tree walk to keep the semantic with the original iterator
where parent is always visited before its subtree.

cgroup_for_each_descendant_pre suggests using post_create and

Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Michal Hocko
On Wed 14-11-12 10:52:45, Tejun Heo wrote:
> Hello, Michal.
> 
> On Wed, Nov 14, 2012 at 09:51:29AM +0100, Michal Hocko wrote:
> > >   reclaim(root);
> > >   for_each_descendent_pre()
> > >   reclaim(descendant);
> > 
> > We cannot do for_each_descendent_pre here because we do not iterate
> > through the whole hierarchy all the time. Check shrink_zone.
> 
> I'm a bit confused.  Why would that make any difference?  Shouldn't it
> be just able to test the condition and continue?

Ohh, I misunderstood your proposal. So what you are suggesting is
to put all the logic we have in mem_cgroup_iter inside what you call
reclaim here + mem_cgroup_iter_break inside the loop, right?

I do not see how this would help us much. mem_cgroup_iter is not the
nicest piece of code but it handles quite a complex requirements that we
have currently (css reference count, multiple reclaimers racing). So I
would rather keep it this way. Further simplifications are welcome of
course.

Is there any reason why you are not happy about direct using of
cgroup_next_descendant_pre?
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Michal Hocko
On Wed 14-11-12 10:52:45, Tejun Heo wrote:
 Hello, Michal.
 
 On Wed, Nov 14, 2012 at 09:51:29AM +0100, Michal Hocko wrote:
 reclaim(root);
 for_each_descendent_pre()
 reclaim(descendant);
  
  We cannot do for_each_descendent_pre here because we do not iterate
  through the whole hierarchy all the time. Check shrink_zone.
 
 I'm a bit confused.  Why would that make any difference?  Shouldn't it
 be just able to test the condition and continue?

Ohh, I misunderstood your proposal. So what you are suggesting is
to put all the logic we have in mem_cgroup_iter inside what you call
reclaim here + mem_cgroup_iter_break inside the loop, right?

I do not see how this would help us much. mem_cgroup_iter is not the
nicest piece of code but it handles quite a complex requirements that we
have currently (css reference count, multiple reclaimers racing). So I
would rather keep it this way. Further simplifications are welcome of
course.

Is there any reason why you are not happy about direct using of
cgroup_next_descendant_pre?
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Michal Hocko
On Thu 15-11-12 13:12:38, KAMEZAWA Hiroyuki wrote:
 (2012/11/14 19:10), Michal Hocko wrote:
 On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote:
 (2012/11/14 0:30), Michal Hocko wrote:
 [...]
 @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct 
 mem_cgroup *root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = mz-reclaim_iter[reclaim-priority];
spin_lock(iter-iter_lock);
 +  last_visited = iter-last_visited;
if (prev  reclaim-generation != 
  iter-generation) {
 +  if (last_visited) {
 +  mem_cgroup_put(last_visited);
 +  iter-last_visited = NULL;
 +  }
spin_unlock(iter-iter_lock);
return NULL;
}
 -  id = iter-position;
}
 
rcu_read_lock();
 -  css = css_get_next(mem_cgroup_subsys, id + 1, root-css, id);
 -  if (css) {
 -  if (css == root-css || css_tryget(css))
 -  memcg = mem_cgroup_from_css(css);
 -  } else
 -  id = 0;
 -  rcu_read_unlock();
 +  /*
 +   * Root is not visited by cgroup iterators so it needs a special
 +   * treatment.
 +   */
 +  if (!last_visited) {
 +  css = root-css;
 +  } else {
 +  struct cgroup *next_cgroup;
 +
 +  next_cgroup = cgroup_next_descendant_pre(
 +  last_visited-css.cgroup,
 +  root-css.cgroup);
 
 Maybe I miss something but last_visited is holded by memcg's refcnt.
 The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed
 before memcg is freed and last_visited-css.cgroup is out of RCU cycle.
 Is this safe ?
 
 Good spotted. You are right. What I need to do is to check that the
 last_visited is alive and restart from the root if not. Something like
 the bellow (incremental patch on top of this one) should help, right?
 
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 30efd7e..c0a91a3 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
 *root,
  spin_unlock(iter-iter_lock);
  return NULL;
  }
 +/*
 + * memcg is still valid because we hold a reference but
 + * its cgroup might have vanished in the meantime so
 + * we have to double check it is alive and restart the
 + * tree walk otherwise.
 + */
 +if (last_visited  !css_tryget(last_visited-css)) {
 +mem_cgroup_put(last_visited);
 +last_visited = NULL;
 +}
  }
 
  rcu_read_lock();
 @@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
 *root,
  if (reclaim) {
  struct mem_cgroup *curr = memcg;
 
 -if (last_visited)
 +if (last_visited) {
 +css_put(last_visited-css);
  mem_cgroup_put(last_visited);
 +}
 
  if (css  !memcg)
  curr = mem_cgroup_from_css(css);
 
 
 I think this will work.

Thanks for double checking. The updated patch:
---
From 3811de23a0306c229ce44dc655878431a4fdf449 Mon Sep 17 00:00:00 2001
From: Michal Hocko mho...@suse.cz
Date: Tue, 13 Nov 2012 11:40:35 +0100
Subject: [PATCH] memcg: rework mem_cgroup_iter to use cgroup iterators

mem_cgroup_iter curently relies on css-id when walking down a group
hierarchy tree. This is really awkward because the tree walk depends on
the groups creation ordering. The only guarantee is that a parent node
is visited before its children.
Example
 1) mkdir -p a a/d a/b/c
 2) mkdir -a a/b/c a/d
Will create the same trees but the tree walks will be different:
 1) a, d, b, c
 2) a, b, c, d

574bd9f7 (cgroup: implement generic child / descendant walk macros) has
introduced generic cgroup tree walkers which provide either pre-order
or post-order tree walk. This patch converts css-id based iteration
to pre-order tree walk to keep the semantic with the original iterator
where parent is always visited before its subtree.

cgroup_for_each_descendant_pre suggests using post_create and
pre_destroy for proper synchronization with groups addidition resp.
removal. This implementation doesn't use those because a new memory
cgroup is fully initialized in mem_cgroup_create and css_tryget makes
sure that the group is 

Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Tejun Heo
Hello, Michal.

On Thu, Nov 15, 2012 at 10:51:03AM +0100, Michal Hocko wrote:
  I'm a bit confused.  Why would that make any difference?  Shouldn't it
  be just able to test the condition and continue?
 
 Ohh, I misunderstood your proposal. So what you are suggesting is
 to put all the logic we have in mem_cgroup_iter inside what you call
 reclaim here + mem_cgroup_iter_break inside the loop, right?
 
 I do not see how this would help us much. mem_cgroup_iter is not the
 nicest piece of code but it handles quite a complex requirements that we
 have currently (css reference count, multiple reclaimers racing). So I
 would rather keep it this way. Further simplifications are welcome of
 course.
 
 Is there any reason why you are not happy about direct using of
 cgroup_next_descendant_pre?

Because I'd like to consider the next functions as implementation
detail, and having interations structred as loops tend to read better
and less error-prone.  e.g. when you use next functions directly, it's
way easier to circumvent locking requirements in a way which isn't
very obvious.  So, unless it messes up the code too much (and I can't
see why it would), I'd much prefer if memcg used for_each_*() macros.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Michal Hocko
On Thu 15-11-12 06:47:32, Tejun Heo wrote:
 Hello, Michal.
 
 On Thu, Nov 15, 2012 at 10:51:03AM +0100, Michal Hocko wrote:
   I'm a bit confused.  Why would that make any difference?  Shouldn't it
   be just able to test the condition and continue?
  
  Ohh, I misunderstood your proposal. So what you are suggesting is
  to put all the logic we have in mem_cgroup_iter inside what you call
  reclaim here + mem_cgroup_iter_break inside the loop, right?
  
  I do not see how this would help us much. mem_cgroup_iter is not the
  nicest piece of code but it handles quite a complex requirements that we
  have currently (css reference count, multiple reclaimers racing). So I
  would rather keep it this way. Further simplifications are welcome of
  course.
  
  Is there any reason why you are not happy about direct using of
  cgroup_next_descendant_pre?
 
 Because I'd like to consider the next functions as implementation
 detail, and having interations structred as loops tend to read better
 and less error-prone.  e.g. when you use next functions directly, it's
 way easier to circumvent locking requirements in a way which isn't
 very obvious. 

The whole point behind mem_cgroup_iter is to hide all the complexity
behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree
for !reclaim case and mem_cgroup_iter otherwise.

 So, unless it messes up the code too much (and I can't see why it
 would), I'd much prefer if memcg used for_each_*() macros.

As I said this would mean that the current mem_cgroup_iter code would
have to be inverted which doesn't simplify the code much. I'd rather
hide all the grossy details inside the memcg iterator.
Or am I still missing your suggestion?
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Tejun Heo
Hello, Michal.

On Thu, Nov 15, 2012 at 04:12:55PM +0100, Michal Hocko wrote:
  Because I'd like to consider the next functions as implementation
  detail, and having interations structred as loops tend to read better
  and less error-prone.  e.g. when you use next functions directly, it's
  way easier to circumvent locking requirements in a way which isn't
  very obvious. 
 
 The whole point behind mem_cgroup_iter is to hide all the complexity
 behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree
 for !reclaim case and mem_cgroup_iter otherwise.
 
  So, unless it messes up the code too much (and I can't see why it
  would), I'd much prefer if memcg used for_each_*() macros.
 
 As I said this would mean that the current mem_cgroup_iter code would
 have to be inverted which doesn't simplify the code much. I'd rather
 hide all the grossy details inside the memcg iterator.
 Or am I still missing your suggestion?

One way or the other, I don't think the code complexity would change
much.  Again, I'd much *prefer* if memcg used what other controllers
would be using, but that's a preference and if necessary we can keep
the next functions as exposed APIs.  I think the issue I have is that
I can't see much technical justification for that.  If the code
becomes much simpler by choosing one over the other, sure, but is that
the case here?  Isn't it mostly just about where to put the same
things?  If so, what would be the rationale for requiring a different
interface?

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-15 Thread Michal Hocko
On Thu 15-11-12 07:31:24, Tejun Heo wrote:
 Hello, Michal.
 
 On Thu, Nov 15, 2012 at 04:12:55PM +0100, Michal Hocko wrote:
   Because I'd like to consider the next functions as implementation
   detail, and having interations structred as loops tend to read better
   and less error-prone.  e.g. when you use next functions directly, it's
   way easier to circumvent locking requirements in a way which isn't
   very obvious. 
  
  The whole point behind mem_cgroup_iter is to hide all the complexity
  behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree
  for !reclaim case and mem_cgroup_iter otherwise.
  
   So, unless it messes up the code too much (and I can't see why it
   would), I'd much prefer if memcg used for_each_*() macros.
  
  As I said this would mean that the current mem_cgroup_iter code would
  have to be inverted which doesn't simplify the code much. I'd rather
  hide all the grossy details inside the memcg iterator.
  Or am I still missing your suggestion?
 
 One way or the other, I don't think the code complexity would change
 much.  Again, I'd much *prefer* if memcg used what other controllers
 would be using, but that's a preference and if necessary we can keep
 the next functions as exposed APIs. 

Yes please.

 I think the issue I have is that I can't see much technical
 justification for that. If the code becomes much simpler by choosing
 one over the other, sure, but is that the case here?

Yes and I've tried to say that already. Memcg needs hierarchy, css
ref counting and concurrent reclaim (per-zone per-priority) aware
iteration. All of that is hidden in mem_cgroup_iter currently so the
caller doesn't have to care about it at all. Which makes shrink_zone
not care about memcg that much.

cgroup_for_each_descendant_pre is not suitable at least because it
doesn't provide a way to start a walk at a selected node (which is
shared per-zone per-priority in memcg case).
Even if cgroup_for_each_descendant_pre had start parameter there
is still a lot of house keeping that callers would have to handle
(css_tryget to start with, update of the cached possible not mentioning
use_hierarchy thingy or mem_cgroup_disabled).
We also try to not pollute mm/vmscan.c as much as possible so we
definitely do not want to bring all this into shrink_zone.

This all sounds like too much of a hassle if it is exposed so I would
really like to stay with mem_cgroup_iter and slowly simplify it until it
can go away (if that is possible at all).

 Isn't it mostly just about where to put the same things?

Unfortunately no. We wouldn't grow own iterator in such a case.

 If so, what would be the rationale for requiring a different
 interface?

Does the above explain it?

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-14 Thread Kamezawa Hiroyuki

(2012/11/14 19:10), Michal Hocko wrote:

On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote:

(2012/11/14 0:30), Michal Hocko wrote:

[...]

@@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = >reclaim_iter[reclaim->priority];
spin_lock(>iter_lock);
+   last_visited = iter->last_visited;
if (prev && reclaim->generation != iter->generation) {
+   if (last_visited) {
+   mem_cgroup_put(last_visited);
+   iter->last_visited = NULL;
+   }
spin_unlock(>iter_lock);
return NULL;
}
-   id = iter->position;
}

rcu_read_lock();
-   css = css_get_next(_cgroup_subsys, id + 1, >css, );
-   if (css) {
-   if (css == >css || css_tryget(css))
-   memcg = mem_cgroup_from_css(css);
-   } else
-   id = 0;
-   rcu_read_unlock();
+   /*
+* Root is not visited by cgroup iterators so it needs a special
+* treatment.
+*/
+   if (!last_visited) {
+   css = >css;
+   } else {
+   struct cgroup *next_cgroup;
+
+   next_cgroup = cgroup_next_descendant_pre(
+   last_visited->css.cgroup,
+   root->css.cgroup);


Maybe I miss something but last_visited is holded by memcg's refcnt.
The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed
before memcg is freed and last_visited->css.cgroup is out of RCU cycle.
Is this safe ?


Good spotted. You are right. What I need to do is to check that the
last_visited is alive and restart from the root if not. Something like
the bellow (incremental patch on top of this one) should help, right?

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 30efd7e..c0a91a3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
spin_unlock(>iter_lock);
return NULL;
}
+   /*
+* memcg is still valid because we hold a reference but
+* its cgroup might have vanished in the meantime so
+* we have to double check it is alive and restart the
+* tree walk otherwise.
+*/
+   if (last_visited && !css_tryget(_visited->css)) {
+   mem_cgroup_put(last_visited);
+   last_visited = NULL;
+   }
}

rcu_read_lock();
@@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
if (reclaim) {
struct mem_cgroup *curr = memcg;

-   if (last_visited)
+   if (last_visited) {
+   css_put(_visited->css);
mem_cgroup_put(last_visited);
+   }

if (css && !memcg)
curr = mem_cgroup_from_css(css);



I think this will work.

Thanks,
-Kame


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-14 Thread Tejun Heo
Hello, Michal.

On Wed, Nov 14, 2012 at 09:51:29AM +0100, Michal Hocko wrote:
> > reclaim(root);
> > for_each_descendent_pre()
> > reclaim(descendant);
> 
> We cannot do for_each_descendent_pre here because we do not iterate
> through the whole hierarchy all the time. Check shrink_zone.

I'm a bit confused.  Why would that make any difference?  Shouldn't it
be just able to test the condition and continue?

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-14 Thread Michal Hocko
On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote:
> (2012/11/14 0:30), Michal Hocko wrote:
[...]
> > @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct 
> > mem_cgroup *root,
> > mz = mem_cgroup_zoneinfo(root, nid, zid);
> > iter = >reclaim_iter[reclaim->priority];
> > spin_lock(>iter_lock);
> > +   last_visited = iter->last_visited;
> > if (prev && reclaim->generation != iter->generation) {
> > +   if (last_visited) {
> > +   mem_cgroup_put(last_visited);
> > +   iter->last_visited = NULL;
> > +   }
> > spin_unlock(>iter_lock);
> > return NULL;
> > }
> > -   id = iter->position;
> > }
> >   
> > rcu_read_lock();
> > -   css = css_get_next(_cgroup_subsys, id + 1, >css, );
> > -   if (css) {
> > -   if (css == >css || css_tryget(css))
> > -   memcg = mem_cgroup_from_css(css);
> > -   } else
> > -   id = 0;
> > -   rcu_read_unlock();
> > +   /*
> > +* Root is not visited by cgroup iterators so it needs a special
> > +* treatment.
> > +*/
> > +   if (!last_visited) {
> > +   css = >css;
> > +   } else {
> > +   struct cgroup *next_cgroup;
> > +
> > +   next_cgroup = cgroup_next_descendant_pre(
> > +   last_visited->css.cgroup,
> > +   root->css.cgroup);
> 
> Maybe I miss something but last_visited is holded by memcg's refcnt.
> The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed
> before memcg is freed and last_visited->css.cgroup is out of RCU cycle.
> Is this safe ?

Good spotted. You are right. What I need to do is to check that the
last_visited is alive and restart from the root if not. Something like
the bellow (incremental patch on top of this one) should help, right?

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 30efd7e..c0a91a3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
spin_unlock(>iter_lock);
return NULL;
}
+   /*
+* memcg is still valid because we hold a reference but
+* its cgroup might have vanished in the meantime so
+* we have to double check it is alive and restart the
+* tree walk otherwise.
+*/
+   if (last_visited && !css_tryget(_visited->css)) {
+   mem_cgroup_put(last_visited);
+   last_visited = NULL;
+   }
}
 
rcu_read_lock();
@@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
if (reclaim) {
struct mem_cgroup *curr = memcg;
 
-   if (last_visited)
+   if (last_visited) {
+   css_put(_visited->css);
mem_cgroup_put(last_visited);
+   }
 
if (css && !memcg)
curr = mem_cgroup_from_css(css);
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-14 Thread Michal Hocko
On Tue 13-11-12 08:14:42, Tejun Heo wrote:
> On Tue, Nov 13, 2012 at 04:30:36PM +0100, Michal Hocko wrote:
> > @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
> > *root,
> >struct mem_cgroup *prev,
> >struct mem_cgroup_reclaim_cookie *reclaim)
> >  {
> > -   struct mem_cgroup *memcg = NULL;
> > -   int id = 0;
> > +   struct mem_cgroup *memcg = NULL,
> > + *last_visited = NULL;
> 
> Nitpick but please don't do this.

OK, will make it grep friendlier;

> > +   /*
> > +* Root is not visited by cgroup iterators so it needs a special
> > +* treatment.
> > +*/
> > +   if (!last_visited) {
> > +   css = >css;
> > +   } else {
> > +   struct cgroup *next_cgroup;
> > +
> > +   next_cgroup = cgroup_next_descendant_pre(
> > +   last_visited->css.cgroup,
> > +   root->css.cgroup);
> > +   if (next_cgroup)
> > +   css = cgroup_subsys_state(next_cgroup,
> > +   mem_cgroup_subsys_id);
> 
> Hmmm... wouldn't it be better to move the reclaim logic into a
> function and do the following?
> 
>   reclaim(root);
>   for_each_descendent_pre()
>   reclaim(descendant);

We cannot do for_each_descendent_pre here because we do not iterate
through the whole hierarchy all the time. Check shrink_zone.

> If this is a problem, I'd be happy to add a iterator which includes
> the top node.  

This would help with the above if-else but I do not think this is the
worst thing in the function ;)

> I'd prefer controllers not using the next functions directly.

Well, we will need to use it directly because of the single group
reclaim mentioned above.

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-14 Thread Michal Hocko
On Tue 13-11-12 08:14:42, Tejun Heo wrote:
 On Tue, Nov 13, 2012 at 04:30:36PM +0100, Michal Hocko wrote:
  @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
  *root,
 struct mem_cgroup *prev,
 struct mem_cgroup_reclaim_cookie *reclaim)
   {
  -   struct mem_cgroup *memcg = NULL;
  -   int id = 0;
  +   struct mem_cgroup *memcg = NULL,
  + *last_visited = NULL;
 
 Nitpick but please don't do this.

OK, will make it grep friendlier;

  +   /*
  +* Root is not visited by cgroup iterators so it needs a special
  +* treatment.
  +*/
  +   if (!last_visited) {
  +   css = root-css;
  +   } else {
  +   struct cgroup *next_cgroup;
  +
  +   next_cgroup = cgroup_next_descendant_pre(
  +   last_visited-css.cgroup,
  +   root-css.cgroup);
  +   if (next_cgroup)
  +   css = cgroup_subsys_state(next_cgroup,
  +   mem_cgroup_subsys_id);
 
 Hmmm... wouldn't it be better to move the reclaim logic into a
 function and do the following?
 
   reclaim(root);
   for_each_descendent_pre()
   reclaim(descendant);

We cannot do for_each_descendent_pre here because we do not iterate
through the whole hierarchy all the time. Check shrink_zone.

 If this is a problem, I'd be happy to add a iterator which includes
 the top node.  

This would help with the above if-else but I do not think this is the
worst thing in the function ;)

 I'd prefer controllers not using the next functions directly.

Well, we will need to use it directly because of the single group
reclaim mentioned above.

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-14 Thread Michal Hocko
On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote:
 (2012/11/14 0:30), Michal Hocko wrote:
[...]
  @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct 
  mem_cgroup *root,
  mz = mem_cgroup_zoneinfo(root, nid, zid);
  iter = mz-reclaim_iter[reclaim-priority];
  spin_lock(iter-iter_lock);
  +   last_visited = iter-last_visited;
  if (prev  reclaim-generation != iter-generation) {
  +   if (last_visited) {
  +   mem_cgroup_put(last_visited);
  +   iter-last_visited = NULL;
  +   }
  spin_unlock(iter-iter_lock);
  return NULL;
  }
  -   id = iter-position;
  }

  rcu_read_lock();
  -   css = css_get_next(mem_cgroup_subsys, id + 1, root-css, id);
  -   if (css) {
  -   if (css == root-css || css_tryget(css))
  -   memcg = mem_cgroup_from_css(css);
  -   } else
  -   id = 0;
  -   rcu_read_unlock();
  +   /*
  +* Root is not visited by cgroup iterators so it needs a special
  +* treatment.
  +*/
  +   if (!last_visited) {
  +   css = root-css;
  +   } else {
  +   struct cgroup *next_cgroup;
  +
  +   next_cgroup = cgroup_next_descendant_pre(
  +   last_visited-css.cgroup,
  +   root-css.cgroup);
 
 Maybe I miss something but last_visited is holded by memcg's refcnt.
 The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed
 before memcg is freed and last_visited-css.cgroup is out of RCU cycle.
 Is this safe ?

Good spotted. You are right. What I need to do is to check that the
last_visited is alive and restart from the root if not. Something like
the bellow (incremental patch on top of this one) should help, right?

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 30efd7e..c0a91a3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
spin_unlock(iter-iter_lock);
return NULL;
}
+   /*
+* memcg is still valid because we hold a reference but
+* its cgroup might have vanished in the meantime so
+* we have to double check it is alive and restart the
+* tree walk otherwise.
+*/
+   if (last_visited  !css_tryget(last_visited-css)) {
+   mem_cgroup_put(last_visited);
+   last_visited = NULL;
+   }
}
 
rcu_read_lock();
@@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
if (reclaim) {
struct mem_cgroup *curr = memcg;
 
-   if (last_visited)
+   if (last_visited) {
+   css_put(last_visited-css);
mem_cgroup_put(last_visited);
+   }
 
if (css  !memcg)
curr = mem_cgroup_from_css(css);
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-14 Thread Tejun Heo
Hello, Michal.

On Wed, Nov 14, 2012 at 09:51:29AM +0100, Michal Hocko wrote:
  reclaim(root);
  for_each_descendent_pre()
  reclaim(descendant);
 
 We cannot do for_each_descendent_pre here because we do not iterate
 through the whole hierarchy all the time. Check shrink_zone.

I'm a bit confused.  Why would that make any difference?  Shouldn't it
be just able to test the condition and continue?

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-14 Thread Kamezawa Hiroyuki

(2012/11/14 19:10), Michal Hocko wrote:

On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote:

(2012/11/14 0:30), Michal Hocko wrote:

[...]

@@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = mz-reclaim_iter[reclaim-priority];
spin_lock(iter-iter_lock);
+   last_visited = iter-last_visited;
if (prev  reclaim-generation != iter-generation) {
+   if (last_visited) {
+   mem_cgroup_put(last_visited);
+   iter-last_visited = NULL;
+   }
spin_unlock(iter-iter_lock);
return NULL;
}
-   id = iter-position;
}

rcu_read_lock();
-   css = css_get_next(mem_cgroup_subsys, id + 1, root-css, id);
-   if (css) {
-   if (css == root-css || css_tryget(css))
-   memcg = mem_cgroup_from_css(css);
-   } else
-   id = 0;
-   rcu_read_unlock();
+   /*
+* Root is not visited by cgroup iterators so it needs a special
+* treatment.
+*/
+   if (!last_visited) {
+   css = root-css;
+   } else {
+   struct cgroup *next_cgroup;
+
+   next_cgroup = cgroup_next_descendant_pre(
+   last_visited-css.cgroup,
+   root-css.cgroup);


Maybe I miss something but last_visited is holded by memcg's refcnt.
The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed
before memcg is freed and last_visited-css.cgroup is out of RCU cycle.
Is this safe ?


Good spotted. You are right. What I need to do is to check that the
last_visited is alive and restart from the root if not. Something like
the bellow (incremental patch on top of this one) should help, right?

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 30efd7e..c0a91a3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
spin_unlock(iter-iter_lock);
return NULL;
}
+   /*
+* memcg is still valid because we hold a reference but
+* its cgroup might have vanished in the meantime so
+* we have to double check it is alive and restart the
+* tree walk otherwise.
+*/
+   if (last_visited  !css_tryget(last_visited-css)) {
+   mem_cgroup_put(last_visited);
+   last_visited = NULL;
+   }
}

rcu_read_lock();
@@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
if (reclaim) {
struct mem_cgroup *curr = memcg;

-   if (last_visited)
+   if (last_visited) {
+   css_put(last_visited-css);
mem_cgroup_put(last_visited);
+   }

if (css  !memcg)
curr = mem_cgroup_from_css(css);



I think this will work.

Thanks,
-Kame


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-13 Thread Kamezawa Hiroyuki
(2012/11/14 0:30), Michal Hocko wrote:
> mem_cgroup_iter curently relies on css->id when walking down a group
> hierarchy tree. This is really awkward because the tree walk depends on
> the groups creation ordering. The only guarantee is that a parent node
> is visited before its children.
> Example
>   1) mkdir -p a a/d a/b/c
>   2) mkdir -a a/b/c a/d
> Will create the same trees but the tree walks will be different:
>   1) a, d, b, c
>   2) a, b, c, d
> 
> 574bd9f7 (cgroup: implement generic child / descendant walk macros) has
> introduced generic cgroup tree walkers which provide either pre-order
> or post-order tree walk. This patch converts css->id based iteration
> to pre-order tree walk to keep the semantic with the original iterator
> where parent is always visited before its subtree.
> 
> cgroup_for_each_descendant_pre suggests using post_create and
> pre_destroy for proper synchronization with groups addidition resp.
> removal. This implementation doesn't use those because a new memory
> cgroup is fully initialized in mem_cgroup_create and css_tryget makes
> sure that the group is alive when we encounter it by iterator.
> 
> If the reclaim cookie is used we need to store the last visited group
> into the iterator so we have to be careful that it doesn't disappear in
> the mean time. Elevated reference count on the memcg guarantees that
> the group will not vanish even though it has been already removed from
> the tree. In such a case css_tryget will fail and the iteration is
> retried (groups are linked with RCU safe lists so the forward progress
> is still possible). iter_lock will make sure that only one reclaimer
> will see the last_visited group and the reference count game around it.
> 
> Signed-off-by: Michal Hocko 
> ---
>   mm/memcontrol.c |   64 
> ++-
>   1 file changed, 49 insertions(+), 15 deletions(-)
> 
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 0fe5177..5da1e58 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -142,8 +142,8 @@ struct mem_cgroup_stat_cpu {
>   };
>   
>   struct mem_cgroup_reclaim_iter {
> - /* css_id of the last scanned hierarchy member */
> - int position;
> + /* last scanned hierarchy member with elevated ref count */
> + struct mem_cgroup *last_visited;
>   /* scan generation, increased every round-trip */
>   unsigned int generation;
>   /* lock to protect the position and generation */
> @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
> *root,
>  struct mem_cgroup *prev,
>  struct mem_cgroup_reclaim_cookie *reclaim)
>   {
> - struct mem_cgroup *memcg = NULL;
> - int id = 0;
> + struct mem_cgroup *memcg = NULL,
> +   *last_visited = NULL;
>   
>   if (mem_cgroup_disabled())
>   return NULL;
> @@ -1073,7 +1073,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
> *root,
>   root = root_mem_cgroup;
>   
>   if (prev && !reclaim)
> - id = css_id(>css);
> + last_visited = prev;
>   
>   if (prev && prev != root)
>   css_put(>css);
> @@ -1086,7 +1086,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
> *root,
>   
>   while (!memcg) {
>   struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
> - struct cgroup_subsys_state *css;
> + struct cgroup_subsys_state *css = NULL;
>   
>   if (reclaim) {
>   int nid = zone_to_nid(reclaim->zone);
> @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
> *root,
>   mz = mem_cgroup_zoneinfo(root, nid, zid);
>   iter = >reclaim_iter[reclaim->priority];
>   spin_lock(>iter_lock);
> + last_visited = iter->last_visited;
>   if (prev && reclaim->generation != iter->generation) {
> + if (last_visited) {
> + mem_cgroup_put(last_visited);
> + iter->last_visited = NULL;
> + }
>   spin_unlock(>iter_lock);
>   return NULL;
>   }
> - id = iter->position;
>   }
>   
>   rcu_read_lock();
> - css = css_get_next(_cgroup_subsys, id + 1, >css, );
> - if (css) {
> - if (css == >css || css_tryget(css))
> - memcg = mem_cgroup_from_css(css);
> - } else
> - id = 0;
> - rcu_read_unlock();
> + /*
> +  * Root is not visited by cgroup iterators so it needs a special
> +  * treatment.
> +  */
> + if (!last_visited) {
> +

Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-13 Thread Tejun Heo
On Tue, Nov 13, 2012 at 04:30:36PM +0100, Michal Hocko wrote:
> @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
> *root,
>  struct mem_cgroup *prev,
>  struct mem_cgroup_reclaim_cookie *reclaim)
>  {
> - struct mem_cgroup *memcg = NULL;
> - int id = 0;
> + struct mem_cgroup *memcg = NULL,
> +   *last_visited = NULL;

Nitpick but please don't do this.

> + /*
> +  * Root is not visited by cgroup iterators so it needs a special
> +  * treatment.
> +  */
> + if (!last_visited) {
> + css = >css;
> + } else {
> + struct cgroup *next_cgroup;
> +
> + next_cgroup = cgroup_next_descendant_pre(
> + last_visited->css.cgroup,
> + root->css.cgroup);
> + if (next_cgroup)
> + css = cgroup_subsys_state(next_cgroup,
> + mem_cgroup_subsys_id);

Hmmm... wouldn't it be better to move the reclaim logic into a
function and do the following?

reclaim(root);
for_each_descendent_pre()
reclaim(descendant);

If this is a problem, I'd be happy to add a iterator which includes
the top node.  I'd prefer controllers not using the next functions
directly.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-13 Thread Tejun Heo
On Tue, Nov 13, 2012 at 04:30:36PM +0100, Michal Hocko wrote:
 @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
 *root,
  struct mem_cgroup *prev,
  struct mem_cgroup_reclaim_cookie *reclaim)
  {
 - struct mem_cgroup *memcg = NULL;
 - int id = 0;
 + struct mem_cgroup *memcg = NULL,
 +   *last_visited = NULL;

Nitpick but please don't do this.

 + /*
 +  * Root is not visited by cgroup iterators so it needs a special
 +  * treatment.
 +  */
 + if (!last_visited) {
 + css = root-css;
 + } else {
 + struct cgroup *next_cgroup;
 +
 + next_cgroup = cgroup_next_descendant_pre(
 + last_visited-css.cgroup,
 + root-css.cgroup);
 + if (next_cgroup)
 + css = cgroup_subsys_state(next_cgroup,
 + mem_cgroup_subsys_id);

Hmmm... wouldn't it be better to move the reclaim logic into a
function and do the following?

reclaim(root);
for_each_descendent_pre()
reclaim(descendant);

If this is a problem, I'd be happy to add a iterator which includes
the top node.  I'd prefer controllers not using the next functions
directly.

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators

2012-11-13 Thread Kamezawa Hiroyuki
(2012/11/14 0:30), Michal Hocko wrote:
 mem_cgroup_iter curently relies on css-id when walking down a group
 hierarchy tree. This is really awkward because the tree walk depends on
 the groups creation ordering. The only guarantee is that a parent node
 is visited before its children.
 Example
   1) mkdir -p a a/d a/b/c
   2) mkdir -a a/b/c a/d
 Will create the same trees but the tree walks will be different:
   1) a, d, b, c
   2) a, b, c, d
 
 574bd9f7 (cgroup: implement generic child / descendant walk macros) has
 introduced generic cgroup tree walkers which provide either pre-order
 or post-order tree walk. This patch converts css-id based iteration
 to pre-order tree walk to keep the semantic with the original iterator
 where parent is always visited before its subtree.
 
 cgroup_for_each_descendant_pre suggests using post_create and
 pre_destroy for proper synchronization with groups addidition resp.
 removal. This implementation doesn't use those because a new memory
 cgroup is fully initialized in mem_cgroup_create and css_tryget makes
 sure that the group is alive when we encounter it by iterator.
 
 If the reclaim cookie is used we need to store the last visited group
 into the iterator so we have to be careful that it doesn't disappear in
 the mean time. Elevated reference count on the memcg guarantees that
 the group will not vanish even though it has been already removed from
 the tree. In such a case css_tryget will fail and the iteration is
 retried (groups are linked with RCU safe lists so the forward progress
 is still possible). iter_lock will make sure that only one reclaimer
 will see the last_visited group and the reference count game around it.
 
 Signed-off-by: Michal Hocko mho...@suse.cz
 ---
   mm/memcontrol.c |   64 
 ++-
   1 file changed, 49 insertions(+), 15 deletions(-)
 
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 0fe5177..5da1e58 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
 @@ -142,8 +142,8 @@ struct mem_cgroup_stat_cpu {
   };
   
   struct mem_cgroup_reclaim_iter {
 - /* css_id of the last scanned hierarchy member */
 - int position;
 + /* last scanned hierarchy member with elevated ref count */
 + struct mem_cgroup *last_visited;
   /* scan generation, increased every round-trip */
   unsigned int generation;
   /* lock to protect the position and generation */
 @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
 *root,
  struct mem_cgroup *prev,
  struct mem_cgroup_reclaim_cookie *reclaim)
   {
 - struct mem_cgroup *memcg = NULL;
 - int id = 0;
 + struct mem_cgroup *memcg = NULL,
 +   *last_visited = NULL;
   
   if (mem_cgroup_disabled())
   return NULL;
 @@ -1073,7 +1073,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
 *root,
   root = root_mem_cgroup;
   
   if (prev  !reclaim)
 - id = css_id(prev-css);
 + last_visited = prev;
   
   if (prev  prev != root)
   css_put(prev-css);
 @@ -1086,7 +1086,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
 *root,
   
   while (!memcg) {
   struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 - struct cgroup_subsys_state *css;
 + struct cgroup_subsys_state *css = NULL;
   
   if (reclaim) {
   int nid = zone_to_nid(reclaim-zone);
 @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
 *root,
   mz = mem_cgroup_zoneinfo(root, nid, zid);
   iter = mz-reclaim_iter[reclaim-priority];
   spin_lock(iter-iter_lock);
 + last_visited = iter-last_visited;
   if (prev  reclaim-generation != iter-generation) {
 + if (last_visited) {
 + mem_cgroup_put(last_visited);
 + iter-last_visited = NULL;
 + }
   spin_unlock(iter-iter_lock);
   return NULL;
   }
 - id = iter-position;
   }
   
   rcu_read_lock();
 - css = css_get_next(mem_cgroup_subsys, id + 1, root-css, id);
 - if (css) {
 - if (css == root-css || css_tryget(css))
 - memcg = mem_cgroup_from_css(css);
 - } else
 - id = 0;
 - rcu_read_unlock();
 + /*
 +  * Root is not visited by cgroup iterators so it needs a special
 +  * treatment.
 +  */
 + if (!last_visited) {
 + css = root-css;
 + } else {
 + struct cgroup