Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Tue 13-11-12 16:30:36, Michal Hocko wrote: [...] > + /* > + * Root is not visited by cgroup iterators so it needs a special > + * treatment. > + */ > + if (!last_visited) { > + css = >css; > + } else { > + struct cgroup *next_cgroup; > + > + next_cgroup = cgroup_next_descendant_pre( > + last_visited->css.cgroup, > + root->css.cgroup); > + if (next_cgroup) > + css = cgroup_subsys_state(next_cgroup, > + mem_cgroup_subsys_id); This is not correct because cgroup_next_descendant_pre expects pos to be NULL for the first iteration but the way we do iterate (visit the root first) means that the second iteration will have last_visited != NULL and if root doesn't have any children the iteration would go unleashed to to the endless loop. We need something like: struct cgroup *prev_cgroup = (last_visited == root) ? NULL : last_visited->css.cgroup; next_cgroup = cgroup_next_descendant_pre(prev_cgroup, root->css.gtoup); -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Wed 14-11-12 11:10:52, Michal Hocko wrote: > On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote: > > (2012/11/14 0:30), Michal Hocko wrote: > [...] > > > @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct > > > mem_cgroup *root, > > > mz = mem_cgroup_zoneinfo(root, nid, zid); > > > iter = >reclaim_iter[reclaim->priority]; > > > spin_lock(>iter_lock); > > > + last_visited = iter->last_visited; > > > if (prev && reclaim->generation != > > > iter->generation) { > > > + if (last_visited) { > > > + mem_cgroup_put(last_visited); > > > + iter->last_visited = NULL; > > > + } > > > spin_unlock(>iter_lock); > > > return NULL; > > > } > > > - id = iter->position; > > > } > > > > > > rcu_read_lock(); > > > - css = css_get_next(_cgroup_subsys, id + 1, >css, ); > > > - if (css) { > > > - if (css == >css || css_tryget(css)) > > > - memcg = mem_cgroup_from_css(css); > > > - } else > > > - id = 0; > > > - rcu_read_unlock(); > > > + /* > > > + * Root is not visited by cgroup iterators so it needs a special > > > + * treatment. > > > + */ > > > + if (!last_visited) { > > > + css = >css; > > > + } else { > > > + struct cgroup *next_cgroup; > > > + > > > + next_cgroup = cgroup_next_descendant_pre( > > > + last_visited->css.cgroup, > > > + root->css.cgroup); > > > > Maybe I miss something but last_visited is holded by memcg's refcnt. > > The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed > > before memcg is freed and last_visited->css.cgroup is out of RCU cycle. > > Is this safe ? > > Good spotted. You are right. What I need to do is to check that the > last_visited is alive and restart from the root if not. Something like > the bellow (incremental patch on top of this one) should help, right? > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 30efd7e..c0a91a3 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup > *root, > spin_unlock(>iter_lock); > return NULL; > } > + /* > + * memcg is still valid because we hold a reference but > + * its cgroup might have vanished in the meantime so > + * we have to double check it is alive and restart the > + * tree walk otherwise. > + */ > + if (last_visited && !css_tryget(_visited->css)) { > + mem_cgroup_put(last_visited); > + last_visited = NULL; > + } > } > > rcu_read_lock(); > @@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup > *root, > if (reclaim) { > struct mem_cgroup *curr = memcg; > > - if (last_visited) > + if (last_visited) { > + css_put(_visited->css); > mem_cgroup_put(last_visited); > + } > > if (css && !memcg) > curr = mem_cgroup_from_css(css); Now that I think about it again it seems that this is more complicated than necessary. It should be sufficient to hold css' reference for the iter->last_visited because this makes sure that the cgroup won't go away same as mem_cgroup. Memcg reference counting + css_tryget just makes the situation more complicated because it forces us to retry the iteration on css_tryget failure as the cgroup is gone already and we have no point to continue other than start all over again. Which is, ehmm, _really_ ugly. I will repost the updated version sometime this week after it passes some testing. -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Wed 14-11-12 11:10:52, Michal Hocko wrote: On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote: (2012/11/14 0:30), Michal Hocko wrote: [...] @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = mz-reclaim_iter[reclaim-priority]; spin_lock(iter-iter_lock); + last_visited = iter-last_visited; if (prev reclaim-generation != iter-generation) { + if (last_visited) { + mem_cgroup_put(last_visited); + iter-last_visited = NULL; + } spin_unlock(iter-iter_lock); return NULL; } - id = iter-position; } rcu_read_lock(); - css = css_get_next(mem_cgroup_subsys, id + 1, root-css, id); - if (css) { - if (css == root-css || css_tryget(css)) - memcg = mem_cgroup_from_css(css); - } else - id = 0; - rcu_read_unlock(); + /* + * Root is not visited by cgroup iterators so it needs a special + * treatment. + */ + if (!last_visited) { + css = root-css; + } else { + struct cgroup *next_cgroup; + + next_cgroup = cgroup_next_descendant_pre( + last_visited-css.cgroup, + root-css.cgroup); Maybe I miss something but last_visited is holded by memcg's refcnt. The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed before memcg is freed and last_visited-css.cgroup is out of RCU cycle. Is this safe ? Good spotted. You are right. What I need to do is to check that the last_visited is alive and restart from the root if not. Something like the bellow (incremental patch on top of this one) should help, right? diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 30efd7e..c0a91a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, spin_unlock(iter-iter_lock); return NULL; } + /* + * memcg is still valid because we hold a reference but + * its cgroup might have vanished in the meantime so + * we have to double check it is alive and restart the + * tree walk otherwise. + */ + if (last_visited !css_tryget(last_visited-css)) { + mem_cgroup_put(last_visited); + last_visited = NULL; + } } rcu_read_lock(); @@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (reclaim) { struct mem_cgroup *curr = memcg; - if (last_visited) + if (last_visited) { + css_put(last_visited-css); mem_cgroup_put(last_visited); + } if (css !memcg) curr = mem_cgroup_from_css(css); Now that I think about it again it seems that this is more complicated than necessary. It should be sufficient to hold css' reference for the iter-last_visited because this makes sure that the cgroup won't go away same as mem_cgroup. Memcg reference counting + css_tryget just makes the situation more complicated because it forces us to retry the iteration on css_tryget failure as the cgroup is gone already and we have no point to continue other than start all over again. Which is, ehmm, _really_ ugly. I will repost the updated version sometime this week after it passes some testing. -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Tue 13-11-12 16:30:36, Michal Hocko wrote: [...] + /* + * Root is not visited by cgroup iterators so it needs a special + * treatment. + */ + if (!last_visited) { + css = root-css; + } else { + struct cgroup *next_cgroup; + + next_cgroup = cgroup_next_descendant_pre( + last_visited-css.cgroup, + root-css.cgroup); + if (next_cgroup) + css = cgroup_subsys_state(next_cgroup, + mem_cgroup_subsys_id); This is not correct because cgroup_next_descendant_pre expects pos to be NULL for the first iteration but the way we do iterate (visit the root first) means that the second iteration will have last_visited != NULL and if root doesn't have any children the iteration would go unleashed to to the endless loop. We need something like: struct cgroup *prev_cgroup = (last_visited == root) ? NULL : last_visited-css.cgroup; next_cgroup = cgroup_next_descendant_pre(prev_cgroup, root-css.gtoup); -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Thu 15-11-12 07:31:24, Tejun Heo wrote: > Hello, Michal. > > On Thu, Nov 15, 2012 at 04:12:55PM +0100, Michal Hocko wrote: > > > Because I'd like to consider the next functions as implementation > > > detail, and having interations structred as loops tend to read better > > > and less error-prone. e.g. when you use next functions directly, it's > > > way easier to circumvent locking requirements in a way which isn't > > > very obvious. > > > > The whole point behind mem_cgroup_iter is to hide all the complexity > > behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree > > for !reclaim case and mem_cgroup_iter otherwise. > > > > > So, unless it messes up the code too much (and I can't see why it > > > would), I'd much prefer if memcg used for_each_*() macros. > > > > As I said this would mean that the current mem_cgroup_iter code would > > have to be inverted which doesn't simplify the code much. I'd rather > > hide all the grossy details inside the memcg iterator. > > Or am I still missing your suggestion? > > One way or the other, I don't think the code complexity would change > much. Again, I'd much *prefer* if memcg used what other controllers > would be using, but that's a preference and if necessary we can keep > the next functions as exposed APIs. Yes please. > I think the issue I have is that I can't see much technical > justification for that. If the code becomes much simpler by choosing > one over the other, sure, but is that the case here? Yes and I've tried to say that already. Memcg needs hierarchy, css ref counting and concurrent reclaim (per-zone per-priority) aware iteration. All of that is hidden in mem_cgroup_iter currently so the caller doesn't have to care about it at all. Which makes shrink_zone not care about memcg that much. cgroup_for_each_descendant_pre is not suitable at least because it doesn't provide a way to start a walk at a selected node (which is shared per-zone per-priority in memcg case). Even if cgroup_for_each_descendant_pre had start parameter there is still a lot of house keeping that callers would have to handle (css_tryget to start with, update of the cached possible not mentioning use_hierarchy thingy or mem_cgroup_disabled). We also try to not pollute mm/vmscan.c as much as possible so we definitely do not want to bring all this into shrink_zone. This all sounds like too much of a hassle if it is exposed so I would really like to stay with mem_cgroup_iter and slowly simplify it until it can go away (if that is possible at all). > Isn't it mostly just about where to put the same things? Unfortunately no. We wouldn't grow own iterator in such a case. > If so, what would be the rationale for requiring a different > interface? Does the above explain it? -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
Hello, Michal. On Thu, Nov 15, 2012 at 04:12:55PM +0100, Michal Hocko wrote: > > Because I'd like to consider the next functions as implementation > > detail, and having interations structred as loops tend to read better > > and less error-prone. e.g. when you use next functions directly, it's > > way easier to circumvent locking requirements in a way which isn't > > very obvious. > > The whole point behind mem_cgroup_iter is to hide all the complexity > behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree > for !reclaim case and mem_cgroup_iter otherwise. > > > So, unless it messes up the code too much (and I can't see why it > > would), I'd much prefer if memcg used for_each_*() macros. > > As I said this would mean that the current mem_cgroup_iter code would > have to be inverted which doesn't simplify the code much. I'd rather > hide all the grossy details inside the memcg iterator. > Or am I still missing your suggestion? One way or the other, I don't think the code complexity would change much. Again, I'd much *prefer* if memcg used what other controllers would be using, but that's a preference and if necessary we can keep the next functions as exposed APIs. I think the issue I have is that I can't see much technical justification for that. If the code becomes much simpler by choosing one over the other, sure, but is that the case here? Isn't it mostly just about where to put the same things? If so, what would be the rationale for requiring a different interface? Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Thu 15-11-12 06:47:32, Tejun Heo wrote: > Hello, Michal. > > On Thu, Nov 15, 2012 at 10:51:03AM +0100, Michal Hocko wrote: > > > I'm a bit confused. Why would that make any difference? Shouldn't it > > > be just able to test the condition and continue? > > > > Ohh, I misunderstood your proposal. So what you are suggesting is > > to put all the logic we have in mem_cgroup_iter inside what you call > > reclaim here + mem_cgroup_iter_break inside the loop, right? > > > > I do not see how this would help us much. mem_cgroup_iter is not the > > nicest piece of code but it handles quite a complex requirements that we > > have currently (css reference count, multiple reclaimers racing). So I > > would rather keep it this way. Further simplifications are welcome of > > course. > > > > Is there any reason why you are not happy about direct using of > > cgroup_next_descendant_pre? > > Because I'd like to consider the next functions as implementation > detail, and having interations structred as loops tend to read better > and less error-prone. e.g. when you use next functions directly, it's > way easier to circumvent locking requirements in a way which isn't > very obvious. The whole point behind mem_cgroup_iter is to hide all the complexity behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree for !reclaim case and mem_cgroup_iter otherwise. > So, unless it messes up the code too much (and I can't see why it > would), I'd much prefer if memcg used for_each_*() macros. As I said this would mean that the current mem_cgroup_iter code would have to be inverted which doesn't simplify the code much. I'd rather hide all the grossy details inside the memcg iterator. Or am I still missing your suggestion? -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
Hello, Michal. On Thu, Nov 15, 2012 at 10:51:03AM +0100, Michal Hocko wrote: > > I'm a bit confused. Why would that make any difference? Shouldn't it > > be just able to test the condition and continue? > > Ohh, I misunderstood your proposal. So what you are suggesting is > to put all the logic we have in mem_cgroup_iter inside what you call > reclaim here + mem_cgroup_iter_break inside the loop, right? > > I do not see how this would help us much. mem_cgroup_iter is not the > nicest piece of code but it handles quite a complex requirements that we > have currently (css reference count, multiple reclaimers racing). So I > would rather keep it this way. Further simplifications are welcome of > course. > > Is there any reason why you are not happy about direct using of > cgroup_next_descendant_pre? Because I'd like to consider the next functions as implementation detail, and having interations structred as loops tend to read better and less error-prone. e.g. when you use next functions directly, it's way easier to circumvent locking requirements in a way which isn't very obvious. So, unless it messes up the code too much (and I can't see why it would), I'd much prefer if memcg used for_each_*() macros. Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Thu 15-11-12 13:12:38, KAMEZAWA Hiroyuki wrote: > (2012/11/14 19:10), Michal Hocko wrote: > >On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote: > >>(2012/11/14 0:30), Michal Hocko wrote: > >[...] > >>>@@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct > >>>mem_cgroup *root, > >>> mz = mem_cgroup_zoneinfo(root, nid, zid); > >>> iter = >reclaim_iter[reclaim->priority]; > >>> spin_lock(>iter_lock); > >>>+ last_visited = iter->last_visited; > >>> if (prev && reclaim->generation != > >>> iter->generation) { > >>>+ if (last_visited) { > >>>+ mem_cgroup_put(last_visited); > >>>+ iter->last_visited = NULL; > >>>+ } > >>> spin_unlock(>iter_lock); > >>> return NULL; > >>> } > >>>- id = iter->position; > >>> } > >>> > >>> rcu_read_lock(); > >>>- css = css_get_next(_cgroup_subsys, id + 1, >css, ); > >>>- if (css) { > >>>- if (css == >css || css_tryget(css)) > >>>- memcg = mem_cgroup_from_css(css); > >>>- } else > >>>- id = 0; > >>>- rcu_read_unlock(); > >>>+ /* > >>>+ * Root is not visited by cgroup iterators so it needs a special > >>>+ * treatment. > >>>+ */ > >>>+ if (!last_visited) { > >>>+ css = >css; > >>>+ } else { > >>>+ struct cgroup *next_cgroup; > >>>+ > >>>+ next_cgroup = cgroup_next_descendant_pre( > >>>+ last_visited->css.cgroup, > >>>+ root->css.cgroup); > >> > >>Maybe I miss something but last_visited is holded by memcg's refcnt. > >>The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed > >>before memcg is freed and last_visited->css.cgroup is out of RCU cycle. > >>Is this safe ? > > > >Good spotted. You are right. What I need to do is to check that the > >last_visited is alive and restart from the root if not. Something like > >the bellow (incremental patch on top of this one) should help, right? > > > >diff --git a/mm/memcontrol.c b/mm/memcontrol.c > >index 30efd7e..c0a91a3 100644 > >--- a/mm/memcontrol.c > >+++ b/mm/memcontrol.c > >@@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup > >*root, > > spin_unlock(>iter_lock); > > return NULL; > > } > >+/* > >+ * memcg is still valid because we hold a reference but > >+ * its cgroup might have vanished in the meantime so > >+ * we have to double check it is alive and restart the > >+ * tree walk otherwise. > >+ */ > >+if (last_visited && !css_tryget(_visited->css)) { > >+mem_cgroup_put(last_visited); > >+last_visited = NULL; > >+} > > } > > > > rcu_read_lock(); > >@@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup > >*root, > > if (reclaim) { > > struct mem_cgroup *curr = memcg; > > > >-if (last_visited) > >+if (last_visited) { > >+css_put(_visited->css); > > mem_cgroup_put(last_visited); > >+} > > > > if (css && !memcg) > > curr = mem_cgroup_from_css(css); > > > > I think this will work. Thanks for double checking. The updated patch: --- >From 3811de23a0306c229ce44dc655878431a4fdf449 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 13 Nov 2012 11:40:35 +0100 Subject: [PATCH] memcg: rework mem_cgroup_iter to use cgroup iterators mem_cgroup_iter curently relies on css->id when walking down a group hierarchy tree. This is really awkward because the tree walk depends on the groups creation ordering. The only guarantee is that a parent node is visited before its children. Example 1) mkdir -p a a/d a/b/c 2) mkdir -a a/b/c a/d Will create the same trees but the tree walks will be different: 1) a, d, b, c 2) a, b, c, d 574bd9f7 (cgroup: implement generic child / descendant walk macros) has introduced generic cgroup tree walkers which provide either pre-order or post-order tree walk. This patch converts css->id based iteration to pre-order tree walk to keep the semantic with the original iterator where parent is always visited before its subtree. cgroup_for_each_descendant_pre suggests using post_create and
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Wed 14-11-12 10:52:45, Tejun Heo wrote: > Hello, Michal. > > On Wed, Nov 14, 2012 at 09:51:29AM +0100, Michal Hocko wrote: > > > reclaim(root); > > > for_each_descendent_pre() > > > reclaim(descendant); > > > > We cannot do for_each_descendent_pre here because we do not iterate > > through the whole hierarchy all the time. Check shrink_zone. > > I'm a bit confused. Why would that make any difference? Shouldn't it > be just able to test the condition and continue? Ohh, I misunderstood your proposal. So what you are suggesting is to put all the logic we have in mem_cgroup_iter inside what you call reclaim here + mem_cgroup_iter_break inside the loop, right? I do not see how this would help us much. mem_cgroup_iter is not the nicest piece of code but it handles quite a complex requirements that we have currently (css reference count, multiple reclaimers racing). So I would rather keep it this way. Further simplifications are welcome of course. Is there any reason why you are not happy about direct using of cgroup_next_descendant_pre? -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Wed 14-11-12 10:52:45, Tejun Heo wrote: Hello, Michal. On Wed, Nov 14, 2012 at 09:51:29AM +0100, Michal Hocko wrote: reclaim(root); for_each_descendent_pre() reclaim(descendant); We cannot do for_each_descendent_pre here because we do not iterate through the whole hierarchy all the time. Check shrink_zone. I'm a bit confused. Why would that make any difference? Shouldn't it be just able to test the condition and continue? Ohh, I misunderstood your proposal. So what you are suggesting is to put all the logic we have in mem_cgroup_iter inside what you call reclaim here + mem_cgroup_iter_break inside the loop, right? I do not see how this would help us much. mem_cgroup_iter is not the nicest piece of code but it handles quite a complex requirements that we have currently (css reference count, multiple reclaimers racing). So I would rather keep it this way. Further simplifications are welcome of course. Is there any reason why you are not happy about direct using of cgroup_next_descendant_pre? -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Thu 15-11-12 13:12:38, KAMEZAWA Hiroyuki wrote: (2012/11/14 19:10), Michal Hocko wrote: On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote: (2012/11/14 0:30), Michal Hocko wrote: [...] @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = mz-reclaim_iter[reclaim-priority]; spin_lock(iter-iter_lock); + last_visited = iter-last_visited; if (prev reclaim-generation != iter-generation) { + if (last_visited) { + mem_cgroup_put(last_visited); + iter-last_visited = NULL; + } spin_unlock(iter-iter_lock); return NULL; } - id = iter-position; } rcu_read_lock(); - css = css_get_next(mem_cgroup_subsys, id + 1, root-css, id); - if (css) { - if (css == root-css || css_tryget(css)) - memcg = mem_cgroup_from_css(css); - } else - id = 0; - rcu_read_unlock(); + /* + * Root is not visited by cgroup iterators so it needs a special + * treatment. + */ + if (!last_visited) { + css = root-css; + } else { + struct cgroup *next_cgroup; + + next_cgroup = cgroup_next_descendant_pre( + last_visited-css.cgroup, + root-css.cgroup); Maybe I miss something but last_visited is holded by memcg's refcnt. The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed before memcg is freed and last_visited-css.cgroup is out of RCU cycle. Is this safe ? Good spotted. You are right. What I need to do is to check that the last_visited is alive and restart from the root if not. Something like the bellow (incremental patch on top of this one) should help, right? diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 30efd7e..c0a91a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, spin_unlock(iter-iter_lock); return NULL; } +/* + * memcg is still valid because we hold a reference but + * its cgroup might have vanished in the meantime so + * we have to double check it is alive and restart the + * tree walk otherwise. + */ +if (last_visited !css_tryget(last_visited-css)) { +mem_cgroup_put(last_visited); +last_visited = NULL; +} } rcu_read_lock(); @@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (reclaim) { struct mem_cgroup *curr = memcg; -if (last_visited) +if (last_visited) { +css_put(last_visited-css); mem_cgroup_put(last_visited); +} if (css !memcg) curr = mem_cgroup_from_css(css); I think this will work. Thanks for double checking. The updated patch: --- From 3811de23a0306c229ce44dc655878431a4fdf449 Mon Sep 17 00:00:00 2001 From: Michal Hocko mho...@suse.cz Date: Tue, 13 Nov 2012 11:40:35 +0100 Subject: [PATCH] memcg: rework mem_cgroup_iter to use cgroup iterators mem_cgroup_iter curently relies on css-id when walking down a group hierarchy tree. This is really awkward because the tree walk depends on the groups creation ordering. The only guarantee is that a parent node is visited before its children. Example 1) mkdir -p a a/d a/b/c 2) mkdir -a a/b/c a/d Will create the same trees but the tree walks will be different: 1) a, d, b, c 2) a, b, c, d 574bd9f7 (cgroup: implement generic child / descendant walk macros) has introduced generic cgroup tree walkers which provide either pre-order or post-order tree walk. This patch converts css-id based iteration to pre-order tree walk to keep the semantic with the original iterator where parent is always visited before its subtree. cgroup_for_each_descendant_pre suggests using post_create and pre_destroy for proper synchronization with groups addidition resp. removal. This implementation doesn't use those because a new memory cgroup is fully initialized in mem_cgroup_create and css_tryget makes sure that the group is
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
Hello, Michal. On Thu, Nov 15, 2012 at 10:51:03AM +0100, Michal Hocko wrote: I'm a bit confused. Why would that make any difference? Shouldn't it be just able to test the condition and continue? Ohh, I misunderstood your proposal. So what you are suggesting is to put all the logic we have in mem_cgroup_iter inside what you call reclaim here + mem_cgroup_iter_break inside the loop, right? I do not see how this would help us much. mem_cgroup_iter is not the nicest piece of code but it handles quite a complex requirements that we have currently (css reference count, multiple reclaimers racing). So I would rather keep it this way. Further simplifications are welcome of course. Is there any reason why you are not happy about direct using of cgroup_next_descendant_pre? Because I'd like to consider the next functions as implementation detail, and having interations structred as loops tend to read better and less error-prone. e.g. when you use next functions directly, it's way easier to circumvent locking requirements in a way which isn't very obvious. So, unless it messes up the code too much (and I can't see why it would), I'd much prefer if memcg used for_each_*() macros. Thanks. -- tejun -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Thu 15-11-12 06:47:32, Tejun Heo wrote: Hello, Michal. On Thu, Nov 15, 2012 at 10:51:03AM +0100, Michal Hocko wrote: I'm a bit confused. Why would that make any difference? Shouldn't it be just able to test the condition and continue? Ohh, I misunderstood your proposal. So what you are suggesting is to put all the logic we have in mem_cgroup_iter inside what you call reclaim here + mem_cgroup_iter_break inside the loop, right? I do not see how this would help us much. mem_cgroup_iter is not the nicest piece of code but it handles quite a complex requirements that we have currently (css reference count, multiple reclaimers racing). So I would rather keep it this way. Further simplifications are welcome of course. Is there any reason why you are not happy about direct using of cgroup_next_descendant_pre? Because I'd like to consider the next functions as implementation detail, and having interations structred as loops tend to read better and less error-prone. e.g. when you use next functions directly, it's way easier to circumvent locking requirements in a way which isn't very obvious. The whole point behind mem_cgroup_iter is to hide all the complexity behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree for !reclaim case and mem_cgroup_iter otherwise. So, unless it messes up the code too much (and I can't see why it would), I'd much prefer if memcg used for_each_*() macros. As I said this would mean that the current mem_cgroup_iter code would have to be inverted which doesn't simplify the code much. I'd rather hide all the grossy details inside the memcg iterator. Or am I still missing your suggestion? -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
Hello, Michal. On Thu, Nov 15, 2012 at 04:12:55PM +0100, Michal Hocko wrote: Because I'd like to consider the next functions as implementation detail, and having interations structred as loops tend to read better and less error-prone. e.g. when you use next functions directly, it's way easier to circumvent locking requirements in a way which isn't very obvious. The whole point behind mem_cgroup_iter is to hide all the complexity behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree for !reclaim case and mem_cgroup_iter otherwise. So, unless it messes up the code too much (and I can't see why it would), I'd much prefer if memcg used for_each_*() macros. As I said this would mean that the current mem_cgroup_iter code would have to be inverted which doesn't simplify the code much. I'd rather hide all the grossy details inside the memcg iterator. Or am I still missing your suggestion? One way or the other, I don't think the code complexity would change much. Again, I'd much *prefer* if memcg used what other controllers would be using, but that's a preference and if necessary we can keep the next functions as exposed APIs. I think the issue I have is that I can't see much technical justification for that. If the code becomes much simpler by choosing one over the other, sure, but is that the case here? Isn't it mostly just about where to put the same things? If so, what would be the rationale for requiring a different interface? Thanks. -- tejun -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Thu 15-11-12 07:31:24, Tejun Heo wrote: Hello, Michal. On Thu, Nov 15, 2012 at 04:12:55PM +0100, Michal Hocko wrote: Because I'd like to consider the next functions as implementation detail, and having interations structred as loops tend to read better and less error-prone. e.g. when you use next functions directly, it's way easier to circumvent locking requirements in a way which isn't very obvious. The whole point behind mem_cgroup_iter is to hide all the complexity behind memcg iteration. Memcg code either use for_each_mem_cgroup_tree for !reclaim case and mem_cgroup_iter otherwise. So, unless it messes up the code too much (and I can't see why it would), I'd much prefer if memcg used for_each_*() macros. As I said this would mean that the current mem_cgroup_iter code would have to be inverted which doesn't simplify the code much. I'd rather hide all the grossy details inside the memcg iterator. Or am I still missing your suggestion? One way or the other, I don't think the code complexity would change much. Again, I'd much *prefer* if memcg used what other controllers would be using, but that's a preference and if necessary we can keep the next functions as exposed APIs. Yes please. I think the issue I have is that I can't see much technical justification for that. If the code becomes much simpler by choosing one over the other, sure, but is that the case here? Yes and I've tried to say that already. Memcg needs hierarchy, css ref counting and concurrent reclaim (per-zone per-priority) aware iteration. All of that is hidden in mem_cgroup_iter currently so the caller doesn't have to care about it at all. Which makes shrink_zone not care about memcg that much. cgroup_for_each_descendant_pre is not suitable at least because it doesn't provide a way to start a walk at a selected node (which is shared per-zone per-priority in memcg case). Even if cgroup_for_each_descendant_pre had start parameter there is still a lot of house keeping that callers would have to handle (css_tryget to start with, update of the cached possible not mentioning use_hierarchy thingy or mem_cgroup_disabled). We also try to not pollute mm/vmscan.c as much as possible so we definitely do not want to bring all this into shrink_zone. This all sounds like too much of a hassle if it is exposed so I would really like to stay with mem_cgroup_iter and slowly simplify it until it can go away (if that is possible at all). Isn't it mostly just about where to put the same things? Unfortunately no. We wouldn't grow own iterator in such a case. If so, what would be the rationale for requiring a different interface? Does the above explain it? -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
(2012/11/14 19:10), Michal Hocko wrote: On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote: (2012/11/14 0:30), Michal Hocko wrote: [...] @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = >reclaim_iter[reclaim->priority]; spin_lock(>iter_lock); + last_visited = iter->last_visited; if (prev && reclaim->generation != iter->generation) { + if (last_visited) { + mem_cgroup_put(last_visited); + iter->last_visited = NULL; + } spin_unlock(>iter_lock); return NULL; } - id = iter->position; } rcu_read_lock(); - css = css_get_next(_cgroup_subsys, id + 1, >css, ); - if (css) { - if (css == >css || css_tryget(css)) - memcg = mem_cgroup_from_css(css); - } else - id = 0; - rcu_read_unlock(); + /* +* Root is not visited by cgroup iterators so it needs a special +* treatment. +*/ + if (!last_visited) { + css = >css; + } else { + struct cgroup *next_cgroup; + + next_cgroup = cgroup_next_descendant_pre( + last_visited->css.cgroup, + root->css.cgroup); Maybe I miss something but last_visited is holded by memcg's refcnt. The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed before memcg is freed and last_visited->css.cgroup is out of RCU cycle. Is this safe ? Good spotted. You are right. What I need to do is to check that the last_visited is alive and restart from the root if not. Something like the bellow (incremental patch on top of this one) should help, right? diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 30efd7e..c0a91a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, spin_unlock(>iter_lock); return NULL; } + /* +* memcg is still valid because we hold a reference but +* its cgroup might have vanished in the meantime so +* we have to double check it is alive and restart the +* tree walk otherwise. +*/ + if (last_visited && !css_tryget(_visited->css)) { + mem_cgroup_put(last_visited); + last_visited = NULL; + } } rcu_read_lock(); @@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (reclaim) { struct mem_cgroup *curr = memcg; - if (last_visited) + if (last_visited) { + css_put(_visited->css); mem_cgroup_put(last_visited); + } if (css && !memcg) curr = mem_cgroup_from_css(css); I think this will work. Thanks, -Kame -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
Hello, Michal. On Wed, Nov 14, 2012 at 09:51:29AM +0100, Michal Hocko wrote: > > reclaim(root); > > for_each_descendent_pre() > > reclaim(descendant); > > We cannot do for_each_descendent_pre here because we do not iterate > through the whole hierarchy all the time. Check shrink_zone. I'm a bit confused. Why would that make any difference? Shouldn't it be just able to test the condition and continue? Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote: > (2012/11/14 0:30), Michal Hocko wrote: [...] > > @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct > > mem_cgroup *root, > > mz = mem_cgroup_zoneinfo(root, nid, zid); > > iter = >reclaim_iter[reclaim->priority]; > > spin_lock(>iter_lock); > > + last_visited = iter->last_visited; > > if (prev && reclaim->generation != iter->generation) { > > + if (last_visited) { > > + mem_cgroup_put(last_visited); > > + iter->last_visited = NULL; > > + } > > spin_unlock(>iter_lock); > > return NULL; > > } > > - id = iter->position; > > } > > > > rcu_read_lock(); > > - css = css_get_next(_cgroup_subsys, id + 1, >css, ); > > - if (css) { > > - if (css == >css || css_tryget(css)) > > - memcg = mem_cgroup_from_css(css); > > - } else > > - id = 0; > > - rcu_read_unlock(); > > + /* > > +* Root is not visited by cgroup iterators so it needs a special > > +* treatment. > > +*/ > > + if (!last_visited) { > > + css = >css; > > + } else { > > + struct cgroup *next_cgroup; > > + > > + next_cgroup = cgroup_next_descendant_pre( > > + last_visited->css.cgroup, > > + root->css.cgroup); > > Maybe I miss something but last_visited is holded by memcg's refcnt. > The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed > before memcg is freed and last_visited->css.cgroup is out of RCU cycle. > Is this safe ? Good spotted. You are right. What I need to do is to check that the last_visited is alive and restart from the root if not. Something like the bellow (incremental patch on top of this one) should help, right? diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 30efd7e..c0a91a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, spin_unlock(>iter_lock); return NULL; } + /* +* memcg is still valid because we hold a reference but +* its cgroup might have vanished in the meantime so +* we have to double check it is alive and restart the +* tree walk otherwise. +*/ + if (last_visited && !css_tryget(_visited->css)) { + mem_cgroup_put(last_visited); + last_visited = NULL; + } } rcu_read_lock(); @@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (reclaim) { struct mem_cgroup *curr = memcg; - if (last_visited) + if (last_visited) { + css_put(_visited->css); mem_cgroup_put(last_visited); + } if (css && !memcg) curr = mem_cgroup_from_css(css); -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Tue 13-11-12 08:14:42, Tejun Heo wrote: > On Tue, Nov 13, 2012 at 04:30:36PM +0100, Michal Hocko wrote: > > @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup > > *root, > >struct mem_cgroup *prev, > >struct mem_cgroup_reclaim_cookie *reclaim) > > { > > - struct mem_cgroup *memcg = NULL; > > - int id = 0; > > + struct mem_cgroup *memcg = NULL, > > + *last_visited = NULL; > > Nitpick but please don't do this. OK, will make it grep friendlier; > > + /* > > +* Root is not visited by cgroup iterators so it needs a special > > +* treatment. > > +*/ > > + if (!last_visited) { > > + css = >css; > > + } else { > > + struct cgroup *next_cgroup; > > + > > + next_cgroup = cgroup_next_descendant_pre( > > + last_visited->css.cgroup, > > + root->css.cgroup); > > + if (next_cgroup) > > + css = cgroup_subsys_state(next_cgroup, > > + mem_cgroup_subsys_id); > > Hmmm... wouldn't it be better to move the reclaim logic into a > function and do the following? > > reclaim(root); > for_each_descendent_pre() > reclaim(descendant); We cannot do for_each_descendent_pre here because we do not iterate through the whole hierarchy all the time. Check shrink_zone. > If this is a problem, I'd be happy to add a iterator which includes > the top node. This would help with the above if-else but I do not think this is the worst thing in the function ;) > I'd prefer controllers not using the next functions directly. Well, we will need to use it directly because of the single group reclaim mentioned above. -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Tue 13-11-12 08:14:42, Tejun Heo wrote: On Tue, Nov 13, 2012 at 04:30:36PM +0100, Michal Hocko wrote: @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { - struct mem_cgroup *memcg = NULL; - int id = 0; + struct mem_cgroup *memcg = NULL, + *last_visited = NULL; Nitpick but please don't do this. OK, will make it grep friendlier; + /* +* Root is not visited by cgroup iterators so it needs a special +* treatment. +*/ + if (!last_visited) { + css = root-css; + } else { + struct cgroup *next_cgroup; + + next_cgroup = cgroup_next_descendant_pre( + last_visited-css.cgroup, + root-css.cgroup); + if (next_cgroup) + css = cgroup_subsys_state(next_cgroup, + mem_cgroup_subsys_id); Hmmm... wouldn't it be better to move the reclaim logic into a function and do the following? reclaim(root); for_each_descendent_pre() reclaim(descendant); We cannot do for_each_descendent_pre here because we do not iterate through the whole hierarchy all the time. Check shrink_zone. If this is a problem, I'd be happy to add a iterator which includes the top node. This would help with the above if-else but I do not think this is the worst thing in the function ;) I'd prefer controllers not using the next functions directly. Well, we will need to use it directly because of the single group reclaim mentioned above. -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote: (2012/11/14 0:30), Michal Hocko wrote: [...] @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = mz-reclaim_iter[reclaim-priority]; spin_lock(iter-iter_lock); + last_visited = iter-last_visited; if (prev reclaim-generation != iter-generation) { + if (last_visited) { + mem_cgroup_put(last_visited); + iter-last_visited = NULL; + } spin_unlock(iter-iter_lock); return NULL; } - id = iter-position; } rcu_read_lock(); - css = css_get_next(mem_cgroup_subsys, id + 1, root-css, id); - if (css) { - if (css == root-css || css_tryget(css)) - memcg = mem_cgroup_from_css(css); - } else - id = 0; - rcu_read_unlock(); + /* +* Root is not visited by cgroup iterators so it needs a special +* treatment. +*/ + if (!last_visited) { + css = root-css; + } else { + struct cgroup *next_cgroup; + + next_cgroup = cgroup_next_descendant_pre( + last_visited-css.cgroup, + root-css.cgroup); Maybe I miss something but last_visited is holded by memcg's refcnt. The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed before memcg is freed and last_visited-css.cgroup is out of RCU cycle. Is this safe ? Good spotted. You are right. What I need to do is to check that the last_visited is alive and restart from the root if not. Something like the bellow (incremental patch on top of this one) should help, right? diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 30efd7e..c0a91a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, spin_unlock(iter-iter_lock); return NULL; } + /* +* memcg is still valid because we hold a reference but +* its cgroup might have vanished in the meantime so +* we have to double check it is alive and restart the +* tree walk otherwise. +*/ + if (last_visited !css_tryget(last_visited-css)) { + mem_cgroup_put(last_visited); + last_visited = NULL; + } } rcu_read_lock(); @@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (reclaim) { struct mem_cgroup *curr = memcg; - if (last_visited) + if (last_visited) { + css_put(last_visited-css); mem_cgroup_put(last_visited); + } if (css !memcg) curr = mem_cgroup_from_css(css); -- Michal Hocko SUSE Labs -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
Hello, Michal. On Wed, Nov 14, 2012 at 09:51:29AM +0100, Michal Hocko wrote: reclaim(root); for_each_descendent_pre() reclaim(descendant); We cannot do for_each_descendent_pre here because we do not iterate through the whole hierarchy all the time. Check shrink_zone. I'm a bit confused. Why would that make any difference? Shouldn't it be just able to test the condition and continue? Thanks. -- tejun -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
(2012/11/14 19:10), Michal Hocko wrote: On Wed 14-11-12 09:20:03, KAMEZAWA Hiroyuki wrote: (2012/11/14 0:30), Michal Hocko wrote: [...] @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = mz-reclaim_iter[reclaim-priority]; spin_lock(iter-iter_lock); + last_visited = iter-last_visited; if (prev reclaim-generation != iter-generation) { + if (last_visited) { + mem_cgroup_put(last_visited); + iter-last_visited = NULL; + } spin_unlock(iter-iter_lock); return NULL; } - id = iter-position; } rcu_read_lock(); - css = css_get_next(mem_cgroup_subsys, id + 1, root-css, id); - if (css) { - if (css == root-css || css_tryget(css)) - memcg = mem_cgroup_from_css(css); - } else - id = 0; - rcu_read_unlock(); + /* +* Root is not visited by cgroup iterators so it needs a special +* treatment. +*/ + if (!last_visited) { + css = root-css; + } else { + struct cgroup *next_cgroup; + + next_cgroup = cgroup_next_descendant_pre( + last_visited-css.cgroup, + root-css.cgroup); Maybe I miss something but last_visited is holded by memcg's refcnt. The cgroup pointed by css.cgroup is by cgroup's refcnt which can be freed before memcg is freed and last_visited-css.cgroup is out of RCU cycle. Is this safe ? Good spotted. You are right. What I need to do is to check that the last_visited is alive and restart from the root if not. Something like the bellow (incremental patch on top of this one) should help, right? diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 30efd7e..c0a91a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1105,6 +1105,16 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, spin_unlock(iter-iter_lock); return NULL; } + /* +* memcg is still valid because we hold a reference but +* its cgroup might have vanished in the meantime so +* we have to double check it is alive and restart the +* tree walk otherwise. +*/ + if (last_visited !css_tryget(last_visited-css)) { + mem_cgroup_put(last_visited); + last_visited = NULL; + } } rcu_read_lock(); @@ -1136,8 +1146,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (reclaim) { struct mem_cgroup *curr = memcg; - if (last_visited) + if (last_visited) { + css_put(last_visited-css); mem_cgroup_put(last_visited); + } if (css !memcg) curr = mem_cgroup_from_css(css); I think this will work. Thanks, -Kame -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
(2012/11/14 0:30), Michal Hocko wrote: > mem_cgroup_iter curently relies on css->id when walking down a group > hierarchy tree. This is really awkward because the tree walk depends on > the groups creation ordering. The only guarantee is that a parent node > is visited before its children. > Example > 1) mkdir -p a a/d a/b/c > 2) mkdir -a a/b/c a/d > Will create the same trees but the tree walks will be different: > 1) a, d, b, c > 2) a, b, c, d > > 574bd9f7 (cgroup: implement generic child / descendant walk macros) has > introduced generic cgroup tree walkers which provide either pre-order > or post-order tree walk. This patch converts css->id based iteration > to pre-order tree walk to keep the semantic with the original iterator > where parent is always visited before its subtree. > > cgroup_for_each_descendant_pre suggests using post_create and > pre_destroy for proper synchronization with groups addidition resp. > removal. This implementation doesn't use those because a new memory > cgroup is fully initialized in mem_cgroup_create and css_tryget makes > sure that the group is alive when we encounter it by iterator. > > If the reclaim cookie is used we need to store the last visited group > into the iterator so we have to be careful that it doesn't disappear in > the mean time. Elevated reference count on the memcg guarantees that > the group will not vanish even though it has been already removed from > the tree. In such a case css_tryget will fail and the iteration is > retried (groups are linked with RCU safe lists so the forward progress > is still possible). iter_lock will make sure that only one reclaimer > will see the last_visited group and the reference count game around it. > > Signed-off-by: Michal Hocko > --- > mm/memcontrol.c | 64 > ++- > 1 file changed, 49 insertions(+), 15 deletions(-) > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 0fe5177..5da1e58 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -142,8 +142,8 @@ struct mem_cgroup_stat_cpu { > }; > > struct mem_cgroup_reclaim_iter { > - /* css_id of the last scanned hierarchy member */ > - int position; > + /* last scanned hierarchy member with elevated ref count */ > + struct mem_cgroup *last_visited; > /* scan generation, increased every round-trip */ > unsigned int generation; > /* lock to protect the position and generation */ > @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup > *root, > struct mem_cgroup *prev, > struct mem_cgroup_reclaim_cookie *reclaim) > { > - struct mem_cgroup *memcg = NULL; > - int id = 0; > + struct mem_cgroup *memcg = NULL, > + *last_visited = NULL; > > if (mem_cgroup_disabled()) > return NULL; > @@ -1073,7 +1073,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup > *root, > root = root_mem_cgroup; > > if (prev && !reclaim) > - id = css_id(>css); > + last_visited = prev; > > if (prev && prev != root) > css_put(>css); > @@ -1086,7 +1086,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup > *root, > > while (!memcg) { > struct mem_cgroup_reclaim_iter *uninitialized_var(iter); > - struct cgroup_subsys_state *css; > + struct cgroup_subsys_state *css = NULL; > > if (reclaim) { > int nid = zone_to_nid(reclaim->zone); > @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup > *root, > mz = mem_cgroup_zoneinfo(root, nid, zid); > iter = >reclaim_iter[reclaim->priority]; > spin_lock(>iter_lock); > + last_visited = iter->last_visited; > if (prev && reclaim->generation != iter->generation) { > + if (last_visited) { > + mem_cgroup_put(last_visited); > + iter->last_visited = NULL; > + } > spin_unlock(>iter_lock); > return NULL; > } > - id = iter->position; > } > > rcu_read_lock(); > - css = css_get_next(_cgroup_subsys, id + 1, >css, ); > - if (css) { > - if (css == >css || css_tryget(css)) > - memcg = mem_cgroup_from_css(css); > - } else > - id = 0; > - rcu_read_unlock(); > + /* > + * Root is not visited by cgroup iterators so it needs a special > + * treatment. > + */ > + if (!last_visited) { > +
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Tue, Nov 13, 2012 at 04:30:36PM +0100, Michal Hocko wrote: > @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup > *root, > struct mem_cgroup *prev, > struct mem_cgroup_reclaim_cookie *reclaim) > { > - struct mem_cgroup *memcg = NULL; > - int id = 0; > + struct mem_cgroup *memcg = NULL, > + *last_visited = NULL; Nitpick but please don't do this. > + /* > + * Root is not visited by cgroup iterators so it needs a special > + * treatment. > + */ > + if (!last_visited) { > + css = >css; > + } else { > + struct cgroup *next_cgroup; > + > + next_cgroup = cgroup_next_descendant_pre( > + last_visited->css.cgroup, > + root->css.cgroup); > + if (next_cgroup) > + css = cgroup_subsys_state(next_cgroup, > + mem_cgroup_subsys_id); Hmmm... wouldn't it be better to move the reclaim logic into a function and do the following? reclaim(root); for_each_descendent_pre() reclaim(descendant); If this is a problem, I'd be happy to add a iterator which includes the top node. I'd prefer controllers not using the next functions directly. Thanks. -- tejun -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
On Tue, Nov 13, 2012 at 04:30:36PM +0100, Michal Hocko wrote: @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { - struct mem_cgroup *memcg = NULL; - int id = 0; + struct mem_cgroup *memcg = NULL, + *last_visited = NULL; Nitpick but please don't do this. + /* + * Root is not visited by cgroup iterators so it needs a special + * treatment. + */ + if (!last_visited) { + css = root-css; + } else { + struct cgroup *next_cgroup; + + next_cgroup = cgroup_next_descendant_pre( + last_visited-css.cgroup, + root-css.cgroup); + if (next_cgroup) + css = cgroup_subsys_state(next_cgroup, + mem_cgroup_subsys_id); Hmmm... wouldn't it be better to move the reclaim logic into a function and do the following? reclaim(root); for_each_descendent_pre() reclaim(descendant); If this is a problem, I'd be happy to add a iterator which includes the top node. I'd prefer controllers not using the next functions directly. Thanks. -- tejun -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 2/5] memcg: rework mem_cgroup_iter to use cgroup iterators
(2012/11/14 0:30), Michal Hocko wrote: mem_cgroup_iter curently relies on css-id when walking down a group hierarchy tree. This is really awkward because the tree walk depends on the groups creation ordering. The only guarantee is that a parent node is visited before its children. Example 1) mkdir -p a a/d a/b/c 2) mkdir -a a/b/c a/d Will create the same trees but the tree walks will be different: 1) a, d, b, c 2) a, b, c, d 574bd9f7 (cgroup: implement generic child / descendant walk macros) has introduced generic cgroup tree walkers which provide either pre-order or post-order tree walk. This patch converts css-id based iteration to pre-order tree walk to keep the semantic with the original iterator where parent is always visited before its subtree. cgroup_for_each_descendant_pre suggests using post_create and pre_destroy for proper synchronization with groups addidition resp. removal. This implementation doesn't use those because a new memory cgroup is fully initialized in mem_cgroup_create and css_tryget makes sure that the group is alive when we encounter it by iterator. If the reclaim cookie is used we need to store the last visited group into the iterator so we have to be careful that it doesn't disappear in the mean time. Elevated reference count on the memcg guarantees that the group will not vanish even though it has been already removed from the tree. In such a case css_tryget will fail and the iteration is retried (groups are linked with RCU safe lists so the forward progress is still possible). iter_lock will make sure that only one reclaimer will see the last_visited group and the reference count game around it. Signed-off-by: Michal Hocko mho...@suse.cz --- mm/memcontrol.c | 64 ++- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0fe5177..5da1e58 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -142,8 +142,8 @@ struct mem_cgroup_stat_cpu { }; struct mem_cgroup_reclaim_iter { - /* css_id of the last scanned hierarchy member */ - int position; + /* last scanned hierarchy member with elevated ref count */ + struct mem_cgroup *last_visited; /* scan generation, increased every round-trip */ unsigned int generation; /* lock to protect the position and generation */ @@ -1063,8 +1063,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { - struct mem_cgroup *memcg = NULL; - int id = 0; + struct mem_cgroup *memcg = NULL, + *last_visited = NULL; if (mem_cgroup_disabled()) return NULL; @@ -1073,7 +1073,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; if (prev !reclaim) - id = css_id(prev-css); + last_visited = prev; if (prev prev != root) css_put(prev-css); @@ -1086,7 +1086,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, while (!memcg) { struct mem_cgroup_reclaim_iter *uninitialized_var(iter); - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *css = NULL; if (reclaim) { int nid = zone_to_nid(reclaim-zone); @@ -1096,30 +1096,64 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = mz-reclaim_iter[reclaim-priority]; spin_lock(iter-iter_lock); + last_visited = iter-last_visited; if (prev reclaim-generation != iter-generation) { + if (last_visited) { + mem_cgroup_put(last_visited); + iter-last_visited = NULL; + } spin_unlock(iter-iter_lock); return NULL; } - id = iter-position; } rcu_read_lock(); - css = css_get_next(mem_cgroup_subsys, id + 1, root-css, id); - if (css) { - if (css == root-css || css_tryget(css)) - memcg = mem_cgroup_from_css(css); - } else - id = 0; - rcu_read_unlock(); + /* + * Root is not visited by cgroup iterators so it needs a special + * treatment. + */ + if (!last_visited) { + css = root-css; + } else { + struct cgroup