Previously we invalidated all iterators up to the memcg root, but this is overkill: we can check if currently dying memcg is stored in iterator's 'last_visited' and invalidate only those unlucky iterators.
https://jira.sw.ru/browse/PSBM-123655 Signed-off-by: Konstantin Khorenko <[email protected]> --- mm/memcontrol.c | 48 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eec944f712b0..b6059f46049b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -192,8 +192,8 @@ struct mem_cgroup_stat2_cpu { struct mem_cgroup_reclaim_iter { /* - * last scanned hierarchy member. Valid only if last_dead_count - * matches memcg->dead_count of the hierarchy root group. + * Last scanned hierarchy member. + * If stored memcg is destroyed, the field is wiped. */ struct mem_cgroup *last_visited; unsigned long last_dead_count; @@ -1570,19 +1570,54 @@ static void mem_cgroup_iter_invalidate(struct mem_cgroup *root, { struct mem_cgroup_reclaim_iter *iter; struct mem_cgroup_per_zone *mz; + struct mem_cgroup *pos; int zone, node, i; + /* + * When a group in the hierarchy below root is destroyed, + * the hierarchy iterator can no longer be trusted iif + * iter->last_visited contains the cgroup being destroyed. + * Check if we get this unlikely case and invalidate the iterator + * if so. + * + * Note, only "break-ed" iterators can store iter->last_visited + * == dead_memcg because normally 'last_visited' is assigned + * in mem_cgroup_iter_update() and 'new_position' is just after + * css_tryget() there (ref inc-ed in __mem_cgroup_iter_next()) + * and thus cgroup is not offlined yet. + * + * mem_cgroup_iter_break() in its turn puts memcg's css but does + * not wipe it from iter->last_visited. + * + * Q: Why dying memcg (dead_memcg) cannot get into + * iter->last_visited a bit later after we wipe it here? + * A: Because up to the moment of the current function execution + * css_tryget() is guaranteed to fail on 'dead_memcg'. + */ + + rcu_read_lock(); for_each_node(node) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = mem_cgroup_zoneinfo(root, node, zone); for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++) { iter = &mz->reclaim_iter[i]; - rcu_assign_pointer(iter->last_visited, NULL); + + pos = rcu_access_pointer(iter->last_visited); + /* + * It's OK to race with mem_cgroup_iter_update() + * here because it cannot write new + * position == dead_memcg as + * css_tryget() for it should fail already. + */ + if (pos == dead_memcg) { + rcu_assign_pointer(iter->last_visited, + NULL); } } } } + rcu_read_unlock(); /* * When a group in the hierarchy below root is destroyed, the @@ -1631,10 +1666,9 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, int sequence) { /* - * We store the sequence count from the time @last_visited was - * loaded successfully instead of rereading it here so that we - * don't lose destruction events in between. We could have - * raced with the destruction of @new_position after all. + * The position saved in 'last_visited' is always valid. + * If the stored corresponding cgroup is destroyed, + * 'last_visited' is NULLed. */ rcu_assign_pointer(iter->last_visited, new_position); iter->last_dead_count = sequence; -- 2.24.3 _______________________________________________ Devel mailing list [email protected] https://lists.openvz.org/mailman/listinfo/devel
