Previously we invalidated all iterators up to the memcg root,
but this is overkill: we can check if currently dying memcg is
stored in iterator's 'last_visited' and invalidate only those
unlucky iterators.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko <[email protected]>
---
 mm/memcontrol.c | 48 +++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index eec944f712b0..b6059f46049b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -192,8 +192,8 @@ struct mem_cgroup_stat2_cpu {
 
 struct mem_cgroup_reclaim_iter {
        /*
-        * last scanned hierarchy member. Valid only if last_dead_count
-        * matches memcg->dead_count of the hierarchy root group.
+        * Last scanned hierarchy member.
+        * If stored memcg is destroyed, the field is wiped.
         */
        struct mem_cgroup *last_visited;
        unsigned long last_dead_count;
@@ -1570,19 +1570,54 @@ static void mem_cgroup_iter_invalidate(struct 
mem_cgroup *root,
 {
        struct mem_cgroup_reclaim_iter *iter;
        struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup *pos;
        int zone, node, i;
 
+       /*
+        * When a group in the hierarchy below root is destroyed,
+        * the hierarchy iterator can no longer be trusted iif
+        * iter->last_visited contains the cgroup being destroyed.
+        * Check if we get this unlikely case and invalidate the iterator
+        * if so.
+        *
+        * Note, only "break-ed" iterators can store iter->last_visited
+        * == dead_memcg because normally 'last_visited' is assigned
+        * in mem_cgroup_iter_update() and 'new_position' is just after
+        * css_tryget() there (ref inc-ed in __mem_cgroup_iter_next())
+        * and thus cgroup is not offlined yet.
+        *
+        * mem_cgroup_iter_break() in its turn puts memcg's css but does
+        * not wipe it from iter->last_visited.
+        *
+        * Q: Why dying memcg (dead_memcg) cannot get into
+        *    iter->last_visited a bit later after we wipe it here?
+        * A: Because up to the moment of the current function execution
+        *    css_tryget() is guaranteed to fail on 'dead_memcg'.
+        */
+
+       rcu_read_lock();
        for_each_node(node) {
                for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                        mz = mem_cgroup_zoneinfo(root, node, zone);
 
                        for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++) {
                                iter = &mz->reclaim_iter[i];
-                               rcu_assign_pointer(iter->last_visited, NULL);
+
+                               pos = rcu_access_pointer(iter->last_visited);
+                               /*
+                                * It's OK to race with mem_cgroup_iter_update()
+                                * here because it cannot write new
+                                * position == dead_memcg as
+                                * css_tryget() for it should fail already.
+                                */
+                               if (pos == dead_memcg) {
+                                       rcu_assign_pointer(iter->last_visited,
+                                                          NULL);
                                }
                        }
                }
        }
+       rcu_read_unlock();
 
        /*
         * When a group in the hierarchy below root is destroyed, the
@@ -1631,10 +1666,9 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
                                   int sequence)
 {
        /*
-        * We store the sequence count from the time @last_visited was
-        * loaded successfully instead of rereading it here so that we
-        * don't lose destruction events in between.  We could have
-        * raced with the destruction of @new_position after all.
+        * The position saved in 'last_visited' is always valid.
+        * If the stored corresponding cgroup is destroyed,
+        * 'last_visited' is NULLed.
         */
        rcu_assign_pointer(iter->last_visited, new_position);
        iter->last_dead_count = sequence;
-- 
2.24.3

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to