Currently mem_cgroup_iter() uses a complicated, odd weak-reference
scheme to iterate memcgs during reclaim. The basic scheme looks like this:

        if (iter->last_dead_count == *sequence) {
                smp_rmb();
                position = iter->last_visited;
...
        new_position = __mem_cgroup_iter_next(root, last_visited);
...
        iter->last_visited = new_position;
        smp_wmb();
        iter->last_dead_count = sequence;

The problem is that all of this code can run in parallel: several
threads may simultaneously update the "iter->last_visited" and
"iter->last_dead_count" fields to different values. As a result the
iter may end up in an inconsistent state, with last_visited from one
writer and last_dead_count from another.

It seems that this may cause use-after-frees in mem_cgroup_iter(),
although I'm not entirely sure about that; I still can't fully
understand how this mess is supposed to work.

Use a seqlock to protect iter updates, so that readers always see a
consistent (last_visited, last_dead_count) pair.
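
Roughly, this is the standard seqlock read/retry pattern (a sketch,
not the exact patch code; the patch below keeps an open-coded retry
with "goto" and READ_ONCE()):

        /* writer: publish both fields as one unit w.r.t. readers */
        write_seqlock(&iter->last_visited_lock);
        iter->last_visited = new_position;
        iter->last_dead_count = sequence;
        write_sequnlock(&iter->last_visited_lock);

        /* reader: retry until a consistent snapshot of both fields is seen */
        do {
                seq = read_seqbegin(&iter->last_visited_lock);
                position = NULL;
                if (iter->last_dead_count == *sequence)
                        position = READ_ONCE(iter->last_visited);
        } while (read_seqretry(&iter->last_visited_lock, seq));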

https://jira.sw.ru/browse/PSBM-83369
Signed-off-by: Andrey Ryabinin <[email protected]>
---
 mm/memcontrol.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 99d5da15b377..e0303b428ac1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -188,6 +188,7 @@ struct mem_cgroup_reclaim_iter {
         */
        struct mem_cgroup *last_visited;
        unsigned long last_dead_count;
+       seqlock_t last_visited_lock;
 
        /* scan generation, increased every round-trip */
        unsigned int generation;
@@ -1279,6 +1280,8 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
                     int *sequence)
 {
        struct mem_cgroup *position = NULL;
+       unsigned seq;
+
        /*
         * A cgroup destruction happens in two stages: offlining and
         * release.  They are separated by a RCU grace period.
@@ -1288,9 +1291,13 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
         * released, tryget will fail if we lost the race.
         */
        *sequence = atomic_read(&root->dead_count);
+retry:
+       seq = read_seqbegin(&iter->last_visited_lock);
        if (iter->last_dead_count == *sequence) {
-               smp_rmb();
-               position = iter->last_visited;
+               position = READ_ONCE(iter->last_visited);
+
+               if (read_seqretry(&iter->last_visited_lock, seq))
+                       goto retry;
 
                /*
                 * We cannot take a reference to root because we might race
@@ -1321,9 +1328,10 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
         * don't lose destruction events in between.  We could have
         * raced with the destruction of @new_position after all.
         */
+       write_seqlock(&iter->last_visited_lock);
        iter->last_visited = new_position;
-       smp_wmb();
        iter->last_dead_count = sequence;
+       write_sequnlock(&iter->last_visited_lock);
 }
 
 /**
@@ -5912,11 +5920,15 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
                return 1;
 
        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+               int i;
+
                mz = &pn->zoneinfo[zone];
                lruvec_init(&mz->lruvec);
                mz->usage_in_excess = 0;
                mz->on_tree = false;
                mz->memcg = memcg;
+               for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++)
+                       seqlock_init(&mz->reclaim_iter[i].last_visited_lock);
        }
        memcg->info.nodeinfo[node] = pn;
        return 0;
-- 
2.16.1
