Make sure that all prepared tasks are in sys_restart before the
coordinator proceeds.  Otherwise, it was possible for the last
task actually in sys_restart to sync before the straggler was in
sys_restart, sending that task a signal and causing that task to
exit.  Then since there are not enough tasks completing restart,
the restart operation ends up hanging, waiting for the killed task.

Signed-off-by: Serge E. Hallyn <[email protected]>
---
 checkpoint/restart.c             |   53 ++++++++++++++++++++++++++++++++++++++
 checkpoint/sys.c                 |    1 +
 include/linux/checkpoint_types.h |    2 +
 3 files changed, 56 insertions(+), 0 deletions(-)

diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 543b380..849bda5 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -655,13 +655,35 @@ static struct ckpt_ctx *wait_checkpoint_ctx(void)
 static int do_ghost_task(void)
 {
        struct ckpt_ctx *ctx;
+       int ret;
 
        ctx = wait_checkpoint_ctx();
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
 
+       /*
+        * Wait until coordinator task has completed prepare_descendants().
+        * Note that prepare_descendants() wakes each task one a time
+        * finishing the wait_checkpoint_ctx() above.  We may want to change
+        * that flow so tasks only sleep once, but this works for now.
+        */
+       ret = wait_event_interruptible(ctx->waitq,
+                                       atomic_read(&ctx->nr_running));
+       if (ret) {
+               ckpt_debug("wait for coordinator prep returned %d\n", ret);
+               return ret;
+       }
+       /*
+        * Now each prepared task decrements nr_running, and, when that
+        * hits zero, wakes the coordinator.  That way the coordinator
+        * knows that all tasks are in sys_restart(0
+        */
+       if (atomic_dec_and_test(&ctx->nr_running))
+               complete(&ctx->all_ready);
+
        current->flags |= PF_RESTARTING;
 
+       /* Finally wait for all ghosts to be woken up - to die */
        wait_event_interruptible(ctx->ghostq,
                                 all_tasks_activated(ctx) ||
                                 ckpt_test_ctx_error(ctx));
@@ -682,6 +704,26 @@ static int do_restore_task(void)
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
 
+       /*
+        * Wait until coordinator task has completed prepare_descendants().
+        * Note that prepare_descendants() wakes each task one a time
+        * finishing the wait_checkpoint_ctx() above.  We may want to change
+        * that flow so tasks only sleep once, but this works for now.
+        */
+       ret = wait_event_interruptible(ctx->waitq,
+                                       atomic_read(&ctx->nr_running));
+       if (ret) {
+               ckpt_debug("wait for coordinator prep returned %d\n", ret);
+               return ret;
+       }
+       /*
+        * Now each prepared task decrements nr_running, and, when that
+        * hits zero, wakes the coordinator.  That way the coordinator
+        * knows that all tasks are in sys_restart(0
+        */
+       if (atomic_dec_and_test(&ctx->nr_running))
+               complete(&ctx->all_ready);
+
        current->flags |= PF_RESTARTING;
 
        /* wait for our turn, do the restore, and tell next task in line */
@@ -814,6 +856,7 @@ static int prepare_descendants(struct ckpt_ctx *ctx, struct 
task_struct *root)
                ret = -ESRCH;
 
        atomic_set(&ctx->nr_total, nr_pids);
+       atomic_set(&ctx->nr_running, nr_pids);
        return ret;
 }
 
@@ -948,6 +991,16 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t 
pid)
                ret = prepare_descendants(ctx, ctx->root_task);
                if (ret < 0)
                        goto out;
+
+               /* tell tasks we're ready for them to dec  ctx->nr_running */
+               wake_up_all(&ctx->waitq);
+               /* ctx->nr_running hit 0, all our tasks are in sys_restart */
+               ret = wait_for_completion_interruptible(&ctx->all_ready);
+               if (ret) {
+                       ckpt_debug("waiting on all_ready returned %d\n", ret);
+                       goto out;
+               }
+
                /* wait for all other tasks to complete do_restore_task() */
                ret = wait_all_tasks_finish(ctx);
                if (ret < 0)
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 76a3fa9..d48e261 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -233,6 +233,7 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned 
long uflags,
        ctx->uflags = uflags;
        ctx->kflags = kflags;
        ctx->ktime_begin = ktime_get();
+       init_completion(&ctx->all_ready);
 
        atomic_set(&ctx->refcount, 0);
        INIT_LIST_HEAD(&ctx->pgarr_list);
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index 9b7b4dd..2a854e0 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -69,8 +69,10 @@ struct ckpt_ctx {
        struct ckpt_pids *pids_arr;     /* array of all pids [restart] */
        int nr_pids;                    /* size of pids array */
        atomic_t nr_total;              /* total tasks count (with ghosts) */
+       atomic_t nr_running;            /* countdown of tasks in sys_restart */
        int active_pid;                 /* (next) position in pids array */
        struct completion complete;     /* completion for container root */
+       struct completion all_ready;    /* all tasks are in sys_restart */
        wait_queue_head_t waitq;        /* waitqueue for restarting tasks */
        wait_queue_head_t ghostq;       /* waitqueue for ghost tasks */
        struct cred *realcred, *ecred;  /* tmp storage for cred at restart */
-- 
1.6.1

_______________________________________________
Containers mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to