[ Patch against https://www.linux-cr.org/redmine/tab/show/kernel-cr ]

In place of one big pids array, checkpoint one struct ckpt_hdr_pids
per task.  It contains pid/ppid/etc in the root nsproxy's pidns, and
is followed by a list of all virtual pids in child pid namespaces, if
any.

When an nsproxy is created during do_restore_ns(), we don't yet set
its pid_ns, waiting instead until a task attaches that new nsproxy to
itself.  I *think* the nsproxy will generally get recreated by the
task which will use it, but we may as well be sure by having the pid_ns
set when the nsproxy is first assigned.

This patch applies on top of ckpt-v20.  With this patch applied (and
the corresponding user-cr patch), all cr_tests pass, including a new
pidns test (which is in branch pidns.1 until this patch goes into
ckpt-v20-dev).

Please apply.

Changelog:
        Mar 18: bump checkpoing image format version

Signed-off-by: Serge E. Hallyn <[email protected]>
---
 checkpoint/checkpoint.c          |   91 +++++++++++++++++++++++--------------
 checkpoint/process.c             |   27 +++++++-----
 checkpoint/restart.c             |   59 ++++++++++++++++--------
 checkpoint/sys.c                 |    8 +++-
 include/linux/checkpoint.h       |    2 +-
 include/linux/checkpoint_hdr.h   |   19 +++++---
 include/linux/checkpoint_types.h |    3 +
 kernel/nsproxy.c                 |    9 +++-
 8 files changed, 142 insertions(+), 76 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index f27af41..55e14c3 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -27,6 +27,7 @@
 #include <linux/deferqueue.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <linux/pid_namespace.h>
 
 /* unique checkpoint identifier (FIXME: should be per-container ?) */
 static atomic_t ctx_count = ATOMIC_INIT(0);
@@ -241,6 +242,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct 
task_struct *t)
 {
        struct task_struct *root = ctx->root_task;
        struct nsproxy *nsproxy;
+       struct pid_namespace *pidns;
        int ret = 0;
 
        ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
@@ -293,66 +295,85 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, 
struct task_struct *t)
                _ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
                ret = -EPERM;
        }
-       /* no support for >1 private pidns */
-       if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
-               _ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
-               ret = -EPERM;
+       /* pidns must be descendent of root_nsproxy */
+       pidns = nsproxy->pid_ns;
+       while (pidns != ctx->root_nsproxy->pid_ns) {
+               if (pidns == &init_pid_ns) {
+                       ret = -EPERM;
+                       _ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
+                       break;
+               }
+               pidns = pidns->parent;
        }
        rcu_read_unlock();
 
        return ret;
 }
 
-#define CKPT_HDR_PIDS_CHUNK    256
+/* called under rcu_read_lock */
+static void copy_task(struct ckpt_hdr_pids *h, struct task_struct *t,
+                       struct pid_namespace *root_pid_ns,
+                       struct pid_namespace *task_pid_ns)
+{
+       int i = 0;
+       __s32 *pids;
+
+       h->pid = task_pid_nr_ns(t, root_pid_ns);
+       h->tgid = task_tgid_nr_ns(t, root_pid_ns);
+       h->pgid = task_pgrp_nr_ns(t, root_pid_ns);
+       h->sid = task_session_nr_ns(t, root_pid_ns);
+       h->ppid = task_tgid_nr_ns(t->real_parent, root_pid_ns);
+       h->rpid = task_pid_vnr(t);
+       pids = h->vpids;
+
+       while (task_pid_ns != root_pid_ns) {
+               pids[i++] = task_pid_nr_ns(t, task_pid_ns);
+               task_pid_ns = task_pid_ns->parent;
+       }
+}
 
 static int checkpoint_pids(struct ckpt_ctx *ctx)
 {
-       struct ckpt_pids *h;
-       struct pid_namespace *ns;
+       struct ckpt_hdr_pids *h;
+       struct pid_namespace *root_pidns;
        struct task_struct *task;
        struct task_struct **tasks_arr;
-       int nr_tasks, n, pos = 0, ret = 0;
+       int nr_tasks, i, ret = 0;
 
-       ns = ctx->root_nsproxy->pid_ns;
+       root_pidns = ctx->root_nsproxy->pid_ns;
        tasks_arr = ctx->tasks_arr;
        nr_tasks = ctx->nr_tasks;
        BUG_ON(nr_tasks <= 0);
 
-       ret = ckpt_write_obj_type(ctx, NULL,
-                                 sizeof(*h) * nr_tasks,
-                                 CKPT_HDR_BUFFER);
-       if (ret < 0)
-               return ret;
+       for (i = 0; i < nr_tasks; i++) {
+               int nsdelta, size;
+               struct pid_namespace *task_pidns;
 
-       h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
-       if (!h)
-               return -ENOMEM;
+               task = tasks_arr[i];
+               rcu_read_lock();
+               task_pidns = task_nsproxy(task)->pid_ns;
+               rcu_read_unlock();
+
+               nsdelta = task_pidns->level - root_pidns->level;
+               size = sizeof(*h) + nsdelta * sizeof(__s32);
+
+               h = ckpt_hdr_get_type(ctx, size, CKPT_HDR_PID);
+               if (!h)
+                       return -ENOMEM;
 
-       do {
                rcu_read_lock();
-               for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
-                       task = tasks_arr[pos];
-
-                       h[n].vpid = task_pid_nr_ns(task, ns);
-                       h[n].vtgid = task_tgid_nr_ns(task, ns);
-                       h[n].vpgid = task_pgrp_nr_ns(task, ns);
-                       h[n].vsid = task_session_nr_ns(task, ns);
-                       h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
-                       ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
-                                  pos, h[n].vpid, h[n].vtgid, h[n].vppid);
-                       pos++;
-               }
+               copy_task(h, task, root_pidns, task_pidns);
                rcu_read_unlock();
+               ckpt_debug("task[%d]: pid %d tgid %d parent %d\n",
+                          i, h->pid, h->tgid, h->ppid);
 
-               n = min(nr_tasks, CKPT_HDR_PIDS_CHUNK);
-               ret = ckpt_kwrite(ctx, h, n * sizeof(*h));
+               ret = ckpt_write_obj(ctx, &h->h);
+               ckpt_hdr_put(ctx, h);
                if (ret < 0)
                        break;
 
-               nr_tasks -= n;
-       } while (nr_tasks > 0);
+       }
 
-       _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
        return ret;
 }
 
diff --git a/checkpoint/process.c b/checkpoint/process.c
index f917112..bb44960 100644
--- a/checkpoint/process.c
+++ b/checkpoint/process.c
@@ -22,7 +22,7 @@
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
 #include <linux/syscalls.h>
-
+#include <linux/pid_namespace.h>
 
 pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid)
 {
@@ -36,12 +36,6 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
        struct pid *pgrp;
 
        if (pgid == 0) {
-               /*
-                * At checkpoint the pgid owner lived in an ancestor
-                * pid-ns. The best we can do (sanely and safely) is
-                * to examine the parent of this restart's root: if in
-                * a distinct pid-ns, use its pgrp; otherwise fail.
-                */
                p = ctx->root_task->real_parent;
                if (p->nsproxy->pid_ns == current->nsproxy->pid_ns)
                        return NULL;
@@ -51,7 +45,7 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
                 * Find the owner process of this pgid (it must exist
                 * if pgrp exists). It must be a thread group leader.
                 */
-               pgrp = find_vpid(pgid);
+               pgrp = find_pid_ns(pgid, ctx->root_nsproxy->pid_ns);
                p = pid_task(pgrp, PIDTYPE_PID);
                if (!p || !thread_group_leader(p))
                        return NULL;
@@ -578,6 +572,14 @@ static int restore_task_ns(struct ckpt_ctx *ctx)
        }
 
        if (nsproxy != task_nsproxy(current)) {
+               /*
+                * This is *kinda* shady to do without any locking.  However
+                * it is safe because each task is restarted separately in
+                * serial.  If that ever changes, we'll need a spinlock?
+                */
+               if (!nsproxy->pid_ns)
+                       nsproxy->pid_ns = get_pid_ns(current->nsproxy->pid_ns);
+
                get_nsproxy(nsproxy);
                switch_task_namespaces(current, nsproxy);
        }
@@ -827,10 +829,10 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
        if (!thread_group_leader(task))  /* (1) */
                return 0;
 
-       pgid = ctx->pids_arr[ctx->active_pid].vpgid;
+       pgid = ctx->vpgids_arr[ctx->active_pid];
 
-       if (pgid == task_pgrp_vnr(task))  /* nothing to do */
-               return 0;
+       if (pgid == task_pgrp_nr_ns(task, ctx->root_nsproxy->pid_ns))
+               return 0;  /* nothing to do */
 
        if (task->signal->leader)  /* (2) */
                return -EINVAL;
@@ -850,6 +852,9 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
        if (ctx->uflags & RESTART_TASKSELF)
                ret = 0;
 
+       if (ret < 0)
+               ckpt_err(ctx, ret, "setting pgid\n");
+
        return ret;
 }
 
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 6a9644d..84713c7 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -145,7 +145,7 @@ void restore_debug_free(struct ckpt_ctx *ctx)
        ckpt_debug("kflags %lu uflags %lu oflags %lu", ctx->kflags,
                   ctx->uflags, ctx->oflags);
        for (i = 0; i < ctx->nr_pids; i++)
-               ckpt_debug("task[%d] to run %d\n", i, ctx->pids_arr[i].vpid);
+               ckpt_debug("task[%d] to run %d\n", i, ctx->vpids_arr[i]);
 
        list_for_each_entry_safe(s, p, &ctx->task_status, list) {
                if (s->flags & RESTART_DBG_COORD)
@@ -420,7 +420,8 @@ void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int 
type)
 
        h = ckpt_read_obj(ctx, len, len);
        if (IS_ERR(h)) {
-               ckpt_err(ctx, PTR_ERR(h), "Expecting to read type %d\n", type);
+               ckpt_err(ctx, PTR_ERR(h), "Expecting to read type %d len %d\n",
+                       type, len);
                return h;
        }
 
@@ -730,34 +731,51 @@ static int restore_read_tail(struct ckpt_ctx *ctx)
        return ret;
 }
 
+#define CKPT_MAX_PIDS_SZ 99999
 /* restore_read_tree - read the tasks tree into the checkpoint context */
 static int restore_read_tree(struct ckpt_ctx *ctx)
 {
        struct ckpt_hdr_tree *h;
-       int size, ret;
+       int i, size;
 
        h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TREE);
        if (IS_ERR(h))
                return PTR_ERR(h);
 
-       ret = -EINVAL;
+       ctx->nr_pids = h->nr_tasks;
+       ckpt_hdr_put(ctx, h);
+
        if (h->nr_tasks <= 0)
-               goto out;
+               return -EINVAL;
 
-       ctx->nr_pids = h->nr_tasks;
-       size = sizeof(*ctx->pids_arr) * ctx->nr_pids;
+       size = sizeof(pid_t) * ctx->nr_pids;
        if (size <= 0)          /* overflow ? */
-               goto out;
+               return -EINVAL;
 
-       ctx->pids_arr = kmalloc(size, GFP_KERNEL);
-       if (!ctx->pids_arr) {
-               ret = -ENOMEM;
-               goto out;
+       ctx->vpids_arr = kmalloc(size, GFP_KERNEL);
+       ctx->vpgids_arr = kmalloc(size, GFP_KERNEL);
+       if (!ctx->vpids_arr || !ctx->vpgids_arr)
+               return -ENOMEM;
+
+       for (i = 0; i < ctx->nr_pids; i++) {
+               struct ckpt_hdr_pids *p;
+
+               p = ckpt_read_obj(ctx, 0, CKPT_MAX_PIDS_SZ);
+               if (!p)
+                       return -EINVAL;
+               if (p->h.type != CKPT_HDR_PID) {
+                       ckpt_hdr_put(ctx, p);
+                       return -EINVAL;
+               }
+               if (p->h.len < sizeof(*p)) {
+                       ckpt_hdr_put(ctx, p);
+                       return -EINVAL;
+               }
+               ctx->vpids_arr[i] = p->pid;
+               ctx->vpgids_arr[i] = p->pgid;
+               ckpt_hdr_put(ctx, p);
        }
-       ret = _ckpt_read_buffer(ctx, ctx->pids_arr, size);
- out:
-       ckpt_hdr_put(ctx, h);
-       return ret;
+       return 0;
 }
 
 static inline int all_tasks_activated(struct ckpt_ctx *ctx)
@@ -768,7 +786,7 @@ static inline int all_tasks_activated(struct ckpt_ctx *ctx)
 static inline pid_t get_active_pid(struct ckpt_ctx *ctx)
 {
        int active = ctx->active_pid;
-       return active >= 0 ? ctx->pids_arr[active].vpid : 0;
+       return active >= 0 ? ctx->vpids_arr[active] : 0;
 }
 
 static inline int is_task_active(struct ckpt_ctx *ctx, pid_t pid)
@@ -870,7 +888,7 @@ static int restore_activate_next(struct ckpt_ctx *ctx)
 
 static int wait_task_active(struct ckpt_ctx *ctx)
 {
-       pid_t pid = task_pid_vnr(current);
+       pid_t pid = task_pid_nr_ns(current, ctx->root_nsproxy->pid_ns);
        int ret;
 
        ckpt_debug("pid %d waiting\n", pid);
@@ -886,7 +904,8 @@ static int wait_task_active(struct ckpt_ctx *ctx)
 
 static int wait_task_sync(struct ckpt_ctx *ctx)
 {
-       ckpt_debug("pid %d syncing\n", task_pid_vnr(current));
+       ckpt_debug("pid %d syncing\n",
+               task_pid_nr_ns(current, ctx->root_nsproxy->pid_ns));
        wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx));
        ckpt_debug("task sync done (errno %d)\n", ctx->errno);
        if (ckpt_test_error(ctx))
@@ -1160,7 +1179,7 @@ static struct task_struct *choose_root_task(struct 
ckpt_ctx *ctx, pid_t pid)
 
        read_lock(&tasklist_lock);
        list_for_each_entry(task, &current->children, sibling) {
-               if (task_pid_vnr(task) == pid) {
+               if (task_pid_nr_ns(task, ctx->coord_pidns) == pid) {
                        get_task_struct(task);
                        ctx->root_task = task;
                        ctx->root_pid = pid;
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 9e9df9b..5df72b0 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -22,6 +22,7 @@
 #include <linux/capability.h>
 #include <linux/checkpoint.h>
 #include <linux/deferqueue.h>
+#include <linux/pid_namespace.h>
 
 /*
  * ckpt_unpriv_allowed - sysctl controlled.
@@ -247,6 +248,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
        if (ctx->tasks_arr)
                task_arr_free(ctx);
 
+       if (ctx->coord_pidns)
+               put_pid_ns(ctx->coord_pidns);
        if (ctx->root_nsproxy)
                put_nsproxy(ctx->root_nsproxy);
        if (ctx->root_task)
@@ -256,7 +259,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
 
        free_page((unsigned long) ctx->scratch_page);
 
-       kfree(ctx->pids_arr);
+       kfree(ctx->vpids_arr);
+       kfree(ctx->vpgids_arr);
 
        sock_listening_list_free(&ctx->listen_sockets);
 
@@ -277,6 +281,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned 
long uflags,
        ctx->kflags = kflags;
        ctx->ktime_begin = ktime_get();
 
+       ctx->coord_pidns = get_pid_ns(current->nsproxy->pid_ns);
+
        atomic_set(&ctx->refcount, 0);
        INIT_LIST_HEAD(&ctx->pgarr_list);
        INIT_LIST_HEAD(&ctx->pgarr_pool);
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 792b523..e860bf5 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -10,7 +10,7 @@
  *  distribution for more details.
  */
 
-#define CHECKPOINT_VERSION  5
+#define CHECKPOINT_VERSION  6
 
 /* checkpoint user flags */
 #define CHECKPOINT_SUBTREE     0x1
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 41412d1..7957b3b 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -117,6 +117,8 @@ enum {
 #define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
        CKPT_HDR_TASK_CREDS,
 #define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
+       CKPT_HDR_PID,
+#define CKPT_HDR_PID CKPT_HDR_PID
 
        /* 201-299: reserved for arch-dependent */
 
@@ -326,12 +328,17 @@ struct ckpt_hdr_tree {
        __s32 nr_tasks;
 } __attribute__((aligned(8)));
 
-struct ckpt_pids {
-       __s32 vpid;
-       __s32 vppid;
-       __s32 vtgid;
-       __s32 vpgid;
-       __s32 vsid;
+struct ckpt_hdr_pids {
+       struct ckpt_hdr h;
+       __s32 rpid;  /* pid in checkpointer's pid_ns */
+       /* The rest of these are in container init's pid_ns */
+       __s32 pid;
+       __s32 ppid;
+       __s32 tgid;
+       __s32 pgid;
+       __s32 sid;
+       /* followed by pids in pid_ns up to root->nsproxy->pid_ns */
+       __s32 vpids[0];
 } __attribute__((aligned(8)));
 
 /* pids */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index ecd3e91..0ae78a7 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -37,6 +37,7 @@ struct ckpt_ctx {
        int root_init;                          /* [container] root init ? */
        pid_t root_pid;                         /* [container] root pid */
        struct task_struct *root_task;          /* [container] root task */
+       struct pid_namespace *coord_pidns;      /* coordinator pid_ns */
        struct nsproxy *root_nsproxy;           /* [container] root nsproxy */
        struct task_struct *root_freezer;       /* [container] root task */
        char lsm_name[SECURITY_NAME_MAX + 1];   /* security module at ckpt */
@@ -74,6 +75,8 @@ struct ckpt_ctx {
 
        /* [multi-process restart] */
        struct ckpt_pids *pids_arr;     /* array of all pids [restart] */
+       pid_t *vpids_arr;               /* pids array in container pidns */
+       pid_t *vpgids_arr;              /* vpgids array in container pidns */
        int nr_pids;                    /* size of pids array */
        atomic_t nr_total;              /* total tasks count (with ghosts) */
        int active_pid;                 /* (next) position in pids array */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 0da0d83..6d86240 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
                get_net(net_ns);
                nsproxy->net_ns = net_ns;
 
-               get_pid_ns(current->nsproxy->pid_ns);
-               nsproxy->pid_ns = current->nsproxy->pid_ns;
+               /*
+                * The pid_ns will get assigned the first time that we
+                * assign the nsproxy to a task.  The task had unshared
+                * its pid_ns in userspace before calling restart, and
+                * we want to keep using that pid_ns.
+                */
+               nsproxy->pid_ns = NULL;
        }
  out:
        if (ret < 0)
-- 
1.6.1

_______________________________________________
Containers mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to