On Fri, Mar 19, 2010 at 04:39:55PM -0500, Serge E. Hallyn wrote:
> Support checkpoint and restart of tasks in nested pid namespaces.  At
> Oren's request here is an alternative to my previous implementation.  In
> this one, we keep the original single pids_array to minimize memory
> allocations.  The pids array entries are augmented with a pidns depth
> (relative to the container init's pidns, and an "rpid" which is the pid
> in the checkpointer's pidns (or 0 if no valid pid exists).  The rpid
> will be used by userspace to gather more information (like
> /proc/$$/mountinfo) after the kernel sys_checkpoint.  If any tasks are
> in nested pid namespace, another single array holds all of the vpids.
> At restart those are used by userspace to determine how to call
> eclone().  Kernel ignores them.
> 
> All cr_tests including the new pid_ns testcase pass.
> 

IMHO this approach looks ok too. I just feel that checkpoint_vpids() could
be re-worked a bit in order to not impose an artificial limit of
CKPT_HDR_PIDS_CHUNK to the depth of pid namespaces, even if it is 256 (see
suggested changes below).

It would probably be safer too to use task_active_pid_ns() instead of
task->nsproxy->pid_ns, just in case some PID namespace unsharing like proposed
by Eric makes it to mainline.

Thanks,

Louis

> Signed-off-by: Serge E. Hallyn <[email protected]>
> ---
>  checkpoint/checkpoint.c          |  113 
> ++++++++++++++++++++++++++++++++++----
>  checkpoint/process.c             |   18 +++++-
>  checkpoint/restart.c             |   45 ++++++++++++++-
>  checkpoint/sys.c                 |    2 +
>  include/linux/checkpoint.h       |    2 +-
>  include/linux/checkpoint_hdr.h   |   16 +++++
>  include/linux/checkpoint_types.h |    3 +
>  kernel/nsproxy.c                 |    9 ++-
>  8 files changed, 186 insertions(+), 22 deletions(-)
> 
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index f27af41..fe3546a 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -27,6 +27,7 @@
>  #include <linux/deferqueue.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <linux/pid_namespace.h>
>  
>  /* unique checkpoint identifier (FIXME: should be per-container ?) */
>  static atomic_t ctx_count = ATOMIC_INIT(0);
> @@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, 
> struct task_struct *t)
>       struct task_struct *root = ctx->root_task;
>       struct nsproxy *nsproxy;
>       int ret = 0;
> +     struct pid_namespace *pidns;
>  
>       ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
>  
> @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, 
> struct task_struct *t)
>               _ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
>               ret = -EPERM;
>       }
> -     /* no support for >1 private pidns */
> -     if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> -             _ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> -             ret = -EPERM;
> +     /* pidns must be descendent of root_nsproxy */
> +     pidns = nsproxy->pid_ns;
> +     while (pidns != ctx->root_nsproxy->pid_ns) {
> +             if (pidns == &init_pid_ns) {
> +                     ret = -EPERM;
> +                     _ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> +                     break;
> +             }
> +             pidns = pidns->parent;
>       }
>       rcu_read_unlock();
>  
> @@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, 
> struct task_struct *t)
>  
>  #define CKPT_HDR_PIDS_CHUNK  256
>  
> +/*
> + * Write the pids in ctx->root_nsproxy->pidns.  This info is
> + * needed at restart to unambiguously dereference tasks.
> + */
>  static int checkpoint_pids(struct ckpt_ctx *ctx)
>  {
>       struct ckpt_pids *h;
> -     struct pid_namespace *ns;
> +     struct pid_namespace *root_pidns;
>       struct task_struct *task;
>       struct task_struct **tasks_arr;
>       int nr_tasks, n, pos = 0, ret = 0;
>  
> -     ns = ctx->root_nsproxy->pid_ns;
> +     root_pidns = ctx->root_nsproxy->pid_ns;
>       tasks_arr = ctx->tasks_arr;
>       nr_tasks = ctx->nr_tasks;
>       BUG_ON(nr_tasks <= 0);
> @@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
>       do {
>               rcu_read_lock();
>               for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
> +                     struct pid_namespace *task_pidns;
>                       task = tasks_arr[pos];
>  
> -                     h[n].vpid = task_pid_nr_ns(task, ns);
> -                     h[n].vtgid = task_tgid_nr_ns(task, ns);
> -                     h[n].vpgid = task_pgrp_nr_ns(task, ns);
> -                     h[n].vsid = task_session_nr_ns(task, ns);
> -                     h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
> +                     h[n].vpid = task_pid_nr_ns(task, root_pidns);
> +                     h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
> +                     h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
> +                     h[n].vsid = task_session_nr_ns(task, root_pidns);
> +                     h[n].vppid = task_tgid_nr_ns(task->real_parent,
> +                                     root_pidns);
> +                     task_pidns = task_nsproxy(task)->pid_ns;
> +                     h[n].rpid = task_pid_vnr(task);
> +                     h[n].depth = task_pidns->level - root_pidns->level;
>                       ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
>                                  pos, h[n].vpid, h[n].vtgid, h[n].vppid);
> +                     ctx->nr_vpids += h[n].depth;
>                       pos++;
>               }
>               rcu_read_unlock();
> @@ -356,6 +373,61 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
>       return ret;
>  }
>  
> +static int checkpoint_vpids(struct ckpt_ctx *ctx)
> +{
> +     struct ckpt_vpid *h;
> +     struct pid_namespace *root_pidns, *task_pidns;
> +     struct task_struct *task;
> +     int ret, nr_tasks = ctx->nr_tasks;
> +     int tidx = 0, /* index into task array */
> +             hidx = 0; /* pids written into current ckpt_vpids chunk */
> +
> +     root_pidns = ctx->root_nsproxy->pid_ns;
> +     nr_tasks = ctx->nr_tasks;
> +
> +     ret = ckpt_write_obj_type(ctx, NULL,
> +                               sizeof(*h) * ctx->nr_vpids,
> +                               CKPT_HDR_BUFFER);
> +     if (ret < 0)
> +             return ret;
> +
> +     h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> +     if (!h)
> +             return -ENOMEM;
> +
> +     do {
> +             rcu_read_lock();
> +             while (tidx < nr_tasks) {
> +                     int vidx; /* vpid index */
> +                     int nsdelta;
> +
> +                     task = ctx->tasks_arr[tidx];
> +                     task_pidns = task_nsproxy(task)->pid_ns;
> +                     nsdelta = task_pidns->level - root_pidns->level;
> +                     if (hidx + nsdelta >= CKPT_HDR_PIDS_CHUNK)

I think that (hidx + nsdelta > CKPT_HDR_PIDS_CHUNK) checks more accurately the 
limit.

> +                             break;
> +
> +                     for (vidx = 0; vidx < nsdelta; vidx++) {
> +                             h[vidx].pid = task_pid_nr_ns(task, task_pidns);

Here:
                                h[hidx + vidx]

> +                             task_pidns = task_pidns->parent;
> +                     }
> +
> +                     hidx += nsdelta;
> +                     tidx++;
> +             }
> +             rcu_read_unlock();
> +
> +             ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
> +             if (ret < 0)
> +                     break;
> +
> +             hidx = 0;
> +     } while (tidx < nr_tasks);
> +
> +     _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> +     return ret;
> +}

Maybe re-work it this way:

static int checkpoint_vpids(struct ckpt_ctx *ctx)
{
        struct ckpt_vpid *h;
        struct pid_namespace *root_pidns, *task_pidns, *active_pidns;
        struct task_struct *task;
        int ret, nr_tasks = ctx->nr_tasks;
        int tidx = 0, /* index into task array */
                hidx = 0; /* pids written into current ckpt_vpids chunk */
                vidx = 0; /* vpid index for current task */

        root_pidns = ctx->root_nsproxy->pid_ns;
        nr_tasks = ctx->nr_tasks;

        ret = ckpt_write_obj_type(ctx, NULL,
                                  sizeof(*h) * ctx->nr_vpids,
                                  CKPT_HDR_BUFFER);
        if (ret < 0)
                return ret;

        h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
        if (!h)
                return -ENOMEM;

        do {
                rcu_read_lock();
                while (tidx < nr_tasks && hidx < CKPT_HDR_PIDS_CHUNK) {
                        int nsdelta;

                        task = ctx->tasks_arr[tidx];
                        active_pidns = task_active_pid_ns(task);
                        nsdelta = active_pidns->level - root_pidns->level;
                        if (hidx + nsdelta - vidx > CKPT_HDR_PIDS_CHUNK)
                                /*
                                 * We will release rcu before recording the
                                 * remaining vpids, but neither task nor its
                                 * pid can disappear.
                                 */
                                nsdelta = CKPT_HDR_PIDS_CHUNK - hidx + vidx;

                        if (vidx == 0)
                                task_pidns = active_pidns;
                        for (; vidx < nsdelta; vidx++) {
                                h[hidx].pid = task_pid_nr_ns(task, task_pidns);
                                hidx++;
                                task_pidns = task_pidns->parent;
                        }

                        if (task_pidns == root_pidns) {
                                tidx++;
                                vidx = 0;
                        }
                }
                rcu_read_unlock();

                ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
                if (ret < 0)
                        break;

                hidx = 0;
        } while (tidx < nr_tasks);

        _ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
        return ret;
}

[...]

-- 
Dr Louis Rilling                        Kerlabs
Skype: louis.rilling                    Batiment Germanium
Phone: (+33|0) 6 80 89 08 23            80 avenue des Buttes de Coesmes
http://www.kerlabs.com/                 35700 Rennes

Attachment: signature.asc
Description: Digital signature

_______________________________________________
Containers mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/containers
_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to