Gitweb:     
http://git.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=30e49c263e36341b60b735cbef5ca37912549264
Commit:     30e49c263e36341b60b735cbef5ca37912549264
Parent:     b461cc03828c743aed6b3855b9ab0d39a9d54ec5
Author:     Pavel Emelyanov <[EMAIL PROTECTED]>
AuthorDate: Thu Oct 18 23:40:10 2007 -0700
Committer:  Linus Torvalds <[EMAIL PROTECTED]>
CommitDate: Fri Oct 19 11:53:39 2007 -0700

    pid namespaces: allow cloning of new namespace
    
    When clone() is invoked with CLONE_NEWPID, create a new pid namespace and 
then
    create a new struct pid for the new process.  Allocate pid_t's for the new
    process in the new pid namespace and all ancestor pid namespaces.  Make the
    newly cloned process the session and process group leader.
    
    Since the active pid namespace is special and expected to be the first entry
    in pid->upid_list, preserve the order of pid namespaces.
    
    The size of 'struct pid' is dependent on the the number of pid namespaces 
the
    process exists in, so we use multiple pid-caches'.  Only one pid cache is
    created during system startup and this used by processes that exist only in
    init_pid_ns.
    
    When a process clones its pid namespace, we create additional pid caches as
    necessary and use the pid cache to allocate 'struct pids' for that depth.
    
    Note, that with this patch the newly created namespace won't work, since the
    rest of the kernel still uses global pids, but this is to be fixed soon.  
Init
    pid namespace still works.
    
    [EMAIL PROTECTED]: merge fix]
    Signed-off-by: Pavel Emelyanov <[EMAIL PROTECTED]>
    Signed-off-by: Sukadev Bhattiprolu <[EMAIL PROTECTED]>
    Cc: Paul Menage <[EMAIL PROTECTED]>
    Cc: "Eric W. Biederman" <[EMAIL PROTECTED]>
    Cc: Oleg Nesterov <[EMAIL PROTECTED]>
    Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
    Signed-off-by: Linus Torvalds <[EMAIL PROTECTED]>
---
 include/linux/sched.h |    1 +
 kernel/fork.c         |   43 +++++++++++++++--------
 kernel/nsproxy.c      |    3 +-
 kernel/pid.c          |   88 +++++++++++++++++++++++++++++++++++++++++++++---
 4 files changed, 113 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0bf326..1301c08 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -25,6 +25,7 @@
 #define CLONE_NEWUTS           0x04000000      /* New utsname group? */
 #define CLONE_NEWIPC           0x08000000      /* New ipcs */
 #define CLONE_NEWUSER          0x10000000      /* New user namespace */
+#define CLONE_NEWPID           0x20000000      /* New pid namespace */
 #define CLONE_NEWNET           0x40000000      /* New network namespace */
 
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index bab3419..f252784 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -973,7 +973,6 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
                                        unsigned long stack_start,
                                        struct pt_regs *regs,
                                        unsigned long stack_size,
-                                       int __user *parent_tidptr,
                                        int __user *child_tidptr,
                                        struct pid *pid)
 {
@@ -1043,11 +1042,6 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
        p->did_exec = 0;
        delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
        copy_flags(clone_flags, p);
-       retval = -EFAULT;
-       if (clone_flags & CLONE_PARENT_SETTID)
-               if (put_user(p->pid, parent_tidptr))
-                       goto bad_fork_cleanup_delays_binfmt;
-
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
        p->vfork_done = NULL;
@@ -1289,11 +1283,22 @@ static struct task_struct *copy_process(unsigned long 
clone_flags,
                        __ptrace_link(p, current->parent);
 
                if (thread_group_leader(p)) {
-                       p->signal->tty = current->signal->tty;
-                       p->signal->pgrp = task_pgrp_nr(current);
-                       set_task_session(p, task_session_nr(current));
-                       attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
-                       attach_pid(p, PIDTYPE_SID, task_session(current));
+                       if (clone_flags & CLONE_NEWPID) {
+                               p->nsproxy->pid_ns->child_reaper = p;
+                               p->signal->tty = NULL;
+                               p->signal->pgrp = p->pid;
+                               set_task_session(p, p->pid);
+                               attach_pid(p, PIDTYPE_PGID, pid);
+                               attach_pid(p, PIDTYPE_SID, pid);
+                       } else {
+                               p->signal->tty = current->signal->tty;
+                               p->signal->pgrp = task_pgrp_nr(current);
+                               set_task_session(p, task_session_nr(current));
+                               attach_pid(p, PIDTYPE_PGID,
+                                               task_pgrp(current));
+                               attach_pid(p, PIDTYPE_SID,
+                                               task_session(current));
+                       }
 
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
                        __get_cpu_var(process_counts)++;
@@ -1339,7 +1344,6 @@ bad_fork_cleanup_policy:
 bad_fork_cleanup_cgroup:
 #endif
        cgroup_exit(p, cgroup_callbacks_done);
-bad_fork_cleanup_delays_binfmt:
        delayacct_tsk_free(p);
        if (p->binfmt)
                module_put(p->binfmt->module);
@@ -1366,7 +1370,7 @@ struct task_struct * __cpuinit fork_idle(int cpu)
        struct task_struct *task;
        struct pt_regs regs;
 
-       task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL,
+       task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
                                &init_struct_pid);
        if (!IS_ERR(task))
                init_idle(task, cpu);
@@ -1414,7 +1418,7 @@ long do_fork(unsigned long clone_flags,
        }
 
        p = copy_process(clone_flags, stack_start, regs, stack_size,
-                       parent_tidptr, child_tidptr, NULL);
+                       child_tidptr, NULL);
        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
@@ -1422,7 +1426,16 @@ long do_fork(unsigned long clone_flags,
        if (!IS_ERR(p)) {
                struct completion vfork;
 
-               nr = pid_nr(task_pid(p));
+               /*
+                * this is enough to call pid_nr_ns here, but this if
+                * improves optimisation of regular fork()
+                */
+               nr = (clone_flags & CLONE_NEWPID) ?
+                       task_pid_nr_ns(p, current->nsproxy->pid_ns) :
+                               task_pid_vnr(p);
+
+               if (clone_flags & CLONE_PARENT_SETTID)
+                       put_user(nr, parent_tidptr);
 
                if (clone_flags & CLONE_VFORK) {
                        p->vfork_done = &vfork;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index c8ef7c2..79f871b 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -129,7 +129,8 @@ int copy_namespaces(unsigned long flags, struct task_struct 
*tsk)
 
        get_nsproxy(old_ns);
 
-       if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 
CLONE_NEWUSER | CLONE_NEWNET)))
+       if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+                               CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET)))
                return 0;
 
        if (!capable(CAP_SYS_ADMIN)) {
diff --git a/kernel/pid.c b/kernel/pid.c
index 4b17acd..f76097c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -18,6 +18,12 @@
  * allocation scenario when all but one out of 1 million PIDs possible are
  * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
  * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
+ *
+ * Pid namespaces:
+ *    (C) 2007 Pavel Emelyanov <[EMAIL PROTECTED]>, OpenVZ, SWsoft Inc.
+ *    (C) 2007 Sukadev Bhattiprolu <[EMAIL PROTECTED]>, IBM
+ *     Many thanks to Oleg Nesterov for comments and help
+ *
  */
 
 #include <linux/mm.h>
@@ -456,8 +462,8 @@ static struct kmem_cache *create_pid_cachep(int nr_ids)
 
        snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
        cachep = kmem_cache_create(pcache->name,
-                       /* FIXME add numerical ids here */
-                       sizeof(struct pid), 0, SLAB_HWCACHE_ALIGN, NULL);
+                       sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
+                       0, SLAB_HWCACHE_ALIGN, NULL);
        if (cachep == NULL)
                goto err_cachep;
 
@@ -475,19 +481,89 @@ err_alloc:
        return NULL;
 }
 
+static struct pid_namespace *create_pid_namespace(int level)
+{
+       struct pid_namespace *ns;
+       int i;
+
+       ns = kmalloc(sizeof(struct pid_namespace), GFP_KERNEL);
+       if (ns == NULL)
+               goto out;
+
+       ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!ns->pidmap[0].page)
+               goto out_free;
+
+       ns->pid_cachep = create_pid_cachep(level + 1);
+       if (ns->pid_cachep == NULL)
+               goto out_free_map;
+
+       kref_init(&ns->kref);
+       ns->last_pid = 0;
+       ns->child_reaper = NULL;
+       ns->level = level;
+
+       set_bit(0, ns->pidmap[0].page);
+       atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
+
+       for (i = 1; i < PIDMAP_ENTRIES; i++) {
+               ns->pidmap[i].page = 0;
+               atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
+       }
+
+       return ns;
+
+out_free_map:
+       kfree(ns->pidmap[0].page);
+out_free:
+       kfree(ns);
+out:
+       return ERR_PTR(-ENOMEM);
+}
+
+static void destroy_pid_namespace(struct pid_namespace *ns)
+{
+       int i;
+
+       for (i = 0; i < PIDMAP_ENTRIES; i++)
+               kfree(ns->pidmap[i].page);
+       kfree(ns);
+}
+
 struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace 
*old_ns)
 {
+       struct pid_namespace *new_ns;
+
        BUG_ON(!old_ns);
-       get_pid_ns(old_ns);
-       return old_ns;
+       new_ns = get_pid_ns(old_ns);
+       if (!(flags & CLONE_NEWPID))
+               goto out;
+
+       new_ns = ERR_PTR(-EINVAL);
+       if (flags & CLONE_THREAD)
+               goto out_put;
+
+       new_ns = create_pid_namespace(old_ns->level + 1);
+       if (!IS_ERR(new_ns))
+               new_ns->parent = get_pid_ns(old_ns);
+
+out_put:
+       put_pid_ns(old_ns);
+out:
+       return new_ns;
 }
 
 void free_pid_ns(struct kref *kref)
 {
-       struct pid_namespace *ns;
+       struct pid_namespace *ns, *parent;
 
        ns = container_of(kref, struct pid_namespace, kref);
-       kfree(ns);
+
+       parent = ns->parent;
+       destroy_pid_namespace(ns);
+
+       if (parent != NULL)
+               put_pid_ns(parent);
 }
 
 /*
-
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to