Introduce two new system calls:
int nsfd(pid_t pid, unsigned long nstype);
int setns(unsigned long nstype, int fd);

These two new system calls address three specific problems that can
make namespaces hard to work with.
- Namespaces require a dedicated process to pin them in memory.
- It is not possible to use a namespace unless you are the
  child of the original creator.
- Namespaces don't have names that userspace can use to talk
  about them.

The nsfd() system call returns a file descriptor that can
be used to talk about a specific namespace, and to keep
the specified namespace alive.

The fd returned by nsfd() can be bind mounted as:
mount --bind /proc/self/fd/N /some/filesystem/path
to keep the namespace alive for as long as
the bind mount exists.

open() works on the fd returned by nsfd(), so another
process can get hold of it and do interesting things.

Overall that allows for persistent naming of namespaces
according to userspace policy.

setns() allows changing the namespace of the current process
to a namespace that originates with nsfd().

Signed-off-by: Eric W. Biederman <ebied...@xmission.com>
---

This is just my first pass at this, and it has not yet been compile tested.
I was pleasantly surprised at how easy all of this was to implement.

I have verified mount will let me bind mount /proc/self/fd/N so
there is nothing special needed for the mount case, except
getting the reference counting and lifetime rules correct for
my filesystem objects.

 arch/x86/ia32/ia32entry.S          |    2 +
 arch/x86/include/asm/unistd_32.h   |    4 +-
 arch/x86/include/asm/unistd_64.h   |    4 +
 arch/x86/kernel/syscall_table_32.S |    2 +
 fs/Makefile                        |    2 +-
 fs/nsfd.c                          |  278 ++++++++++++++++++++++++++++++++++++
 include/linux/magic.h              |    1 +
 include/linux/nsproxy.h            |    1 +
 include/linux/nstype.h             |    6 +
 kernel/nsproxy.c                   |   17 +++
 10 files changed, 315 insertions(+), 2 deletions(-)
 create mode 100644 fs/nsfd.c
 create mode 100644 include/linux/nstype.h

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 53147ad..9fd33de 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -842,4 +842,6 @@ ia32_sys_call_table:
        .quad compat_sys_rt_tgsigqueueinfo      /* 335 */
        .quad sys_perf_event_open
        .quad compat_sys_recvmmsg
+       .quad sys_nsfd
+       .quad sys_setns
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 3baf379..5b7833c 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,12 @@
 #define __NR_rt_tgsigqueueinfo 335
 #define __NR_perf_event_open   336
 #define __NR_recvmmsg          337
+#define __NR_nsfd              338
+#define __NR_setns             339
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 338
+#define NR_syscalls 340
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 4843f7b..260d542 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,10 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_recvmmsg                          299
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_nsfd                              300
+__SYSCALL(__NR_nsfd, sys_nsfd)
+#define __NR_setns                             301
+__SYSCALL(__NR_setns, sys_setns)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S 
b/arch/x86/kernel/syscall_table_32.S
index 15228b5..e09a45b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,5 @@ ENTRY(sys_call_table)
        .long sys_rt_tgsigqueueinfo     /* 335 */
        .long sys_perf_event_open
        .long sys_recvmmsg
+       .long sys_nsfd
+       .long sys_setns
diff --git a/fs/Makefile b/fs/Makefile
index af6d047..74d5091 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y :=      open.o read_write.o file_table.o super.o \
                attr.o bad_inode.o file.o filesystems.o namespace.o \
                seq_file.o xattr.o libfs.o fs-writeback.o \
                pnode.o drop_caches.o splice.o sync.o utimes.o \
-               stack.o fs_struct.o
+               stack.o fs_struct.o nsfd.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=       buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/nsfd.c b/fs/nsfd.c
new file mode 100644
index 0000000..71bcc55
--- /dev/null
+++ b/fs/nsfd.c
@@ -0,0 +1,278 @@
+#include <linux/nstype.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <net/net_namespace.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cred.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/nsproxy.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+
+static struct vfsmount *nsfd_mnt __read_mostly;        /* assigned in nsfd_init() */
+static struct inode *nsfd_inode;       /* instantiated into every nsfd dentry */
+/* nsfd_fget() matches f_op against this table to recognise nsfd files. */
+static const struct file_operations nsfd_file_operations = {
+       .llseek = no_llseek,
+};
+
+
+static int nsfd_get_sb(struct file_system_type *fs_type, int flags,
+       const char *dev_name, void *data, struct vfsmount *mnt)
+{
+       return get_sb_pseudo(fs_type, "nsfd:", NULL, NSFD_FS_MAGIC, mnt); /* flags, dev_name, data unused */
+}
+
+/* d_dname hook: copies the fixed name "nsfd" into @buffer if it fits. */
+static char *nsfd_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+       static const char label[] = "nsfd";
+
+       if (buflen < sizeof(label))
+               return ERR_PTR(-ENAMETOOLONG);
+       return memcpy(buffer, label, sizeof(label));
+}
+
+static const struct dentry_operations nsfd_dentry_operations = {
+       .d_dname                = nsfd_dname,   /* ops set by nsfd_getfile(); no d_release */
+};
+
+static struct file_system_type nsfd_fs_type = {
+       .name           = "nsfd",
+       .get_sb         = nsfd_get_sb,
+       .kill_sb        = kill_anon_super,
+       /* registered and kern_mount()ed by nsfd_init() */
+};
+
+static void netns_dentry_release(struct dentry *dentry)
+{
+       put_net(dentry->d_fsdata);      /* drop the reference nsfd_getns() took */
+       dentry->d_fsdata = NULL;
+}
+
+static const struct dentry_operations netns_dentry_operations = {
+       .d_dname        = nsfd_dname,
+       .d_release      = netns_dentry_release, /* puts the pinned struct net */
+};
+/* Indexed by NSTYPE_*; nstype_dops() returns NULL for anything not listed. */
+static const struct dentry_operations *nsfd_dops[] = {
+       [NSTYPE_NET] = &netns_dentry_operations,
+};
+
+/* Look up the dentry operations for @nstype; NULL if the type is unknown. */
+static const struct dentry_operations *nstype_dops(unsigned long nstype)
+{
+       /* nstype comes straight from userspace: bounds-check before indexing */
+       if (nstype >= ARRAY_SIZE(nsfd_dops))
+               return NULL;
+
+       return nsfd_dops[nstype];
+}
+
+/*
+ * Resolve @fd to a file created by nsfd() for namespace type @nstype.
+ * Returns the file with a reference held (caller must fput()), or
+ * ERR_PTR(-EINVAL) for a bad type / foreign file, ERR_PTR(-EBADF) for a bad fd.
+ */
+static struct file *nsfd_fget(int fd, unsigned long nstype)
+{
+       const struct dentry_operations *want = nstype_dops(nstype);
+       struct file *filp;
+
+       if (!want)
+               return ERR_PTR(-EINVAL);
+
+       filp = fget(fd);
+       if (!filp)
+               return ERR_PTR(-EBADF);
+
+       /* Must be one of ours, and carry the requested namespace type. */
+       if (filp->f_op == &nsfd_file_operations &&
+           filp->f_path.dentry->d_op == want)
+               return filp;
+
+       fput(filp);
+       return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Allocate an inode on the internal nsfd mount for use with nsfd files.
+ * Returns the inode, or ERR_PTR(-ENOMEM) if new_inode() fails.
+ */
+static struct inode *nsfd_mkinode(void)
+{
+       struct inode *ino = new_inode(nsfd_mnt->mnt_sb);
+
+       if (!ino)
+               return ERR_PTR(-ENOMEM);
+       ino->i_fop = &nsfd_file_operations;
+       /*
+        * Start out I_DIRTY so mark_inode_dirty() believes the inode is
+        * already on the dirty list and never moves it there.
+        */
+       ino->i_state = I_DIRTY;
+       ino->i_mode = S_IRUSR | S_IWUSR;
+       ino->i_uid = current_fsuid();
+       ino->i_gid = current_fsgid();
+       ino->i_atime = ino->i_mtime = ino->i_ctime = CURRENT_TIME;
+       return ino;
+}
+
+
+static struct file *nsfd_getfile(void)
+{
+       struct qstr name = { .name = "" };
+       struct path path;
+       struct file *file;
+       /* Build a fresh dentry and read-only file on top of the shared nsfd_inode. */
+       path.dentry = d_alloc(nsfd_mnt->mnt_sb->s_root, &name);
+       if (!path.dentry)
+               return ERR_PTR(-ENOMEM);
+
+       path.mnt = mntget(nsfd_mnt);    /* mount reference to go with the dentry */
+
+       /*
+        * We know the nsfd_inode inode count is always greater than zero,
+        * so we can avoid doing an igrab() and we can use an open-coded
+        * atomic_inc().
+        */
+       atomic_inc(&nsfd_inode->i_count);
+       path.dentry->d_op = &nsfd_dentry_operations;    /* sys_nsfd() replaces this later */
+       d_instantiate(path.dentry, nsfd_inode);
+
+       file = alloc_file(&path, FMODE_READ, &nsfd_file_operations);
+       if (!file) {
+               path_put(&path);        /* give back the dentry and mount references */
+               return ERR_PTR(-ENFILE);
+       }
+       file->f_mapping = nsfd_inode->i_mapping;
+
+       file->f_pos = 0;
+       file->f_flags = O_RDONLY;
+       file->f_version = 0;
+       file->private_data = NULL;
+
+       return file;
+}
+
+static void *nsfd_getns(pid_t pid, unsigned long nstype)
+{
+       struct task_struct *task;
+       struct nsproxy *nsproxy;
+       void *ns;
+       /* Returns a referenced namespace of @pid (0 means current), or an ERR_PTR. */
+       ns = ERR_PTR(-ESRCH);
+       rcu_read_lock();        /* held across the task lookup and the nsproxy access */
+       if (pid == 0)
+               task = current;
+       else
+               task = find_task_by_vpid(pid);
+       if (!task)
+               goto out;
+       /* The caller must be allowed to ptrace-attach to the target task. */
+       ns = ERR_PTR(-EPERM);
+       if (!ptrace_may_access(task, PTRACE_MODE_ATTACH))
+               goto out;
+
+       ns = ERR_PTR(-ESRCH);
+       nsproxy = task_nsproxy(task);
+       if (!nsproxy)
+               goto out;
+
+       ns = ERR_PTR(-EINVAL);  /* stays in place if nstype matches no case below */
+       switch(nstype) {
+       case NSTYPE_NET:
+               ns = get_net(nsproxy->net_ns);
+               break;
+       }
+out:
+       rcu_read_unlock();
+       return ns;
+}
+
+SYSCALL_DEFINE2(nsfd, pid_t, pid, unsigned long, nstype)
+{
+       const struct dentry_operations *d_op;
+       struct file *file;
+       int fd;
+       void *ns;
+       /* Returns a new fd holding a reference on @pid's @nstype namespace, or -errno. */
+       d_op = nstype_dops(nstype);
+       if (!d_op)
+               return -EINVAL;
+
+       file = nsfd_getfile();
+       if (IS_ERR(file))
+               return PTR_ERR(file);
+
+       ns = nsfd_getns(pid, nstype);
+       if (IS_ERR(ns)) {
+               fput(file);
+               return PTR_ERR(ns);
+       }
+       /* Hand the reference to the dentry; the type's d_release drops it. */
+       file->f_dentry->d_fsdata = ns;
+       file->f_dentry->d_op = d_op;
+       
+       fd = get_unused_fd();
+       if (fd < 0) {
+               fput(file);
+               return fd;
+       }
+       fd_install(fd, file);
+
+       return fd;
+}
+
+
+SYSCALL_DEFINE2(setns, unsigned long, nstype, int, fd)
+{
+       struct file *file;
+       /* Move current into the @nstype namespace referenced by nsfd file @fd. */
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       file = nsfd_fget(fd, nstype);   /* validates both the fd and the type */
+       if (IS_ERR(file))
+               return PTR_ERR(file);
+
+       set_namespace(nstype, file->f_dentry->d_fsdata);
+       /* NOTE(review): set_namespace() returns void, so a failure there goes unreported. */
+       fput(file);
+       return 0;
+}
+
+
+/* Boot-time setup: register nsfd, mount it internally, create its inode. */
+static int __init nsfd_init(void)
+{
+       int error = register_filesystem(&nsfd_fs_type);
+
+       if (error)
+               goto err_exit;
+
+       nsfd_mnt = kern_mount(&nsfd_fs_type);
+       if (IS_ERR(nsfd_mnt)) {
+               error = PTR_ERR(nsfd_mnt);
+               goto err_unregister_filesystem;
+       }
+
+       nsfd_inode = nsfd_mkinode();
+       if (IS_ERR(nsfd_inode)) {
+               error = PTR_ERR(nsfd_inode);
+               goto err_mntput;
+       }
+       return 0;
+
+err_mntput:
+       mntput(nsfd_mnt);
+err_unregister_filesystem:
+       unregister_filesystem(&nsfd_fs_type);
+err_exit:
+       /* panic() emits its own prefix; a KERN_ERR level would print as text */
+       panic("nsfd_init() failed (%d)\n", error);
+}
+
+fs_initcall(nsfd_init);
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 76285e0..a4fe6eb 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -26,6 +26,7 @@
 #define ISOFS_SUPER_MAGIC      0x9660
 #define JFFS2_SUPER_MAGIC      0x72b6
 #define ANON_INODE_FS_MAGIC    0x09041934
+#define NSFD_FS_MAGIC          0x6e736664
 
 #define MINIX_SUPER_MAGIC      0x137F          /* original minix fs */
 #define MINIX_SUPER_MAGIC2     0x138F          /* minix fs, 30 char names */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 7b370c7..45f1e07 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -65,6 +65,7 @@ static inline struct nsproxy *task_nsproxy(struct task_struct 
*tsk)
 int copy_namespaces(unsigned long flags, struct task_struct *tsk);
 void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
+void set_namespace(unsigned long nstype, void *ns);
 void free_nsproxy(struct nsproxy *ns);
 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
        struct fs_struct *);
diff --git a/include/linux/nstype.h b/include/linux/nstype.h
new file mode 100644
index 0000000..3bdf856
--- /dev/null
+++ b/include/linux/nstype.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_NSTYPE_H
+#define _LINUX_NSTYPE_H
+
+#define NSTYPE_NET 0   /* network namespace; value indexes nsfd_dops[] in fs/nsfd.c */
+
+#endif /* _LINUX_NSTYPE_H */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9..574461c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -21,6 +21,7 @@
 #include <linux/pid_namespace.h>
 #include <net/net_namespace.h>
 #include <linux/ipc_namespace.h>
+#include <linux/nstype.h>
 
 static struct kmem_cache *nsproxy_cachep;
 
@@ -221,6 +222,22 @@ void exit_task_namespaces(struct task_struct *p)
        switch_task_namespaces(p, NULL);
 }
 
+/* Build an nsproxy for current whose @nstype namespace is @ns; switch to it. */
+void set_namespace(unsigned long nstype, void *ns)
+{
+       struct task_struct *tsk = current;
+       struct nsproxy *new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+       if (IS_ERR(new_nsproxy))        /* never dereference an ERR_PTR */
+               return;                 /* FIXME: void return hides the failure */
+       switch(nstype) {
+       case NSTYPE_NET:
+               put_net(new_nsproxy->net_ns);
+               new_nsproxy->net_ns = get_net(ns);
+               break;
+       }
+       switch_task_namespaces(tsk, new_nsproxy);
+}
+
 static int __init nsproxy_cache_init(void)
 {
        nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
-- 
1.6.5.2.143.g8cc62

_______________________________________________
Containers mailing list
contain...@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
Devel@openvz.org
https://openvz.org/mailman/listinfo/devel

Reply via email to