On 03/22/2013 03:48 AM, Glauber Costa wrote:
This patch allows the execution of unprivileged containers running on top
of an upstream Linux Kernel. We will run at whatever UID is found in the
configuration file.

Signed-off-by: Glauber Costa <[email protected]>
---
  include/env.h      |   1 +
  include/types.h    |   1 +
  src/lib/hooks_ct.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++--
  3 files changed, 191 insertions(+), 5 deletions(-)

diff --git a/include/env.h b/include/env.h
index 1628bbf..d41df2e 100644
--- a/include/env.h
+++ b/include/env.h
@@ -116,6 +116,7 @@ struct arg_start {
        vps_handler *h;
        void *data;
        env_create_FN fn;
+       int userns_p; /* while running in userns, there's extra sync needed */
  };
struct env_create_param3;
diff --git a/include/types.h b/include/types.h
index ceecb93..54eb1f4 100644
--- a/include/types.h
+++ b/include/types.h
@@ -95,6 +95,7 @@ typedef struct vps_handler {
        int vzfd;       /**< /dev/vzctl file descriptor. */
        int stdfd;
        int can_join_pidns; /* can't enter otherwise */
+       int can_join_userns; /* can't run non privileged otherwise */
        int (*is_run)(struct vps_handler *h, envid_t veid);
        int (*enter)(struct vps_handler *h, envid_t veid, const char *root, int 
flags);
        int (*destroy)(struct vps_handler *h, envid_t veid);
diff --git a/src/lib/hooks_ct.c b/src/lib/hooks_ct.c
index 29d7eea..6bd27c1 100644
--- a/src/lib/hooks_ct.c
+++ b/src/lib/hooks_ct.c
@@ -66,6 +66,8 @@ static int sys_setns(int fd, int nstype)
  # define MS_PRIVATE (1 << 18)
  #endif
+#define UID_GID_RANGE 100000 /* how many users per container */
+
  /* This function is there in GLIBC, but not in headers */
  extern int pivot_root(const char * new_root, const char * put_old);
@@ -138,10 +140,39 @@ static int _env_create(void *data)
        struct env_create_param3 create_param;
        int ret;
- if ((ret = ct_chroot(arg->res->fs.root)))
+       if ((arg->userns_p != -1) && (read(arg->userns_p, &ret, sizeof(ret)) == 
0))
+               return -1;
+
+       ret = ct_chroot(arg->res->fs.root);
+       close(arg->userns_p);
+       /* Probably means chroot failed */
+       if (ret)
                return ret;
- if ((ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1)))
+       if (arg->h->can_join_userns) {
+               setuid(0);
+               setgid(0);
+               /*
+                * We need the special flag "newinstance". This is a requirement
+                * of the userns-aware implementation of devpts as of Linux 3.9.
+                * Because of that special requirement, we do it here rather 
than
+                * later.
+                */
+               mount("devpts", "/dev/pts", "devpts", 0, "newinstance");
+               /* /dev/ptmx, if it even exists, would refer to the root ptmx.
+                * We don't want that, we want our newly created instance to 
contain
+                * all ptys. So we bind mount the root device here
+                */
+               open("/dev/ptmx", O_RDWR|O_CREAT, 0);
+               mount("/dev/pts/ptmx", "/dev/ptmx", "", MS_BIND, 0);
+       }
+
+       /*
+        * If we are using the user namespace, we will have the full capability
+        * set in the target namespace. So we don't need any of that.
+        */
+       if (!arg->h->can_join_userns &&
+               (ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 
1)))
                return ret;
fill_container_param(arg, &create_param);
@@ -153,6 +184,79 @@ static int _env_create(void *data)
        return exec_container_init(arg, &create_param);
  }
+static int write_uid_gid_mapping(vps_handler *h, unsigned long uid, unsigned long gid, pid_t pid)
+{
+       char buf[STR_SIZE];
+       char map[STR_SIZE];
+       int fd;
+
+       snprintf(map, sizeof(map), "0 %ld %d", uid, UID_GID_RANGE);
+       snprintf(buf, sizeof(buf), "/proc/%d/uid_map", pid);
+       if ((fd = open(buf, O_WRONLY)) < 0)
+               return -1;
+
+       if ((write(fd, map, sizeof(map)) < 0))
+               return -1;

You write the whole STR_SIZE bytes of the buffer here, while just strlen(map) (or the value returned by snprintf()) should be enough.

+
+       snprintf(map, sizeof(map), "0 %ld %d", gid, UID_GID_RANGE);
+       snprintf(buf, sizeof(map), "/proc/%d/gid_map", pid);
+       if ((fd = open(buf, O_WRONLY)) < 0)
+               return -1;
+
+       if ((write(fd, map, sizeof(map)) < 0))
+               return -1;

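Ditto: write only strlen(map) bytes (or the value returned by snprintf()) rather than the whole STR_SIZE buffer.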

+

1. Close the files you opened.

2. You can reuse write_val() from src/lib/env.c, or maybe use fopen()/fprintf()/fclose(). Up to you.

+       return 0;
+}
+
+/*
+ * Those devices should exist in the container, and be valid device nodes with
+ * user access permission. But we need to be absolutely sure this is the case,
+ * so we will provide our own versions. That could actually happen since some
+ * distributions may come with emptied /dev's, waiting for udev to populate 
them.
+ * That won't happen, we do it ourselves.
+ */
+static void create_devices(vps_handler *h, envid_t veid, const char *root)
+{
+       unsigned int i;
+       char *devices[] = {
+               "/dev/null",
+               "/dev/zero",
+               "/dev/random",
+               "/dev/urandom",
+       };
+
+       /*
+        * We will tolerate errors, and keep the container running, because it 
is
+        * likely we will be able to boot it to a barely functional state. But
+        * be vocal about it
+        */
+       for (i = 0; i < ARRAY_SIZE(devices); i++) {
+               char ct_devname[STR_SIZE];
+               int ret;
+
+               ret = snprintf(ct_devname, sizeof(ct_devname), "%s/%s", root, 
devices[i]);
+               if (ret < 0) {
+                       logger(-1, errno, "Could not allocate device string\n");
+                       continue;
+               }
+
+               /*
+                * No need to be crazy about file flags. When we bind mount, the
+                * source permissions will be inherited.
+                */
+               ret = open(ct_devname, O_RDWR|O_CREAT, 0);
+               if (ret < 0) {
+                       logger(-1, errno, "Could not touch device %s\n", 
devices[i]);
+                       continue;
+               }
+               ret = mount(devices[i], ct_devname, "", MS_BIND, 0);
+               if (ret < 0)
+                       logger(-1, errno, "Could not touch device %s\n", 
devices[i]);

Missing close(): the file descriptor returned by open() above is never closed.

+       }
+
+}
+
  static int ct_env_create(struct arg_start *arg)
  {
@@ -162,7 +266,8 @@ static int ct_env_create(struct arg_start *arg)
        int ret;
        char procpath[STR_SIZE];
        char ctpath[STR_SIZE];
-
+       int userns_p[2];
+       int err;
/* non-fatal */
        if ((ret = ct_destroy(arg->h, arg->veid)))
@@ -190,16 +295,54 @@ static int ct_env_create(struct arg_start *arg)
         * Belong in the setup phase
         */
        clone_flags = SIGCHLD;
-       /* FIXME: USERNS is still work in progress */
        clone_flags |= CLONE_NEWUTS|CLONE_NEWPID|CLONE_NEWIPC;
        clone_flags |= CLONE_NEWNET|CLONE_NEWNS;
+ if (!arg->h->can_join_userns) {
+               logger(-1, 0, "WARNING: Running container unprivileged. USER_NS not 
supported");
+
+               userns_p[0] = userns_p[1] = -1;
+       } else {
+               clone_flags |= CLONE_NEWUSER;
+               if (pipe(userns_p) < 0) {
+                       logger(-1, errno, "Can not create userns pipe");
+                       return VZ_RESOURCE_ERROR;
+               }
+       }
+       arg->userns_p = userns_p[0];

It would be good to have an explicit close() here.

+
+       create_devices(arg->h, arg->veid, arg->res->fs.root);
+
        ret = clone(_env_create, child_stack, clone_flags, arg);
        if (ret  < 0) {
                logger(-1, errno, "Unable to clone");
                /* FIXME: remove ourselves from container first */
                destroy_container(arg->veid);
                return VZ_RESOURCE_ERROR;
+       } else if (arg->h->can_join_userns) {
+               /*
+                * Now we need to write to the mapping file. It has to be us,
+                * since CAP_SETUID is required in the parent namespace. vzctl
+                * is run as root, so we should have it. But our cloned kid
+                * will start as the overflow uid 65534 in the new namespace.
+                */
+               if (write_uid_gid_mapping(arg->h, *arg->res->misc.local_uid,
+                                         *arg->res->misc.local_gid, ret))
+                       return VZ_RESOURCE_ERROR;
+
+               /*
+                * Nothing should proceed userns wide until we have the
+                * mapping.  That creates many non-determisnitic behaviors
+                * since some runs will execute with the mapping already done,
+                * while others with the mapping off. This is particularly
+                * important for setuid, for instance. It will categorically
+                * fail if called before a mapping is in place.
+                */
+               if ((userns_p[1] != -1) &&
+                       write(userns_p[1], &err, sizeof(err)) != sizeof(err)) {
+                       logger(-1, errno, "Unable to read from userns pipe");
+                       return -1;

Return a proper VZ_* error code here (VZ_SOME_ERROR_CODE), not -1.

+               }
        }
snprintf(procpath, STR_SIZE, "/proc/%d/ns/net", ret);
@@ -221,6 +364,7 @@ static int ct_enter(vps_handler *h, envid_t veid, const 
char *root, int flags)
        pid_t task_pid;
        int ret = VZ_RESOURCE_ERROR;
        bool joined_mnt_ns = false;
+       int fd;
if (!h->can_join_pidns) {
                logger(-1, 0, "Kernel lacks setns for pid namespace");
@@ -245,18 +389,45 @@ static int ct_enter(vps_handler *h, envid_t veid, const 
char *root, int flags)
                return VZ_RESOURCE_ERROR;
        }
+ /*
+        * Because all namespaces are associated with an owner userns,
+        * and capabilities may be needed for issuing setns syscalls into
+        * some key target namespaces (like the mount namespace), we will
+        * first enter the user namespace if it is available. Only then we
+        * scan all others and join them as they appear
+        */
+       if (h->can_join_userns) {
+               if (snprintf(path, sizeof(path), "/proc/%d/ns/user", task_pid) 
< 0)
+                       goto out;
+
+               if ((fd = open(path, O_RDONLY)) < 0)
+                       goto out;
+
+               if (setns(fd, CLONE_NEWUSER)) {
+                       logger(-1, errno, "Failed to set context for user 
namespace");
+                       goto out;
+               }

Missing close(fd) here: the descriptor opened above is not closed after setns().

+               setuid(0);
+               setgid(0);
+       }
+
        ret = VZ_RESOURCE_ERROR;
        while ((ep = readdir (dp))) {
-               int fd;
                if (!strcmp(ep->d_name, "."))
                        continue;
                if (!strcmp(ep->d_name, ".."))
                        continue;
+ /* already joined */
+               if ((!strcmp(ep->d_name, "user")))
+                       continue;
+
                if (snprintf(path, sizeof(path), "/proc/%d/ns/%s", task_pid, 
ep->d_name) < 0)
                        goto out;
+
                if ((fd = open(path, O_RDONLY)) < 0)
                        goto out;
+
                if (setns(fd, 0))
                        logger(-1, errno, "Failed to set context for %s", 
ep->d_name);
@@ -562,6 +733,7 @@ int ct_do_open(vps_handler *h)
  {
        int ret;
        char path[STR_SIZE];
+       char upath[STR_SIZE];
        struct stat st;
ret = container_init();
@@ -578,6 +750,9 @@ int ct_do_open(vps_handler *h)
        if (snprintf(path, sizeof(path), "/proc/%d/ns/pid", getpid()) < 0)
                return VZ_RESOURCE_ERROR;
+ if (snprintf(upath, sizeof(upath), "/proc/%d/ns/user", getpid()) < 0)
+               return VZ_RESOURCE_ERROR;
+
        ret = mkdir(NETNS_RUN_DIR, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
if (ret && (errno != EEXIST)) {
@@ -586,6 +761,15 @@ int ct_do_open(vps_handler *h)
        }
h->can_join_pidns = !stat(path, &st);
+       /*
+        * Being able to join the user namespace is a good indication that the
+        * user namespace is complete. For a long time, the user namespace
+        * existed, but were far away from being feature complete.  When
+        * running in such a kernel, joining the user namespace will just
+        * cripple our container, since we won't be able to do anything. It is
+        * only good for people who are okay running containers as root
+        */
+       h->can_join_userns = !stat(upath, &st);
        h->is_run = ct_is_run;
        h->enter = ct_enter;
        h->destroy = ct_destroy;

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to