This patch allows the execution of unprivileged containers running on top of an upstream Linux kernel. The container will run as whatever UID is found in the configuration file.
Signed-off-by: Glauber Costa <[email protected]> --- include/env.h | 1 + include/types.h | 1 + src/lib/hooks_ct.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 191 insertions(+), 5 deletions(-) diff --git a/include/env.h b/include/env.h index 1628bbf..d41df2e 100644 --- a/include/env.h +++ b/include/env.h @@ -116,6 +116,7 @@ struct arg_start { vps_handler *h; void *data; env_create_FN fn; + int userns_p; /* while running in userns, there's extra sync needed */ }; struct env_create_param3; diff --git a/include/types.h b/include/types.h index ceecb93..54eb1f4 100644 --- a/include/types.h +++ b/include/types.h @@ -95,6 +95,7 @@ typedef struct vps_handler { int vzfd; /**< /dev/vzctl file descriptor. */ int stdfd; int can_join_pidns; /* can't enter otherwise */ + int can_join_userns; /* can't run non privileged otherwise */ int (*is_run)(struct vps_handler *h, envid_t veid); int (*enter)(struct vps_handler *h, envid_t veid, const char *root, int flags); int (*destroy)(struct vps_handler *h, envid_t veid); diff --git a/src/lib/hooks_ct.c b/src/lib/hooks_ct.c index 29d7eea..6bd27c1 100644 --- a/src/lib/hooks_ct.c +++ b/src/lib/hooks_ct.c @@ -66,6 +66,8 @@ static int sys_setns(int fd, int nstype) # define MS_PRIVATE (1 << 18) #endif +#define UID_GID_RANGE 100000 /* how many users per container */ + /* This function is there in GLIBC, but not in headers */ extern int pivot_root(const char * new_root, const char * put_old); @@ -138,10 +140,39 @@ static int _env_create(void *data) struct env_create_param3 create_param; int ret; - if ((ret = ct_chroot(arg->res->fs.root))) + if ((arg->userns_p != -1) && (read(arg->userns_p, &ret, sizeof(ret)) == 0)) + return -1; + + ret = ct_chroot(arg->res->fs.root); + close(arg->userns_p); + /* Probably means chroot failed */ + if (ret) return ret; - if ((ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1))) + if (arg->h->can_join_userns) { + setuid(0); + setgid(0); + /* + * We need the special flag 
"newinstance". This is a requirement + * of the userns-aware implementation of devpts as of Linux 3.9. + * Because of that special requirement, we do it here rather than + * later. + */ + mount("devpts", "/dev/pts", "devpts", 0, "newinstance"); + /* /dev/ptmx, if it even exists, would refer to the root ptmx. + * We don't want that, we want our newly created instance to contain + * all ptys. So we bind mount the root device here + */ + open("/dev/ptmx", O_RDWR|O_CREAT, 0); + mount("/dev/pts/ptmx", "/dev/ptmx", "", MS_BIND, 0); + } + + /* + * If we are using the user namespace, we will have the full capability + * set in the target namespace. So we don't need any of that. + */ + if (!arg->h->can_join_userns && + (ret = vps_set_cap(arg->veid, &arg->res->env, &arg->res->cap, 1))) return ret; fill_container_param(arg, &create_param); @@ -153,6 +184,79 @@ static int _env_create(void *data) return exec_container_init(arg, &create_param); } +static int write_uid_gid_mapping(vps_handler *h, unsigned long uid, unsigned long gid, pid_t pid) +{ + char buf[STR_SIZE]; + char map[STR_SIZE]; + int fd; + + snprintf(map, sizeof(map), "0 %ld %d", uid, UID_GID_RANGE); + snprintf(buf, sizeof(buf), "/proc/%d/uid_map", pid); + if ((fd = open(buf, O_WRONLY)) < 0) + return -1; + + if ((write(fd, map, sizeof(map)) < 0)) + return -1; + + snprintf(map, sizeof(map), "0 %ld %d", gid, UID_GID_RANGE); + snprintf(buf, sizeof(map), "/proc/%d/gid_map", pid); + if ((fd = open(buf, O_WRONLY)) < 0) + return -1; + + if ((write(fd, map, sizeof(map)) < 0)) + return -1; + + return 0; +} + +/* + * Those devices should exist in the container, and be valid device nodes with + * user access permission. But we need to be absolutely sure this is the case, + * so we will provide our own versions. That could actually happen since some + * distributions may come with emptied /dev's, waiting for udev to populate them. + * That won't happen, we do it ourselves. 
+ */ +static void create_devices(vps_handler *h, envid_t veid, const char *root) +{ + unsigned int i; + char *devices[] = { + "/dev/null", + "/dev/zero", + "/dev/random", + "/dev/urandom", + }; + + /* + * We will tolerate errors, and keep the container running, because it is + * likely we will be able to boot it to a barely functional state. But + * be vocal about it + */ + for (i = 0; i < ARRAY_SIZE(devices); i++) { + char ct_devname[STR_SIZE]; + int ret; + + ret = snprintf(ct_devname, sizeof(ct_devname), "%s/%s", root, devices[i]); + if (ret < 0) { + logger(-1, errno, "Could not allocate device string\n"); + continue; + } + + /* + * No need to be crazy about file flags. When we bind mount, the + * source permissions will be inherited. + */ + ret = open(ct_devname, O_RDWR|O_CREAT, 0); + if (ret < 0) { + logger(-1, errno, "Could not touch device %s\n", devices[i]); + continue; + } + ret = mount(devices[i], ct_devname, "", MS_BIND, 0); + if (ret < 0) + logger(-1, errno, "Could not touch device %s\n", devices[i]); + } + +} + static int ct_env_create(struct arg_start *arg) { @@ -162,7 +266,8 @@ static int ct_env_create(struct arg_start *arg) int ret; char procpath[STR_SIZE]; char ctpath[STR_SIZE]; - + int userns_p[2]; + int err; /* non-fatal */ if ((ret = ct_destroy(arg->h, arg->veid))) @@ -190,16 +295,54 @@ static int ct_env_create(struct arg_start *arg) * Belong in the setup phase */ clone_flags = SIGCHLD; - /* FIXME: USERNS is still work in progress */ clone_flags |= CLONE_NEWUTS|CLONE_NEWPID|CLONE_NEWIPC; clone_flags |= CLONE_NEWNET|CLONE_NEWNS; + if (!arg->h->can_join_userns) { + logger(-1, 0, "WARNING: Running container unprivileged. 
USER_NS not supported"); + + userns_p[0] = userns_p[1] = -1; + } else { + clone_flags |= CLONE_NEWUSER; + if (pipe(userns_p) < 0) { + logger(-1, errno, "Can not create userns pipe"); + return VZ_RESOURCE_ERROR; + } + } + arg->userns_p = userns_p[0]; + + create_devices(arg->h, arg->veid, arg->res->fs.root); + ret = clone(_env_create, child_stack, clone_flags, arg); if (ret < 0) { logger(-1, errno, "Unable to clone"); /* FIXME: remove ourselves from container first */ destroy_container(arg->veid); return VZ_RESOURCE_ERROR; + } else if (arg->h->can_join_userns) { + /* + * Now we need to write to the mapping file. It has to be us, + * since CAP_SETUID is required in the parent namespace. vzctl + * is run as root, so we should have it. But our cloned kid + * will start as the overflow uid 65534 in the new namespace. + */ + if (write_uid_gid_mapping(arg->h, *arg->res->misc.local_uid, + *arg->res->misc.local_gid, ret)) + return VZ_RESOURCE_ERROR; + + /* + * Nothing should proceed userns wide until we have the + * mapping. That creates many non-determisnitic behaviors + * since some runs will execute with the mapping already done, + * while others with the mapping off. This is particularly + * important for setuid, for instance. It will categorically + * fail if called before a mapping is in place. 
+ */ + if ((userns_p[1] != -1) && + write(userns_p[1], &err, sizeof(err)) != sizeof(err)) { + logger(-1, errno, "Unable to read from userns pipe"); + return -1; + } } snprintf(procpath, STR_SIZE, "/proc/%d/ns/net", ret); @@ -221,6 +364,7 @@ static int ct_enter(vps_handler *h, envid_t veid, const char *root, int flags) pid_t task_pid; int ret = VZ_RESOURCE_ERROR; bool joined_mnt_ns = false; + int fd; if (!h->can_join_pidns) { logger(-1, 0, "Kernel lacks setns for pid namespace"); @@ -245,18 +389,45 @@ static int ct_enter(vps_handler *h, envid_t veid, const char *root, int flags) return VZ_RESOURCE_ERROR; } + /* + * Because all namespaces are associated with an owner userns, + * and capabilities may be needed for issuing setns syscalls into + * some key target namespaces (like the mount namespace), we will + * first enter the user namespace if it is available. Only then we + * scan all others and join them as they appear + */ + if (h->can_join_userns) { + if (snprintf(path, sizeof(path), "/proc/%d/ns/user", task_pid) < 0) + goto out; + + if ((fd = open(path, O_RDONLY)) < 0) + goto out; + + if (setns(fd, CLONE_NEWUSER)) { + logger(-1, errno, "Failed to set context for user namespace"); + goto out; + } + setuid(0); + setgid(0); + } + ret = VZ_RESOURCE_ERROR; while ((ep = readdir (dp))) { - int fd; if (!strcmp(ep->d_name, ".")) continue; if (!strcmp(ep->d_name, "..")) continue; + /* already joined */ + if ((!strcmp(ep->d_name, "user"))) + continue; + if (snprintf(path, sizeof(path), "/proc/%d/ns/%s", task_pid, ep->d_name) < 0) goto out; + if ((fd = open(path, O_RDONLY)) < 0) goto out; + if (setns(fd, 0)) logger(-1, errno, "Failed to set context for %s", ep->d_name); @@ -562,6 +733,7 @@ int ct_do_open(vps_handler *h) { int ret; char path[STR_SIZE]; + char upath[STR_SIZE]; struct stat st; ret = container_init(); @@ -578,6 +750,9 @@ int ct_do_open(vps_handler *h) if (snprintf(path, sizeof(path), "/proc/%d/ns/pid", getpid()) < 0) return VZ_RESOURCE_ERROR; + if 
(snprintf(upath, sizeof(upath), "/proc/%d/ns/user", getpid()) < 0) + return VZ_RESOURCE_ERROR; + ret = mkdir(NETNS_RUN_DIR, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH); if (ret && (errno != EEXIST)) { @@ -586,6 +761,15 @@ int ct_do_open(vps_handler *h) } h->can_join_pidns = !stat(path, &st); + /* + * Being able to join the user namespace is a good indication that the + * user namespace is complete. For a long time, the user namespace + * existed, but were far away from being feature complete. When + * running in such a kernel, joining the user namespace will just + * cripple our container, since we won't be able to do anything. It is + * only good for people who are okay running containers as root + */ + h->can_join_userns = !stat(upath, &st); h->is_run = ct_is_run; h->enter = ct_enter; h->destroy = ct_destroy; -- 1.7.11.7 _______________________________________________ Devel mailing list [email protected] https://lists.openvz.org/mailman/listinfo/devel
