On 5 February 2015 at 12:48, Vasiliy Tolstov <v.tols...@selfip.ru> wrote:
>
> 2015-02-05 12:44 GMT+03:00 Alban Crequy <alban.cre...@gmail.com>:
>>
>> Manual page namespaces(7):
>>
>>        Creation of new namespaces using clone(2) and unshare(2) in most
>> cases
>>        requires the CAP_SYS_ADMIN capability.  User namespaces are the
>>        exception: since  Linux 3.8, no privilege is required to create a
>> user
>>        namespace.
>
>
> So as i understand i can't create full featured container with network under
> non root user (and not have cap_sys_admin)

Capabilities like CAP_SYS_ADMIN no longer have a global meaning; they
refer to operations a process can do *in its current namespace*. An
unprivileged process (uid!=0, without cap_sys_admin) can join a user
namespace and get uid=0 & cap_sys_admin for operations inside the user
namespace, but it will still have uid!=0 & !cap_sys_admin for
operations in the parent user namespace.

user_namespaces(7) contains userns_child_exec.c and it creates a fully
featured container with network without being root. (I attached a
patched version I was testing)

# # Because I'm using the kernel patched by my distribution
# echo 1 > /proc/sys/kernel/unprivileged_userns_clone

$ gcc -lcap -o userns_child_exec userns_child_exec.c

Here it seems to work:

alban@alban:~$ ls -l /tmp/userns_child_exec
-rwxr-xr-x 1 alban alban 14488 Feb  5 23:24 /tmp/userns_child_exec
alban@alban:~$ id -u
1000
alban@alban:~$ ip link # ---> will show lo, eth0, wlan0...
alban@alban:~$ /tmp/userns_child_exec -p -m -U -M '0 1000 1' -G '0
1000 1' -n bash
About to exec bash
root@alban:~# id
uid=0(root) gid=0(root) groups=0(root),65534(nogroup)
root@alban:~# ip link # ---> only lo visible in this namespace

Cheers,
Alban
--- userns_child_exec.orig.c	2015-02-05 23:20:19.208741366 +0100
+++ userns_child_exec.c	2015-01-30 17:01:56.948493001 +0100
@@ -108,6 +108,30 @@
     close(fd);
 }
 
+static void
+write_file(char *content, char *path)
+{
+    int fd;
+    size_t content_len;
+
+    content_len = strlen(content);
+
+    fd = open(path, O_RDWR);
+    if (fd == -1) {
+        fprintf(stderr, "ERROR: open %s: %s\n", path,
+                strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    if (write(fd, content, content_len) != content_len) {
+        fprintf(stderr, "ERROR: write %s: %s\n", content,
+                strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    close(fd);
+}
+
 static int              /* Start function for cloned child */
 childFunc(void *arg)
 {
@@ -149,6 +173,7 @@
     const int MAP_BUF_SIZE = 100;
     char map_buf[MAP_BUF_SIZE];
     char map_path[PATH_MAX];
+    char groups_path[PATH_MAX];
 
     /* Parse command-line options. The initial '+' character in
        the final getopt() argument prevents GNU-style permutation
@@ -225,6 +250,11 @@
         update_map(uid_map, map_path);
     }
     if (gid_map != NULL || map_zero) {
+        snprintf(groups_path, PATH_MAX, "/proc/%ld/setgroups",
+                (long) child_pid);
+        write_file("deny\n", groups_path);
+    }
+    if (gid_map != NULL || map_zero) {
         snprintf(map_path, PATH_MAX, "/proc/%ld/gid_map",
                 (long) child_pid);
         if (map_zero) {
/* userns_child_exec.c

   Licensed under GNU General Public License v2 or later

   Create a child process that executes a shell command in new
   namespace(s); allow UID and GID mappings to be specified when
   creating a user namespace.
*/
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <signal.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <errno.h>

/* A simple error-handling function: print an error message based
   on the value in 'errno' and terminate the calling process.
   'msg' is handed to perror() as the message prefix, after which the
   process exits with status EXIT_FAILURE. The do { } while (0)
   wrapper makes the macro expand to a single statement. */

#define errExit(msg)    do { perror(msg); exit(EXIT_FAILURE); \
                        } while (0)

/* Arguments passed from main() to the cloned child's start function
   (childFunc) through the 'arg' pointer given to clone() */

struct child_args {
    char **argv;        /* Command to be executed by child, with args;
                           handed unchanged to execvp() */
    int    pipe_fd[2];  /* Pipe used to synchronize parent and child;
                           filled in by pipe(): [0] is the read end,
                           [1] is the write end */
};

/* Nonzero when the -v option was given; makes main() print progress
   messages */

static int verbose;

/* Print the command-line help text on stderr and terminate the
   process with status EXIT_FAILURE. 'pname' is the program name shown
   in the synopsis line. This function does not return. */

static void
usage(char *pname)
{
    /* Option summary and explanatory notes. Each entry is emitted
       with a four-space indent in front of it. */

    static const char *const help_lines[] = {
        "-i          New IPC namespace\n",
        "-m          New mount namespace\n",
        "-n          New network namespace\n",
        "-p          New PID namespace\n",
        "-u          New UTS namespace\n",
        "-U          New user namespace\n",
        "-M uid_map  Specify UID map for user namespace\n",
        "-G gid_map  Specify GID map for user namespace\n",
        "-z          Map user's UID and GID to 0 in user namespace\n",
        "            (equivalent to: -M '0 <uid> 1' -G '0 <gid> 1')\n",
        "-v          Display verbose messages\n",
        "\n",
        "If -z, -M, or -G is specified, -U is required.\n",
        "It is not permitted to specify both -z and either -M or -G.\n",
        "\n",
        "Map strings for -M and -G consist of records of the form:\n",
        "\n",
        "    ID-inside-ns   ID-outside-ns   len\n",
        "\n",
        "A map string can contain multiple records, separated"
            " by commas;\n",
        "the commas are replaced by newlines before writing"
            " to map files.\n",
    };
    const size_t line_count = sizeof(help_lines) / sizeof(help_lines[0]);
    size_t k;

    fprintf(stderr, "Usage: %s [options] cmd [arg...]\n\n", pname);
    fputs("Create a child process that executes a shell "
          "command in a new user namespace,\n"
          "and possibly also other new namespace(s).\n\n", stderr);
    fputs("Options can be:\n\n", stderr);

    for (k = 0; k < line_count; k++)
        fprintf(stderr, "    %s", help_lines[k]);

    exit(EXIT_FAILURE);
}

/* Write a UID or GID mapping to the file 'map_file'.

   'mapping' holds one or more records of the form:

       ID_inside-ns    ID-outside-ns   length

   The records written to the file are newline-delimited. Typing
   newlines on a command line is awkward, so the caller may separate
   records with commas instead: every comma in 'mapping' is rewritten
   to a newline, in place, before the string is written out.

   If 'map_file' cannot be opened, or the mapping is not written in
   full, an error is printed on stderr and the process exits with
   EXIT_FAILURE. */

static void
update_map(char *mapping, char *map_file)
{
    size_t map_len = strlen(mapping);   /* Length of 'mapping' */
    char *comma;
    int map_fd;

    /* Turn the command-line record separators into newlines */

    for (comma = strchr(mapping, ','); comma != NULL;
            comma = strchr(comma + 1, ','))
        *comma = '\n';

    map_fd = open(map_file, O_RDWR);
    if (map_fd == -1) {
        fprintf(stderr, "ERROR: open %s: %s\n", map_file,
                strerror(errno));
        exit(EXIT_FAILURE);
    }

    /* Anything other than a complete write is treated as an error */

    if ((size_t) write(map_fd, mapping, map_len) != map_len) {
        fprintf(stderr, "ERROR: write %s: %s\n", map_file,
                strerror(errno));
        exit(EXIT_FAILURE);
    }

    close(map_fd);
}

/* Write the string 'content' to the already existing file 'path'.

   The file is opened without O_CREAT or O_TRUNC, so it must exist and
   the data is written starting at offset 0. 'content' is written
   without its terminating null byte.

   On any failure (open, write error, incomplete write) a message
   naming 'path' is printed on stderr and the process exits with
   EXIT_FAILURE. */

static void
write_file(const char *content, const char *path)
{
    int fd;
    size_t content_len;
    ssize_t written;

    content_len = strlen(content);

    fd = open(path, O_RDWR);
    if (fd == -1) {
        fprintf(stderr, "ERROR: open %s: %s\n", path,
                strerror(errno));
        exit(EXIT_FAILURE);
    }

    written = write(fd, content, content_len);
    if (written == -1) {
        /* Report the file that failed, not the data being written */
        fprintf(stderr, "ERROR: write %s: %s\n", path,
                strerror(errno));
        exit(EXIT_FAILURE);
    }
    if ((size_t) written != content_len) {
        /* errno is not set by a short write, so don't print it */
        fprintf(stderr, "ERROR: write %s: short write\n", path);
        exit(EXIT_FAILURE);
    }

    close(fd);
}

/* Start function for the cloned child.

   'arg' points to a struct child_args holding the command to run and
   the pipe shared with the parent. The child blocks until the parent
   closes the write end of the pipe, then replaces itself with the
   command via execvp(). The function returns only by terminating the
   process: with EXIT_FAILURE if the pipe yields anything other than
   end-of-file or if execvp() fails. */

static int              /* Start function for cloned child */
childFunc(void *arg)
{
    struct child_args *args = (struct child_args *) arg;
    char ch;

    /* Wait until the parent has updated the UID and GID mappings.
       See the comment in main(). We wait for end of file on a
       pipe that will be closed by the parent process once it has
       updated the mappings. */

    close(args->pipe_fd[1]);    /* Close our descriptor for the write
                                   end of the pipe so that we see EOF
                                   when parent closes its descriptor */
    if (read(args->pipe_fd[0], &ch, 1) != 0) {
        fprintf(stderr,
                "Failure in child: read from pipe returned != 0\n");
        exit(EXIT_FAILURE);
    }

    /* The pipe has served its purpose; close the read end so that the
       descriptor is not inherited by the command we are about to exec */

    close(args->pipe_fd[0]);

    /* Execute a shell command */

    printf("About to exec %s\n", args->argv[0]);
    fflush(stdout);             /* A successful exec discards the stdio
                                   buffer, so push the message out now */
    execvp(args->argv[0], args->argv);
    errExit("execvp");
}

#define STACK_SIZE (1024 * 1024)

static char child_stack[STACK_SIZE];    /* Space for child's stack */

/* Parse the options, clone a child into the requested namespace(s),
   write the child's UID/GID maps from the parent, release the child
   so that it can exec the command, and wait for it to terminate.

   Exits with EXIT_SUCCESS once the child has been waited for, and
   with EXIT_FAILURE on a usage error or on failure of any step. */

int
main(int argc, char *argv[])
{
    enum { MAP_BUF_SIZE = 100 };        /* Room for "0 <id> 1" */
    int flags, opt, map_zero;
    pid_t child_pid;
    struct child_args args;
    char *uid_map, *gid_map;
    char map_buf[MAP_BUF_SIZE];
    char map_path[PATH_MAX];
    char groups_path[PATH_MAX];

    /* Parse command-line options. The initial '+' character in
       the final getopt() argument prevents GNU-style permutation
       of command-line options. That's useful, since sometimes
       the 'command' to be executed by this program itself
       has command-line options. We don't want getopt() to treat
       those as options to this program. */

    flags = 0;
    verbose = 0;
    gid_map = NULL;
    uid_map = NULL;
    map_zero = 0;
    while ((opt = getopt(argc, argv, "+imnpuUM:G:zv")) != -1) {
        switch (opt) {
        case 'i': flags |= CLONE_NEWIPC;        break;
        case 'm': flags |= CLONE_NEWNS;         break;
        case 'n': flags |= CLONE_NEWNET;        break;
        case 'p': flags |= CLONE_NEWPID;        break;
        case 'u': flags |= CLONE_NEWUTS;        break;
        case 'v': verbose = 1;                  break;
        case 'z': map_zero = 1;                 break;
        case 'M': uid_map = optarg;             break;
        case 'G': gid_map = optarg;             break;
        case 'U': flags |= CLONE_NEWUSER;       break;
        default:  usage(argv[0]);
        }
    }

    /* -M or -G without -U is nonsensical */

    if (((uid_map != NULL || gid_map != NULL || map_zero) &&
                !(flags & CLONE_NEWUSER)) ||
            (map_zero && (uid_map != NULL || gid_map != NULL)))
        usage(argv[0]);

    /* A command is mandatory: without one, argv[optind] is NULL and
       the child would hand a null pointer to printf() and execvp() */

    if (optind >= argc)
        usage(argv[0]);

    args.argv = &argv[optind];

    /* We use a pipe to synchronize the parent and child, in order to
       ensure that the parent sets the UID and GID maps before the child
       calls execve(). This ensures that the child maintains its
       capabilities during the execve() in the common case where we
       want to map the child's effective user ID to 0 in the new user
       namespace. Without this synchronization, the child would lose
       its capabilities if it performed an execve() with nonzero
       user IDs (see the capabilities(7) man page for details of the
       transformation of a process's capabilities during execve()). */

    if (pipe(args.pipe_fd) == -1)
        errExit("pipe");

    /* Create the child in new namespace(s) */

    child_pid = clone(childFunc, child_stack + STACK_SIZE,
                      flags | SIGCHLD, &args);
    if (child_pid == -1)
        errExit("clone");

    /* Parent falls through to here */

    if (verbose)
        printf("%s: PID of child created by clone() is %ld\n",
                argv[0], (long) child_pid);

    /* Update the UID and GID maps in the child */

    if (uid_map != NULL || map_zero) {
        snprintf(map_path, PATH_MAX, "/proc/%ld/uid_map",
                (long) child_pid);
        if (map_zero) {
            snprintf(map_buf, MAP_BUF_SIZE, "0 %ld 1", (long) getuid());
            uid_map = map_buf;
        }
        update_map(uid_map, map_path);
    }
    if (gid_map != NULL || map_zero) {

        /* "deny" has to be written to the child's setgroups file
           before its gid_map is written. The file only exists on
           kernels that provide it (Linux 3.19 and later); where it is
           absent there is nothing to deny, so skip the step rather
           than fail. Any other access() error is reported. */

        snprintf(groups_path, PATH_MAX, "/proc/%ld/setgroups",
                (long) child_pid);
        if (access(groups_path, F_OK) == 0) {
            write_file("deny\n", groups_path);
        } else if (errno != ENOENT) {
            fprintf(stderr, "ERROR: access %s: %s\n", groups_path,
                    strerror(errno));
            exit(EXIT_FAILURE);
        }

        snprintf(map_path, PATH_MAX, "/proc/%ld/gid_map",
                (long) child_pid);
        if (map_zero) {
            snprintf(map_buf, MAP_BUF_SIZE, "0 %ld 1", (long) getgid());
            gid_map = map_buf;
        }
        update_map(gid_map, map_path);
    }

    /* Close the write end of the pipe, to signal to the child that we
       have updated the UID and GID maps */

    close(args.pipe_fd[1]);

    if (waitpid(child_pid, NULL, 0) == -1)      /* Wait for child */
        errExit("waitpid");

    if (verbose)
        printf("%s: terminating\n", argv[0]);

    exit(EXIT_SUCCESS);
}
_______________________________________________
systemd-devel mailing list
systemd-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/systemd-devel

Reply via email to