In order for one to use QEMU user mode emulation under a chroot, it is required to use binfmt_misc. This can be avoided by QEMU never doing a raw execve() to the host system, which is especially useful in environments where binfmt_misc can't be used.
Introduce a new option, -execve, that uses the current QEMU interpreter to intercept execve(). In addition, execve mode can also be en- and disabled using the 'QEMU_EXECVE' env var. qemu_execve() will prepend the interpreter path, similar to what binfmt_misc would do, and then pass the modified execve() to the host. It is necessary to parse hashbang scripts in that function otherwise the kernel will try to run the interpreter of a script without QEMU and get an invalid exec format error. Note that a previous incarnation of this patch was submitted a few years ago (see [0]) by Petros Angelatos as the original author who confirmed that it's OK to resubmit it. CC: [email protected] CC: [email protected] CC: [email protected] CC: [email protected] Signed-off-by: Moritz Haase <[email protected]> --- We've been using this feature internally for at least five years by now. Prior to submission, the code was updated to (hopefully) conform to the current QEMU coding style. I'd be happy to add test cases for this feature, but I'd need some pointers given that I'm a first-time contributor. Thanks! [0]: https://patchwork.kernel.org/project/qemu-devel/patch/[email protected]/ --- linux-user/linuxload.c | 119 ++++++++++++++++++++++++++++++++++-- linux-user/loader.h | 1 + linux-user/main.c | 54 ++++++++++++++++ linux-user/syscall.c | 94 ++++++++++++++++++++++++---- linux-user/user-internals.h | 1 + 5 files changed, 252 insertions(+), 17 deletions(-) diff --git a/linux-user/linuxload.c b/linux-user/linuxload.c index 85d700953e..eb1fdf3f85 100644 --- a/linux-user/linuxload.c +++ b/linux-user/linuxload.c @@ -138,15 +138,124 @@ abi_ulong loader_build_argptr(int envc, int argc, abi_ulong sp, return sp; } +int load_script_file(const char *filename, struct linux_binprm *bprm) +{ + int retval, fd; + char *i_arg = NULL, *i_name = NULL; + char **new_argv; + char *cp; + char buf[BPRM_BUF_SIZE]; + + /* Check if it is a script */ + fd = open(filename, O_RDONLY); + if (fd == -1) { + return fd; + } + + retval = read(fd, buf, BPRM_BUF_SIZE); + if (retval == -1) { + close(fd); + return retval; + } + + /* if we have less than 2 bytes, we can guess it is not executable */ + if (retval < 2) { + close(fd); + return -ENOEXEC; + } + + close(fd); + /* + * adapted from the kernel + * https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/fs/binfmt_script.c + */ + if ((buf[0] == '#') && (buf[1] == '!')) { + buf[BPRM_BUF_SIZE - 1] = '\0'; + cp = strchr(buf, '\n'); + if (cp == NULL) { + cp = buf + BPRM_BUF_SIZE - 1; + } + *cp = '\0'; + while (cp > buf) { + cp--; + if ((*cp == ' ') || (*cp == '\t')) { + *cp = '\0'; + } else { + break; + } + } + for (cp = buf + 2; (*cp == ' ') || (*cp == '\t'); cp++) { + /* nothing */ ; + } + if (*cp == '\0') { + return -ENOEXEC; /* No interpreter name found */ + } + i_name = cp; + i_arg = NULL; + for ( ; *cp && (*cp != ' ') && (*cp != '\t'); cp++) { + /* nothing */ ; + } + while ((*cp == ' ') || (*cp == '\t')) { + *cp++ = '\0'; + } + + new_argv = NULL; + if (*cp) { + i_arg = cp; + } + + if (i_arg) { + new_argv = g_alloca(sizeof(void *)); + new_argv[0] = i_arg; + } + bprm->argv = new_argv; + bprm->filename = i_name; + } else { + return 1; + } + return 0; +} + int loader_exec(int fdexec, const char *filename, char **argv, char **envp, struct image_info *infop, struct linux_binprm *bprm) { - int retval; + int retval, fd, offset = 1, argc = count(argv); + char **new_argv; + + retval = load_script_file(filename, bprm); + if (retval == 0) { + if (bprm->argv != NULL) { + offset = 2; + } + new_argv = g_alloca((argc + offset + 1) * sizeof(void *)); + + new_argv[0] = (char *)filename; + if (bprm->argv != NULL) { + new_argv[1] = bprm->argv[0]; + } + /* Copy the original arguments with offset */ + for (int i = 0; i < argc; i++) { + new_argv[i + offset] = argv[i]; + } + new_argv[argc + offset] = NULL; + + bprm->argc = count(new_argv); + bprm->argv = new_argv; + fd = open(bprm->filename, O_RDONLY); + if (fd < 0) { + printf("Error while loading %s: %s\n", + bprm->filename, + strerror(errno)); + _exit(EXIT_FAILURE); + } + bprm->src.fd = fd; + } else { + bprm->filename = (char *)filename; + bprm->argc = count(argv); + bprm->argv = argv; + bprm->src.fd = fdexec; + } - bprm->src.fd = fdexec; - bprm->filename = (char *)filename; - bprm->argc = count(argv); - bprm->argv = argv; bprm->envc = count(envp); bprm->envp = envp; diff --git a/linux-user/loader.h b/linux-user/loader.h index da9ad28db5..2beedc5f0d 100644 --- a/linux-user/loader.h +++ b/linux-user/loader.h @@ -90,6 +90,7 @@ int loader_exec(int fdexec, const char *filename, char **argv, char **envp, uint32_t get_elf_eflags(int fd); int load_elf_binary(struct linux_binprm *bprm, struct image_info *info); int load_flt_binary(struct linux_binprm *bprm, struct image_info *info); +int load_script_file(const char *filename, struct linux_binprm *bprm); abi_long memcpy_to_target(abi_ulong dest, const void *src, unsigned long len); diff --git a/linux-user/main.c b/linux-user/main.c index db751c0757..3a8a748fda 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -128,6 +128,7 @@ static void usage(int exitcode); static const char *interp_prefix = CONFIG_QEMU_INTERP_PREFIX; const char *qemu_uname_release; +const char *qemu_execve_path; #if !defined(TARGET_DEFAULT_STACK_SIZE) /* XXX: on x86 MAP_GROWSDOWN only works if ESP <= address + 32, so @@ -367,6 +368,56 @@ static void handle_arg_guest_base(const char *arg) have_guest_base = true; } +static void handle_arg_execve(const char *arg) +{ + const char *execfn; + char buf[PATH_MAX]; + char *ret; + int len; + + /* + * Since the 'execve' command line option has no argument ('has_arg' is + * 'false'), this function will always receive NULL for 'arg' during + * argument parsing. If 'arg' is non-NULL, we are being called during env + * var handling, because QEMU_EXECVE is set. + */ + if (arg != NULL) { + /* + * If the env var is set, check whether its value is '0'. In this case, + * we don't want to enable 'execve' mode and thus bail out. Please note + * that an empty value will NOT disable 'execve' mode. + */ + if (!strcmp(arg, "0")) { + return; + } + } + + /* try getauxval() */ + execfn = (const char *)qemu_getauxval(AT_EXECFN); + + if (execfn != 0) { + ret = realpath(execfn, buf); + + if (ret != NULL) { + qemu_execve_path = g_strdup(buf); + return; + } + } + + /* try /proc/self/exe */ + len = readlink("/proc/self/exe", buf, sizeof(buf) - 1); + + if (len != -1) { + buf[len] = '\0'; + qemu_execve_path = g_strdup(buf); + return; + } + + fprintf(stderr, "qemu_execve: unable to determine interpreter's path\n"); + exit(EXIT_FAILURE); +} + + static void handle_arg_reserved_va(const char *arg) { char *p; @@ -497,6 +548,9 @@ static const struct qemu_argument arg_table[] = { "uname", "set qemu uname release string to 'uname'"}, {"B", "QEMU_GUEST_BASE", true, handle_arg_guest_base, "address", "set guest_base address to 'address'"}, + {"execve", "QEMU_EXECVE", false, handle_arg_execve, + "", "use this interpreter when a process calls execve() " + "(disabled if env var is '0', enabled for all other values / when empty)"}, {"R", "QEMU_RESERVED_VA", true, handle_arg_reserved_va, "size", "reserve 'size' bytes for guest virtual address space"}, {"t", "QEMU_RTSIG_MAP", true, handle_arg_rtsig_map, diff --git a/linux-user/syscall.c b/linux-user/syscall.c index 2060e561a2..bf9e084975 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -127,6 +127,7 @@ #include <libdrm/drm.h> #include <libdrm/i915_drm.h> #endif +#include <linux/binfmts.h> #include "linux_loop.h" #include "uname.h" @@ -8726,6 +8727,86 @@ ssize_t do_guest_readlink(const char *pathname, char *buf, size_t bufsiz) return ret; } +static int qemu_execve(const char *filename, char *argv[], + char *envp[]) +{ + char **new_argv; + const char *new_filename; + int argc, ret, i, offset = 3; + struct linux_binprm *bprm; + + /* normal execve case */ + if (qemu_execve_path == NULL || *qemu_execve_path == 0) { + new_filename = filename; + new_argv = argv; + } else { + new_filename = qemu_execve_path; + + for (argc = 0; argv[argc] != NULL; argc++) { + /* nothing */ ; + } + + bprm = g_alloca(sizeof(struct linux_binprm)); + ret = load_script_file(filename, bprm); + + if (ret < 0) { + if (ret == -1) { + return get_errno(ret); + } else { + return -host_to_target_errno(ENOEXEC); + } + } + + if (ret == 0) { + if (bprm->argv != NULL) { + offset = 5; + } else { + offset = 4; + } + } + + /* Need to store execve argument */ + offset++; + + new_argv = g_alloca((argc + offset + 1) * sizeof(void *)); + + /* Copy the original arguments with offset */ + for (i = 0; i < argc; i++) { + new_argv[i + offset] = argv[i]; + } + + new_argv[0] = g_strdup(qemu_execve_path); + new_argv[1] = g_strdup("-execve"); /* Add execve argument */ + new_argv[2] = g_strdup("-0"); + new_argv[offset] = g_strdup(filename); + new_argv[argc + offset] = NULL; + + if (ret == 0) { + new_argv[3] = bprm->filename; + new_argv[4] = bprm->filename; + + if (bprm->argv != NULL) { + new_argv[5] = bprm->argv[0]; + } + } else { + new_argv[3] = argv[0]; + } + } + + /* + * Although execve() is not an interruptible syscall it is + * a special case where we must use the safe_syscall wrapper: + * if we allow a signal to happen before we make the host + * syscall then we will 'lose' it, because at the point of + * execve the process leaves QEMU's control. So we use the + * safe syscall wrapper to ensure that we either take the + * signal as a guest signal, or else it does not happen + * before the execve completes and makes it the other + * program's problem. + */ + return safe_execve(new_filename, new_argv, envp); +} + static int do_execv(CPUArchState *cpu_env, int dirfd, abi_long pathname, abi_long guest_argp, abi_long guest_envp, int flags, bool is_execveat) @@ -8791,17 +8872,6 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, } *q = NULL; - /* - * Although execve() is not an interruptible syscall it is - * a special case where we must use the safe_syscall wrapper: - * if we allow a signal to happen before we make the host - * syscall then we will 'lose' it, because at the point of - * execve the process leaves QEMU's control. So we use the - * safe syscall wrapper to ensure that we either take the - * signal as a guest signal, or else it does not happen - * before the execve completes and makes it the other - * program's problem. - */ p = lock_user_string(pathname); if (!p) { goto execve_efault; @@ -8813,7 +8883,7 @@ static int do_execv(CPUArchState *cpu_env, int dirfd, } ret = is_execveat ? safe_execveat(dirfd, exe, argp, envp, flags) - : safe_execve(exe, argp, envp); + : qemu_execve(exe, argp, envp); ret = get_errno(ret); unlock_user(p, pathname, 0); diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h index 7099349ec8..0fd97cdb4f 100644 --- a/linux-user/user-internals.h +++ b/linux-user/user-internals.h @@ -69,6 +69,7 @@ abi_long get_errno(abi_long ret); const char *target_strerror(int err); int get_osversion(void); void init_qemu_uname_release(void); +extern const char *qemu_execve_path; void fork_start(void); void fork_end(pid_t pid);
