[PATCH_v4.1 1/3] Make call_usermodehelper_exec possible to set namespaces
Currently, call_usermodehelper_work() cannot set namespaces for the executed program. This patch adds that capability to call_usermodehelper_work(). The init_intermediate callback is introduced for initialization work which should be done before fork(), so that we get a method to set namespaces for children. The cleanup_intermediate callback is introduced for cleaning up what was done in init_intermediate, such as switching back the namespace. This is helpful for coredump to run the pipe program in a specific container environment. Signed-off-by: Cao Shufeng --- fs/coredump.c | 3 ++- include/linux/umh.h | 5 + init/do_mounts_initrd.c | 3 ++- kernel/kmod.c | 3 ++- kernel/umh.c| 51 ++--- security/keys/request_key.c | 4 ++-- 6 files changed, 56 insertions(+), 13 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 52c63d6..84c2b8a 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -647,7 +647,8 @@ void do_coredump(const siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); + NULL, NULL, umh_pipe_setup, + NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/include/linux/umh.h b/include/linux/umh.h index 244aff6..832ff5d 100644 --- a/include/linux/umh.h +++ b/include/linux/umh.h @@ -24,6 +24,9 @@ struct subprocess_info { char **envp; int wait; int retval; + bool cleaned; + void (*init_intermediate)(struct subprocess_info *info); + void (*cleanup_intermediate)(struct subprocess_info *info); int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; @@ -35,6 +38,8 @@ call_usermodehelper(const char *path, char **argv, char **envp, int wait); extern struct subprocess_info * call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, + void (*init_intermediate)(struct subprocess_info *info), + void (*cleanup_intermediate)(struct subprocess_info *info), int 
(*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 53d4f0f..8bb34c0 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -73,7 +73,8 @@ static void __init handle_initrd(void) current->flags |= PF_FREEZER_SKIP; info = call_usermodehelper_setup("/linuxrc", argv, envp_init, -GFP_KERNEL, init_linuxrc, NULL, NULL); +GFP_KERNEL, NULL, NULL, init_linuxrc, +NULL, NULL); if (!info) return; call_usermodehelper_exec(info, UMH_WAIT_PROC); diff --git a/kernel/kmod.c b/kernel/kmod.c index bc6addd..41df494 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -92,7 +92,8 @@ static int call_modprobe(char *module_name, int wait) argv[4] = NULL; info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, -NULL, free_modprobe_argv, NULL); +NULL, NULL, NULL, free_modprobe_argv, +NULL); if (!info) goto free_module_name; diff --git a/kernel/umh.c b/kernel/umh.c index 6ff9905..97e9bd8 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -53,8 +54,15 @@ static void umh_complete(struct subprocess_info *sub_info) */ if (comp) complete(comp); - else + else { + for(;;) { + if (sub_info->cleaned == false) + udelay(20); + else + break; + } call_usermodehelper_freeinfo(sub_info); + } } /* @@ -120,6 +128,9 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) /* If SIGCLD is ignored sys_wait4 won't populate the status. */ kernel_sigaction(SIGCHLD, SIG_DFL); + if(sub_info->cleanup_intermediate) { + sub_info->cleanup_intermediate(sub_info); + } pid = kernel_thread(call_usermodehelper_exec_async, sub_
[PATCH_v4.1 3/3] Make core_pattern support namespace
Currently, every container shares one copy of the coredump setting with the host system; if the host changes the setting, every running container is affected. The same happens when a container changes core_pattern: both the host and the other containers are affected. For containers based on the namespace design, it is good to allow each container to keep its own coredump setting. This brings the following benefits: 1: Each container can change its own coredump setting by writing to /proc/sys/kernel/core_pattern 2: Coredump setting changes in the host will not affect running containers. 3: Support for both "putting coredump in guest" and "putting coredump in host". Each namespace-based software (lxc, docker, ..) can use this feature to customize its dump setting. This feature also makes each container work as a separate system, which fits the design goal of namespaces. Test(in lxc): # In the host # # echo host_core >/proc/sys/kernel/core_pattern # cat /proc/sys/kernel/core_pattern host_core # ulimit -c 1024000 # ./make_dump Segmentation fault (core dumped) # ls -l -rw--- 1 root root 331776 Feb 4 18:02 host_core.2175 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # # In the container # # cat /proc/sys/kernel/core_pattern host_core # echo container_core >/proc/sys/kernel/core_pattern # ./make_dump Segmentation fault (core dumped) # ls -l -rwxr-xr-x1 root root 759731 Feb 4 10:45 make_dump -rw---1 root root 331776 Feb 4 10:45 container_core.16 # # Return to host # # cat /proc/sys/kernel/core_pattern host_core # ls host_core.2175 make_dump make_dump.c # rm -f host_core.2175 # ./make_dump Segmentation fault (core dumped) # ls -l -rw--- 1 root root 331776 Feb 4 18:49 host_core.2351 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # --- fs/coredump.c | 25 -- include/linux/pid_namespace.h | 3 +++ kernel/pid.c | 2 ++ kernel/pid_namespace.c| 2 ++ kernel/sysctl.c | 50 ++- 5 files changed, 70 insertions(+), 12 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 
41448bd..cf08c65 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -53,7 +53,6 @@ int core_uses_pid; unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -61,8 +60,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -187,10 +184,10 @@ static int cn_print_exe_file(struct core_name *cn) * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(struct core_name *cn, struct coredump_params *cprm) +static int format_corename(struct core_name *cn, const char *pat_ptr, + struct coredump_params *cprm) { const struct cred *cred = current_cred(); - const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); int pid_in_pattern = 0; int err = 0; @@ -669,6 +666,8 @@ void do_coredump(const siginfo_t *siginfo) */ .mm_flags = mm->flags, }; + struct pid_namespace *pid_ns; + char core_pattern[CORENAME_MAX_SIZE]; audit_core_dumps(siginfo->si_signo); @@ -678,6 +677,18 @@ void do_coredump(const siginfo_t *siginfo) if (!__get_dumpable(cprm.mm_flags)) goto fail; + pid_ns = task_active_pid_ns(current); + spin_lock(&pid_ns->core_pattern_lock); + while (pid_ns != &init_pid_ns) { + if (pid_ns->core_pattern[0]) + break; + spin_unlock(&pid_ns->core_pattern_lock); + pid_ns = pid_ns->parent, + spin_lock(&pid_ns->core_pattern_lock); + } + strcpy(core_pattern, pid_ns->core_pattern); + spin_unlock(&pid_ns->core_pattern_lock); + cred = prepare_creds(); if (!cred) goto fail; @@ -699,7 +710,7 @@ void do_coredump(const siginfo_t *siginfo) old_cred = override_creds(cred); - ispipe = format_corename(&cn, &cprm); + ispipe = format_corename(&cn, core_pattern, &cprm); if (ispipe) { int dump_count; @@ -746,7 +757,7 @@ void 
do_coredump(const siginfo_t *siginfo) } rcu_read_lock(); - vinit_task = find_task_by_vpid(1); + vinit_task = find_task_by_pid_ns(1, pid_ns); rcu_read_unlock(); if (!vinit_task) { printk(KERN_WARNING "failed getting init task in
[PATCH_v4.1 2/3] Limit dump_pipe program's permission to init for container
Currently, when core_pattern is set to a pipe, the pipe program is forked by a kthread running with root's permissions, and it writes the dump file into the host's filesystem. The same thing happens for a container: the dumper and the dump file are also in the host (not in the container). This has the following problems: 1: Not consistent with file_type core_pattern When we set core_pattern to a file, the container writes the dump into the container's filesystem instead of the host's. 2: Not safe for privileged container In a privileged container, a user can destroy the host system with the following commands: # # In a container # echo "|/bin/dd of=/boot/vmlinuz" >/proc/sys/kernel/core_pattern # make_dump This patch switches the dumper program's environment to that of the init task, so that, for a container, the dumper program has the same environment as the container's init task; the dumper program is therefore looked up in the container's filesystem and writes the coredump into the container's filesystem. The dumper's permissions are also limited to a subset of those of the container's init process. Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Cao ShuFeng --- fs/coredump.c | 126 +++- include/linux/binfmts.h | 2 + 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 84c2b8a..41448bd 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -508,6 +508,45 @@ static void wait_for_dump_helpers(struct file *file) } /* + * umh_ns_setup + * set the namespaces to the base task of a container. + * we need to switch back to the original namespaces + * so that the thread of workqueue is not influenced. + * + * this method runs in workqueue kernel thread. 
+ */ +static void umh_ns_setup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task = cp->base_task; + + if (base_task) { + cp->current_task_nsproxy = current->nsproxy; + //prevent current namespace from being freed + get_nsproxy(current->nsproxy); + /* Set namespaces to base_task */ + get_nsproxy(base_task->nsproxy); + switch_task_namespaces(current, base_task->nsproxy); + } +} + +/* + * umh_ns_cleanup + * cleanup what we have done in umh_ns_setup. + * + * this method runs in workqueue kernel thread. + */ +static void umh_ns_cleanup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct nsproxy *current_task_nsproxy = cp->current_task_nsproxy; + if (current_task_nsproxy) { + /* switch workqueue's original namespace back */ + switch_task_namespaces(current, current_task_nsproxy); + } +} + +/* * umh_pipe_setup * helper function to customize the process used * to collect the core in userspace. 
Specifically @@ -522,6 +561,8 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task; + int err = create_pipe_files(files, 0); if (err) return err; @@ -530,10 +571,76 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) err = replace_fd(0, files[0], 0); fput(files[0]); + if (err) + return err; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return err; + base_task = cp->base_task; + if (base_task) { + const struct cred *base_cred; + + /* Set fs_root to base_task */ + spin_lock(&base_task->fs->lock); + set_fs_root(current->fs, &base_task->fs->root); + set_fs_pwd(current->fs, &base_task->fs->pwd); + spin_unlock(&base_task->fs->lock); + + /* Set cgroup to base_task */ + current->flags &= ~PF_NO_SETAFFINITY; + err = cgroup_attach_task_all(base_task, current); + if (err < 0) + return err; + + /* Set cred to base_task */ + base_cred = get_task_cred(base_task); + + new->uid = base_cred->uid; + new->gid = base_cred->gid; + new->suid = base_cred->suid; + new->sgid = base_cred->sgid; + new->euid = base_cred->euid; + new->egid = base_cred->egid; + new->fsuid = base_cred->fsuid; + new->fsgid = base_cred->fsgid; + + new->securebits = base_cred->securebits; + + new->cap_inheritabl
[PATCH_v4.1 0/3] Make core_pattern support namespace
This patchset includes the following function points: 1: Make it possible for the usermodehelper function to set the pid namespace done by: [PATCH_v4.1 1/3] Make call_usermodehelper_exec possible to set namespaces 2: Let pipe_type core_pattern write the dump into the container's rootfs done by: [PATCH_v4.1 2/3] Limit dump_pipe program's permission to init for container 3: Provide a separate core_pattern setting for each container done by: [PATCH_v4.1 3/3] Make core_pattern support namespace 4: Compatibility with the current system also included in: [PATCH_v4.1 3/3] Make core_pattern support namespace If a container hasn't changed its core_pattern setting, it keeps the same setting as the host. Changelog v3.1-v4: 1. remove extra fork pointed out by: Andrei Vagin 2: Rebase on top of v4.9-rc8. 3: Rebase on top of v4.12. 4: Rebase on top of v4.14. Changelog v3-v3.1: 1. Switch "pwd" of pipe program to container's root fs. 2. Rebase on top of v4.9-rc1. Changelog v2->v3: 1: Fix problem of setting pid namespace, pointed out by: Andrei Vagin Changelog v1(RFC)->v2: 1: Add [PATCH 2/2] which was todo in [RFC v1]. 2: Pass a test script for each function. 3: Rebase on top of v4.7. Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Cao Shufeng Cao Shufeng (3): Make call_usermodehelper_exec possible to set namespaces Limit dump_pipe program's permission to init for container Make core_pattern support namespace fs/coredump.c | 150 +++--- include/linux/binfmts.h | 2 + include/linux/pid_namespace.h | 3 + include/linux/umh.h | 5 ++ init/do_mounts_initrd.c | 3 +- kernel/kmod.c | 3 +- kernel/pid.c | 2 + kernel/pid_namespace.c| 2 + kernel/sysctl.c | 50 -- kernel/umh.c | 51 +++--- security/keys/request_key.c | 4 +- 11 files changed, 250 insertions(+), 25 deletions(-) -- 2.1.0
[PATCH 0/3] Make core_pattern support namespace
This patchset includes the following function points: 1: Make it possible for the usermodehelper function to set the pid namespace done by: [PATCH_v4.1_1/3] Make call_usermodehelper_exec possible to set namespaces 2: Let pipe_type core_pattern write the dump into the container's rootfs done by: [PATCH_v4.1_2/3] Limit dump_pipe program's permission to init for container 3: Provide a separate core_pattern setting for each container done by: [PATCH_v4.1_3/3] Make core_pattern support namespace 4: Compatibility with the current system also included in: [PATCH_v4.1_3/3] Make core_pattern support namespace If a container hasn't changed its core_pattern setting, it keeps the same setting as the host. Test: 1: Pass a test script for each function of this patchset ## TEST IN HOST ## [root@kerneldev dumptest]# ./test_host Set file core_pattern: OK ./test_host: line 41: 2366 Segmentation fault (core dumped) "$SCRIPT_BASE_DIR"/make_dump Checking dumpfile: OK Set file core_pattern: OK ./test_host: line 41: 2369 Segmentation fault (core dumped) "$SCRIPT_BASE_DIR"/make_dump Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking capabilities: OK ## TEST IN GUEST ## # ./test Segmentation fault (core dumped) Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking cg pids: OK Checking capabilities: OK [ 64.940734] make_dump[2432]: segfault at 0 ip 0040049d sp 00007ffc4af025f0 error 6 in make_dump[40+a6000] # 2: Pass other tests (which are not easy to script) by hand. Changelog v3.1-v4: 1. remove extra fork pointed out by: Andrei Vagin 2: Rebase on top of v4.9-rc8. 3: Rebase on top of v4.12. Changelog v3-v3.1: 1. Switch "pwd" of pipe program to container's root fs. 2. Rebase on top of v4.9-rc1. Changelog v2->v3: 1: Fix problem of setting pid namespace, pointed out by: Andrei Vagin Changelog v1(RFC)->v2: 1: Add [PATCH 2/2] which was todo in [RFC v1]. 
2: Pass a test script for each function. 3: Rebase on top of v4.7. Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Cao Shufeng Cao Shufeng (3): Make call_usermodehelper_exec possible to set namespaces Limit dump_pipe program's permission to init for container Make core_pattern support namespace fs/coredump.c | 150 +++--- include/linux/binfmts.h | 2 + include/linux/kmod.h | 5 ++ include/linux/pid_namespace.h | 3 + init/do_mounts_initrd.c | 3 +- kernel/kmod.c | 56 +--- kernel/pid.c | 2 + kernel/pid_namespace.c| 2 + kernel/sysctl.c | 50 -- lib/kobject_uevent.c | 3 +- security/keys/request_key.c | 4 +- 11 files changed, 253 insertions(+), 27 deletions(-) -- 2.9.3
[PATCH_v4.1_1/3] Make call_usermodehelper_exec possible to set namespaces
Currently, call_usermodehelper_work() cannot set namespaces for the executed program. This patch adds that capability to call_usermodehelper_work(). The init_intermediate callback is introduced for initialization work which should be done before fork(), so that we get a method to set namespaces for children. The cleanup_intermediate callback is introduced for cleaning up what was done in init_intermediate, such as switching back the namespace. This is helpful for coredump to run the pipe program in a specific container environment. Signed-off-by: Cao Shufeng --- fs/coredump.c | 3 ++- include/linux/kmod.h| 5 init/do_mounts_initrd.c | 3 ++- kernel/kmod.c | 56 + lib/kobject_uevent.c| 3 ++- security/keys/request_key.c | 4 ++-- 6 files changed, 59 insertions(+), 15 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 5926837..802f434 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -646,7 +646,8 @@ void do_coredump(const siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); + NULL, NULL, umh_pipe_setup, + NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/include/linux/kmod.h b/include/linux/kmod.h index c4e441e..bb4e1a6 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -61,6 +61,9 @@ struct subprocess_info { char **envp; int wait; int retval; + bool cleaned; + void (*init_intermediate)(struct subprocess_info *info); + void (*cleanup_intermediate)(struct subprocess_info *info); int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; @@ -72,6 +75,8 @@ call_usermodehelper(const char *path, char **argv, char **envp, int wait); extern struct subprocess_info * call_usermodehelper_setup(const char *path, char **argv, char **envp, gfp_t gfp_mask, + void (*init_intermediate)(struct subprocess_info *info), + void (*cleanup_intermediate)(struct subprocess_info *info), int 
(*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index a1000ca..59d11c9 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -72,7 +72,8 @@ static void __init handle_initrd(void) current->flags |= PF_FREEZER_SKIP; info = call_usermodehelper_setup("/linuxrc", argv, envp_init, -GFP_KERNEL, init_linuxrc, NULL, NULL); +GFP_KERNEL, NULL, NULL, init_linuxrc, +NULL, NULL); if (!info) return; call_usermodehelper_exec(info, UMH_WAIT_PROC); diff --git a/kernel/kmod.c b/kernel/kmod.c index 563f97e..f75725b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -93,7 +94,8 @@ static int call_modprobe(char *module_name, int wait) argv[4] = NULL; info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, -NULL, free_modprobe_argv, NULL); +NULL, NULL, NULL, free_modprobe_argv, +NULL); if (!info) goto free_module_name; @@ -207,8 +209,15 @@ static void umh_complete(struct subprocess_info *sub_info) */ if (comp) complete(comp); - else + else { + for(;;) { + if (sub_info->cleaned == false) + udelay(20); + else + break; + } call_usermodehelper_freeinfo(sub_info); + } } /* @@ -302,7 +311,10 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); - + if(sub_info->cleanup_intermediate) { + sub_info->cleanup_intermediate(sub_info); + } + sub_info->cleaned = true; umh_complete(sub_info); } @@ -324,6 +336,9 @@ static void call_usermodehelper_exec_work(struct work_struct *work) {
[PATCH_v4.1_2/3] Limit dump_pipe program's permission to init for container
Currently, when core_pattern is set to a pipe, the pipe program is forked by a kthread running with root's permissions, and it writes the dump file into the host's filesystem. The same thing happens for a container: the dumper and the dump file are also in the host (not in the container). This has the following problems: 1: Not consistent with file_type core_pattern When we set core_pattern to a file, the container writes the dump into the container's filesystem instead of the host's. 2: Not safe for privileged container In a privileged container, a user can destroy the host system with the following commands: # # In a container # echo "|/bin/dd of=/boot/vmlinuz" >/proc/sys/kernel/core_pattern # make_dump This patch switches the dumper program's environment to that of the init task, so that, for a container, the dumper program has the same environment as the container's init task; the dumper program is therefore looked up in the container's filesystem and writes the coredump into the container's filesystem. The dumper's permissions are also limited to a subset of those of the container's init process. Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Cao ShuFeng --- fs/coredump.c | 126 +++- include/linux/binfmts.h | 2 + 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 802f434..745c757 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -507,6 +507,45 @@ static void wait_for_dump_helpers(struct file *file) } /* + * umh_ns_setup + * set the namespaces to the base task of a container. + * we need to switch back to the original namespaces + * so that the thread of workqueue is not influenced. + * + * this method runs in workqueue kernel thread. 
+ */ +static void umh_ns_setup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task = cp->base_task; + + if (base_task) { + cp->current_task_nsproxy = current->nsproxy; + //prevent current namespace from being freed + get_nsproxy(current->nsproxy); + /* Set namespaces to base_task */ + get_nsproxy(base_task->nsproxy); + switch_task_namespaces(current, base_task->nsproxy); + } +} + +/* + * umh_ns_cleanup + * cleanup what we have done in umh_ns_setup. + * + * this method runs in workqueue kernel thread. + */ +static void umh_ns_cleanup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct nsproxy *current_task_nsproxy = cp->current_task_nsproxy; + if (current_task_nsproxy) { + /* switch workqueue's original namespace back */ + switch_task_namespaces(current, current_task_nsproxy); + } +} + +/* * umh_pipe_setup * helper function to customize the process used * to collect the core in userspace. 
Specifically @@ -521,6 +560,8 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task; + int err = create_pipe_files(files, 0); if (err) return err; @@ -529,10 +570,76 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) err = replace_fd(0, files[0], 0); fput(files[0]); + if (err) + return err; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return err; + base_task = cp->base_task; + if (base_task) { + const struct cred *base_cred; + + /* Set fs_root to base_task */ + spin_lock(&base_task->fs->lock); + set_fs_root(current->fs, &base_task->fs->root); + set_fs_pwd(current->fs, &base_task->fs->pwd); + spin_unlock(&base_task->fs->lock); + + /* Set cgroup to base_task */ + current->flags &= ~PF_NO_SETAFFINITY; + err = cgroup_attach_task_all(base_task, current); + if (err < 0) + return err; + + /* Set cred to base_task */ + base_cred = get_task_cred(base_task); + + new->uid = base_cred->uid; + new->gid = base_cred->gid; + new->suid = base_cred->suid; + new->sgid = base_cred->sgid; + new->euid = base_cred->euid; + new->egid = base_cred->egid; + new->fsuid = base_cred->fsuid; + new->fsgid = base_cred->fsgid; + + new->securebits = base_cred->securebits; + + new->cap_inheritabl
[PATCH_v4.1_3/3] Make core_pattern support namespace
Currently, every container shares one copy of the coredump setting with the host system; if the host changes the setting, every running container is affected. The same happens when a container changes core_pattern: both the host and the other containers are affected. For containers based on the namespace design, it is good to allow each container to keep its own coredump setting. This brings the following benefits: 1: Each container can change its own coredump setting by writing to /proc/sys/kernel/core_pattern 2: Coredump setting changes in the host will not affect running containers. 3: Support for both "putting coredump in guest" and "putting coredump in host". Each namespace-based software (lxc, docker, ..) can use this feature to customize its dump setting. This feature also makes each container work as a separate system, which fits the design goal of namespaces. Test(in lxc): # In the host # # echo host_core >/proc/sys/kernel/core_pattern # cat /proc/sys/kernel/core_pattern host_core # ulimit -c 1024000 # ./make_dump Segmentation fault (core dumped) # ls -l -rw--- 1 root root 331776 Feb 4 18:02 host_core.2175 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # # In the container # # cat /proc/sys/kernel/core_pattern host_core # echo container_core >/proc/sys/kernel/core_pattern # ./make_dump Segmentation fault (core dumped) # ls -l -rwxr-xr-x1 root root 759731 Feb 4 10:45 make_dump -rw---1 root root 331776 Feb 4 10:45 container_core.16 # # Return to host # # cat /proc/sys/kernel/core_pattern host_core # ls host_core.2175 make_dump make_dump.c # rm -f host_core.2175 # ./make_dump Segmentation fault (core dumped) # ls -l -rw--- 1 root root 331776 Feb 4 18:49 host_core.2351 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # --- fs/coredump.c | 25 -- include/linux/pid_namespace.h | 3 +++ kernel/pid.c | 2 ++ kernel/pid_namespace.c| 2 ++ kernel/sysctl.c | 50 ++- 5 files changed, 70 insertions(+), 12 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 
745c757..b0ab533 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -52,7 +52,6 @@ int core_uses_pid; unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -60,8 +59,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -186,10 +183,10 @@ static int cn_print_exe_file(struct core_name *cn) * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(struct core_name *cn, struct coredump_params *cprm) +static int format_corename(struct core_name *cn, const char *pat_ptr, + struct coredump_params *cprm) { const struct cred *cred = current_cred(); - const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); int pid_in_pattern = 0; int err = 0; @@ -668,6 +665,8 @@ void do_coredump(const siginfo_t *siginfo) */ .mm_flags = mm->flags, }; + struct pid_namespace *pid_ns; + char core_pattern[CORENAME_MAX_SIZE]; audit_core_dumps(siginfo->si_signo); @@ -677,6 +676,18 @@ void do_coredump(const siginfo_t *siginfo) if (!__get_dumpable(cprm.mm_flags)) goto fail; + pid_ns = task_active_pid_ns(current); + spin_lock(&pid_ns->core_pattern_lock); + while (pid_ns != &init_pid_ns) { + if (pid_ns->core_pattern[0]) + break; + spin_unlock(&pid_ns->core_pattern_lock); + pid_ns = pid_ns->parent, + spin_lock(&pid_ns->core_pattern_lock); + } + strcpy(core_pattern, pid_ns->core_pattern); + spin_unlock(&pid_ns->core_pattern_lock); + cred = prepare_creds(); if (!cred) goto fail; @@ -698,7 +709,7 @@ void do_coredump(const siginfo_t *siginfo) old_cred = override_creds(cred); - ispipe = format_corename(&cn, &cprm); + ispipe = format_corename(&cn, core_pattern, &cprm); if (ispipe) { int dump_count; @@ -745,7 +756,7 @@ void 
do_coredump(const siginfo_t *siginfo) } rcu_read_lock(); - vinit_task = find_task_by_vpid(1); + vinit_task = find_task_by_pid_ns(1, pid_ns); rcu_read_unlock(); if (!vinit_task) { printk(KERN_WARNING "failed getting init task in
Re: [PATCH_v4.1_0_3] Make core_pattern support namespace
ping 在 2017-02-08三的 11:00 +0800,Cao Shufeng写道: > This patchset includes following function points: > 1: Let usermodehelper function possible to set pid namespace >done by: [PATCH v4 1/3] Make call_usermodehelper_exec possible >to set pid namespace. > 2: Let pipe_type core_pattern write dump into container's rootfs >done by: [PATCH v4 2/3] Limit dump_pipe program's permission to >init for container. > 2: Make separate core_pattern setting for each container >done by: [PATCH v4 3/3] Make core_pattern support namespace > 3: Compatibility with current system >also included in: [PATCH v4 3/3] Make core_pattern support namespace >If container hadn't change core_pattern setting, it will keep >same setting with host. > > Test: > 1: Pass a test script for each function of this patchset >## TEST IN HOST ## >[root@kerneldev dumptest]# ./test_host >Set file core_pattern: OK >./test_host: line 41: 2366 Segmentation fault (core dumped) "$SCRI= > PT_BASE_DIR"/make_dump >Checking dumpfile: OK >Set file core_pattern: OK >./test_host: line 41: 2369 Segmentation fault (core dumped) "$SCRI= > PT_BASE_DIR"/make_dump >Checking dump_pipe triggered: OK >Checking rootfs: OK >Checking dumpfile: OK >Checking namespace: OK >Checking process list: OK >Checking capabilities: OK > >## TEST IN GUEST ## ># ./test >Segmentation fault (core dumped) >Checking dump_pipe triggered: OK >Checking rootfs: OK >Checking dumpfile: OK >Checking namespace: OK >Checking process list: OK >Checking cg pids: OK >Checking capabilities: OK >[ 64.940734] make_dump[2432]: segfault at 0 ip 0040049d sp 000= > 07ffc4af025f0 error 6 in make_dump[40+a6000] ># > 2: Pass other test(which is not easy to do in script) by hand. > > Changelog v4-v4.1: > 1. Fix kernel panic pointed out by: >xiaolong...@intel.com > > Changelog v3.1-v4: > 1. remove extra fork pointed out by: >Andrei Vagin > > Changelog v3-v3.1: > 1. Switch "pwd" of pipe program to container's root fs. > 2. 
Rebase on top of v4.9-rc1 > > Changelog v2->v3: > 1: Fix problem of setting pid namespace, pointed out by: >Andrei Vagin > > Changelog v1(RFC)->v2: > 1: Add [PATCH 2/2] which was todo in [RFC v1]. > 2: Pass a test script for each function. > 3: Rebase on top of v4.7. > > Suggested-by: Eric W. Biederman > Suggested-by: KOSAKI Motohiro > Signed-off-by: Zhao Lei > Signed-off-by: Cao Shufeng > > Cao Shufeng (2): > Make call_usermodehelper_exec possible to set namespaces > Limit dump_pipe program's permission to init for container > > Zhao Lei (1): > Make core_pattern support namespace > > fs/coredump.c | 150 > +++--- > include/linux/binfmts.h | 2 + > include/linux/kmod.h | 5 ++ > include/linux/pid_namespace.h | 3 + > init/do_mounts_initrd.c | 3 +- > kernel/kmod.c | 55 +--- > kernel/pid.c | 2 + > kernel/pid_namespace.c| 2 + > kernel/sysctl.c | 50 -- > lib/kobject_uevent.c | 3 +- > security/keys/request_key.c | 4 +- > 11 files changed, 253 insertions(+), 26 deletions(-) > -- Best Regards, Cao Shufeng -- Cao Shufeng Development Dept.I Nanjing Fujitsu Nanda Software Tech. Co., Ltd.(FNST) No.6 Wenzhu Road, Nanjing, 210012, China TEL: +86+25-86630566-8552 FUJITSU INTERNAL: 7998-8552 EMail: caosf.f...@cn.fujitsu.com
[PATCH_v4.1_3_3] Make core_pattern support namespace
From: Zhao Lei Currently, each container shares one copy of the coredump setting with the host system; if the host system changes the setting, every running container is affected. The same happens when a container changes core_pattern: both the host and the other containers are affected. For containers based on the namespace design, it is good to allow each container to keep its own coredump setting. This brings the following benefits: 1: Each container can change its own coredump setting by operating on /proc/sys/kernel/core_pattern 2: A coredump setting changed in the host will not affect running containers. 3: Both cases are supported: "putting coredump in guest" and "putting coredump in host". Each namespace-based software (lxc, docker, ..) can use this function to customize its dump setting. This function also makes each container work as a separate system, which fits the design goal of namespaces. Test(in lxc): # In the host # # echo host_core >/proc/sys/kernel/core_pattern # cat /proc/sys/kernel/core_pattern host_core # ulimit -c 1024000 # ./make_dump Segmentation fault (core dumped) # ls -l -rw------- 1 root root 331776 Feb 4 18:02 host_core.2175 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # # In the container # # cat /proc/sys/kernel/core_pattern host_core # echo container_core >/proc/sys/kernel/core_pattern # ./make_dump Segmentation fault (core dumped) # ls -l -rwxr-xr-x 1 root root 759731 Feb 4 10:45 make_dump -rw------- 1 root root 331776 Feb 4 10:45 container_core.16 # # Return to host # # cat /proc/sys/kernel/core_pattern host_core # ls host_core.2175 make_dump make_dump.c # rm -f host_core.2175 # ./make_dump Segmentation fault (core dumped) # ls -l -rw------- 1 root root 331776 Feb 4 18:49 host_core.2351 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # Signed-off-by: Zhao Lei --- fs/coredump.c | 25 -- include/linux/pid_namespace.h | 3 +++ kernel/pid.c | 2 ++ kernel/pid_namespace.c| 2 ++ kernel/sysctl.c | 50 ++- 5 files changed, 70 insertions(+), 12 deletions(-) diff
--git a/fs/coredump.c b/fs/coredump.c index 83282d7..4bab7bf 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -50,7 +50,6 @@ int core_uses_pid; unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -58,8 +57,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -184,10 +181,10 @@ static int cn_print_exe_file(struct core_name *cn) * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(struct core_name *cn, struct coredump_params *cprm) +static int format_corename(struct core_name *cn, const char *pat_ptr, + struct coredump_params *cprm) { const struct cred *cred = current_cred(); - const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); int pid_in_pattern = 0; int err = 0; @@ -666,6 +663,8 @@ void do_coredump(const siginfo_t *siginfo) */ .mm_flags = mm->flags, }; + struct pid_namespace *pid_ns; + char core_pattern[CORENAME_MAX_SIZE]; audit_core_dumps(siginfo->si_signo); @@ -675,6 +674,18 @@ void do_coredump(const siginfo_t *siginfo) if (!__get_dumpable(cprm.mm_flags)) goto fail; + pid_ns = task_active_pid_ns(current); + spin_lock(&pid_ns->core_pattern_lock); + while (pid_ns != &init_pid_ns) { + if (pid_ns->core_pattern[0]) + break; + spin_unlock(&pid_ns->core_pattern_lock); + pid_ns = pid_ns->parent, + spin_lock(&pid_ns->core_pattern_lock); + } + strcpy(core_pattern, pid_ns->core_pattern); + spin_unlock(&pid_ns->core_pattern_lock); + cred = prepare_creds(); if (!cred) goto fail; @@ -696,7 +707,7 @@ void do_coredump(const siginfo_t *siginfo) old_cred = override_creds(cred); - ispipe = format_corename(&cn, &cprm); + ispipe = format_corename(&cn, core_pattern, &cprm); if (ispipe) { int 
dump_count; @@ -743,7 +754,7 @@ void do_coredump(const siginfo_t *siginfo) } rcu_read_lock(); - vinit_task = find_task_by_vpid(1); + vinit_task = find_task_by_pid_ns(1, pid_ns); rcu_read_unlock(); if (!vinit_task) { print
[PATCH_v4.1_0_3] Make core_pattern support namespace
This patchset includes the following function points: 1: Make it possible for the usermodehelper function to set the pid namespace done by: [PATCH v4 1/3] Make call_usermodehelper_exec possible to set pid namespace. 2: Let pipe_type core_pattern write the dump into the container's rootfs done by: [PATCH v4 2/3] Limit dump_pipe program's permission to init for container. 3: Make a separate core_pattern setting for each container done by: [PATCH v4 3/3] Make core_pattern support namespace 4: Compatibility with the current system also included in: [PATCH v4 3/3] Make core_pattern support namespace If a container hasn't changed its core_pattern setting, it keeps the same setting as the host. Test: 1: Pass a test script for each function of this patchset ## TEST IN HOST ## [root@kerneldev dumptest]# ./test_host Set file core_pattern: OK ./test_host: line 41: 2366 Segmentation fault (core dumped) "$SCRIPT_BASE_DIR"/make_dump Checking dumpfile: OK Set file core_pattern: OK ./test_host: line 41: 2369 Segmentation fault (core dumped) "$SCRIPT_BASE_DIR"/make_dump Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking capabilities: OK ## TEST IN GUEST ## # ./test Segmentation fault (core dumped) Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking cg pids: OK Checking capabilities: OK [ 64.940734] make_dump[2432]: segfault at 0 ip 0040049d sp 00007ffc4af025f0 error 6 in make_dump[40+a6000] # 2: Pass other tests (which are not easy to do in a script) by hand. Changelog v4-v4.1: 1. Fix kernel panic pointed out by: xiaolong...@intel.com Changelog v3.1-v4: 1. Remove extra fork pointed out by: Andrei Vagin Changelog v3-v3.1: 1. Switch "pwd" of pipe program to container's root fs. 2. Rebase on top of v4.9-rc1 Changelog v2->v3: 1: Fix problem of setting pid namespace, pointed out by: Andrei Vagin Changelog v1(RFC)->v2: 1: Add [PATCH 2/2] which was todo in [RFC v1].
2: Pass a test script for each function. 3: Rebase on top of v4.7. Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Zhao Lei Signed-off-by: Cao Shufeng Cao Shufeng (2): Make call_usermodehelper_exec possible to set namespaces Limit dump_pipe program's permission to init for container Zhao Lei (1): Make core_pattern support namespace fs/coredump.c | 150 +++--- include/linux/binfmts.h | 2 + include/linux/kmod.h | 5 ++ include/linux/pid_namespace.h | 3 + init/do_mounts_initrd.c | 3 +- kernel/kmod.c | 55 +--- kernel/pid.c | 2 + kernel/pid_namespace.c| 2 + kernel/sysctl.c | 50 -- lib/kobject_uevent.c | 3 +- security/keys/request_key.c | 4 +- 11 files changed, 253 insertions(+), 26 deletions(-) -- 2.9.3
[PATCH_v4.1_1_3] Make call_usermodehelper_exec possible to set namespaces
Currently, call_usermodehelper_exec_work() cannot set namespaces for the executed program. This patch adds that ability to call_usermodehelper_exec_work(). The init_intermediate callback is introduced for initialization work that must be done before fork(), which gives us a way to set namespaces for children. The cleanup_intermediate callback is introduced for cleaning up what was done in init_intermediate, such as switching back the namespace. This is helpful for coredump to run the pipe program in a specific container environment. Signed-off-by: Cao Shufeng Co-author-by: Zhao Lei --- fs/coredump.c | 3 ++- include/linux/kmod.h| 5 + init/do_mounts_initrd.c | 3 ++- kernel/kmod.c | 55 + lib/kobject_uevent.c| 3 ++- security/keys/request_key.c | 4 ++-- 6 files changed, 59 insertions(+), 14 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index eb9c92c..9abf4e5 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -644,7 +644,8 @@ void do_coredump(const siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); + NULL, NULL, umh_pipe_setup, + NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/include/linux/kmod.h b/include/linux/kmod.h index fcfd2bf..0e474d4 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -61,6 +61,9 @@ struct subprocess_info { char **envp; int wait; int retval; + bool cleaned; + void (*init_intermediate)(struct subprocess_info *info); + void (*cleanup_intermediate)(struct subprocess_info *info); int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; @@ -71,6 +74,8 @@ call_usermodehelper(char *path, char **argv, char **envp, int wait); extern struct subprocess_info * call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask, + void (*init_intermediate)(struct subprocess_info *info), + void (*cleanup_intermediate)(struct
subprocess_info *info), int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index a1000ca..59d11c9 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -72,7 +72,8 @@ static void __init handle_initrd(void) current->flags |= PF_FREEZER_SKIP; info = call_usermodehelper_setup("/linuxrc", argv, envp_init, -GFP_KERNEL, init_linuxrc, NULL, NULL); +GFP_KERNEL, NULL, NULL, init_linuxrc, +NULL, NULL); if (!info) return; call_usermodehelper_exec(info, UMH_WAIT_PROC); diff --git a/kernel/kmod.c b/kernel/kmod.c index 0277d12..dcaa17d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -91,7 +92,8 @@ static int call_modprobe(char *module_name, int wait) argv[4] = NULL; info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, -NULL, free_modprobe_argv, NULL); +NULL, NULL, NULL, free_modprobe_argv, + NULL); if (!info) goto free_module_name; @@ -205,8 +207,15 @@ static void umh_complete(struct subprocess_info *sub_info) */ if (comp) complete(comp); - else + else { + for(;;) { + if (sub_info->cleaned == false) + udelay(20); + else + break; + } call_usermodehelper_freeinfo(sub_info); + } } /* @@ -301,6 +310,10 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); + if(sub_info->cleanup_intermediate) { + sub_info->cleanup_intermediate(sub_info); + } + sub_info->cleaned = true; umh_complete(sub_info); } @@ -322,6 +335,9 @@ static void call_usermodehelper_exec_work(struct work_struct *work) {
[PATCH_v4.1_2_3] Limit dump_pipe program's permission to init for container
Currently, when we set core_pattern to a pipe, the pipe program is forked by a kthread running with root's permissions, and writes the dumpfile into the host's filesystem. The same thing happens for a container: the dumper and the dumpfile are also in the host (not in the container). This has the following problems: 1: Not consistent with file_type core_pattern When we set core_pattern to a file, the container will write the dump into the container's filesystem instead of the host's. 2: Not safe for privileged containers In a privileged container, a user can destroy the host system with the following command: # # In a container # echo "|/bin/dd of=/boot/vmlinuz" >/proc/sys/kernel/core_pattern # make_dump This patch switches the dumper program's environment to that of the init task, so, for a container, the dumper program has the same environment as the init task in the container, which makes the dumper program live in the container's filesystem and write the coredump into the container's filesystem. The dumper's permissions are also limited to a subset of those of the container's init process. Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Cao ShuFeng --- fs/coredump.c | 126 +++- include/linux/binfmts.h | 2 + 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 9abf4e5..83282d7 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -505,6 +505,45 @@ static void wait_for_dump_helpers(struct file *file) } /* + * umh_ns_setup + * set the namespaces to the base task of a container. + * we need to switch back to the original namespaces + * so that the thread of workqueue is not influenced. + * + * this method runs in workqueue kernel thread.
+ */ +static void umh_ns_setup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task = cp->base_task; + + if (base_task) { + cp->current_task_nsproxy = current->nsproxy; + //prevent current namespace from being freed + get_nsproxy(current->nsproxy); + /* Set namespaces to base_task */ + get_nsproxy(base_task->nsproxy); + switch_task_namespaces(current, base_task->nsproxy); + } +} + +/* + * umh_ns_cleanup + * cleanup what we have done in umh_ns_setup. + * + * this method runs in workqueue kernel thread. + */ +static void umh_ns_cleanup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct nsproxy *current_task_nsproxy = cp->current_task_nsproxy; + if (current_task_nsproxy) { + /* switch workqueue's original namespace back */ + switch_task_namespaces(current, current_task_nsproxy); + } +} + +/* * umh_pipe_setup * helper function to customize the process used * to collect the core in userspace. 
Specifically @@ -519,6 +558,8 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task; + int err = create_pipe_files(files, 0); if (err) return err; @@ -527,10 +568,76 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) err = replace_fd(0, files[0], 0); fput(files[0]); + if (err) + return err; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return err; + base_task = cp->base_task; + if (base_task) { + const struct cred *base_cred; + + /* Set fs_root to base_task */ + spin_lock(&base_task->fs->lock); + set_fs_root(current->fs, &base_task->fs->root); + set_fs_pwd(current->fs, &base_task->fs->pwd); + spin_unlock(&base_task->fs->lock); + + /* Set cgroup to base_task */ + current->flags &= ~PF_NO_SETAFFINITY; + err = cgroup_attach_task_all(base_task, current); + if (err < 0) + return err; + + /* Set cred to base_task */ + base_cred = get_task_cred(base_task); + + new->uid = base_cred->uid; + new->gid = base_cred->gid; + new->suid = base_cred->suid; + new->sgid = base_cred->sgid; + new->euid = base_cred->euid; + new->egid = base_cred->egid; + new->fsuid = base_cred->fsuid; + new->fsgid = base_cred->fsgid; + + new->securebits = base_cred->securebits; + + new->cap_inheritabl
[PATCH 3/3] Make core_pattern support namespace
From: Zhao Lei Currently, each container shares one copy of the coredump setting with the host system; if the host system changes the setting, every running container is affected. The same happens when a container changes core_pattern: both the host and the other containers are affected. For containers based on the namespace design, it is good to allow each container to keep its own coredump setting. This brings the following benefits: 1: Each container can change its own coredump setting by operating on /proc/sys/kernel/core_pattern 2: A coredump setting changed in the host will not affect running containers. 3: Both cases are supported: "putting coredump in guest" and "putting coredump in host". Each namespace-based software (lxc, docker, ..) can use this function to customize its dump setting. This function also makes each container work as a separate system, which fits the design goal of namespaces. Test(in lxc): # In the host # # echo host_core >/proc/sys/kernel/core_pattern # cat /proc/sys/kernel/core_pattern host_core # ulimit -c 1024000 # ./make_dump Segmentation fault (core dumped) # ls -l -rw------- 1 root root 331776 Feb 4 18:02 host_core.2175 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # # In the container # # cat /proc/sys/kernel/core_pattern host_core # echo container_core >/proc/sys/kernel/core_pattern # ./make_dump Segmentation fault (core dumped) # ls -l -rwxr-xr-x 1 root root 759731 Feb 4 10:45 make_dump -rw------- 1 root root 331776 Feb 4 10:45 container_core.16 # # Return to host # # cat /proc/sys/kernel/core_pattern host_core # ls host_core.2175 make_dump make_dump.c # rm -f host_core.2175 # ./make_dump Segmentation fault (core dumped) # ls -l -rw------- 1 root root 331776 Feb 4 18:49 host_core.2351 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # Signed-off-by: Zhao Lei --- fs/coredump.c | 25 -- include/linux/pid_namespace.h | 3 +++ kernel/pid.c | 2 ++ kernel/pid_namespace.c| 2 ++ kernel/sysctl.c | 50 ++- 5 files changed, 70 insertions(+), 12 deletions(-) diff
--git a/fs/coredump.c b/fs/coredump.c index 83282d7..4bab7bf 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -50,7 +50,6 @@ int core_uses_pid; unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -58,8 +57,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -184,10 +181,10 @@ static int cn_print_exe_file(struct core_name *cn) * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(struct core_name *cn, struct coredump_params *cprm) +static int format_corename(struct core_name *cn, const char *pat_ptr, + struct coredump_params *cprm) { const struct cred *cred = current_cred(); - const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); int pid_in_pattern = 0; int err = 0; @@ -666,6 +663,8 @@ void do_coredump(const siginfo_t *siginfo) */ .mm_flags = mm->flags, }; + struct pid_namespace *pid_ns; + char core_pattern[CORENAME_MAX_SIZE]; audit_core_dumps(siginfo->si_signo); @@ -675,6 +674,18 @@ void do_coredump(const siginfo_t *siginfo) if (!__get_dumpable(cprm.mm_flags)) goto fail; + pid_ns = task_active_pid_ns(current); + spin_lock(&pid_ns->core_pattern_lock); + while (pid_ns != &init_pid_ns) { + if (pid_ns->core_pattern[0]) + break; + spin_unlock(&pid_ns->core_pattern_lock); + pid_ns = pid_ns->parent, + spin_lock(&pid_ns->core_pattern_lock); + } + strcpy(core_pattern, pid_ns->core_pattern); + spin_unlock(&pid_ns->core_pattern_lock); + cred = prepare_creds(); if (!cred) goto fail; @@ -696,7 +707,7 @@ void do_coredump(const siginfo_t *siginfo) old_cred = override_creds(cred); - ispipe = format_corename(&cn, &cprm); + ispipe = format_corename(&cn, core_pattern, &cprm); if (ispipe) { int 
dump_count; @@ -743,7 +754,7 @@ void do_coredump(const siginfo_t *siginfo) } rcu_read_lock(); - vinit_task = find_task_by_vpid(1); + vinit_task = find_task_by_pid_ns(1, pid_ns); rcu_read_unlock(); if (!vinit_task) { print
[PATCH 1/3] Make call_usermodehelper_exec possible to set namespaces
Currently, call_usermodehelper_exec_work() cannot set namespaces for the executed program. This patch adds that ability to call_usermodehelper_exec_work(). The init_intermediate callback is introduced for initialization work that must be done before fork(), which gives us a way to set namespaces for children. The cleanup_intermediate callback is introduced for cleaning up what was done in init_intermediate, such as switching back the namespace. This is helpful for coredump to run the pipe program in a specific container environment. Signed-off-by: Cao Shufeng Co-author-by: Zhao Lei --- fs/coredump.c | 3 ++- include/linux/kmod.h| 4 init/do_mounts_initrd.c | 3 ++- kernel/kmod.c | 43 +++ lib/kobject_uevent.c| 3 ++- security/keys/request_key.c | 4 ++-- 6 files changed, 47 insertions(+), 13 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index eb9c92c..9abf4e5 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -644,7 +644,8 @@ void do_coredump(const siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); + NULL, NULL, umh_pipe_setup, + NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/include/linux/kmod.h b/include/linux/kmod.h index fcfd2bf..994e429 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -61,6 +61,8 @@ struct subprocess_info { char **envp; int wait; int retval; + void (*init_intermediate)(struct subprocess_info *info); + void (*cleanup_intermediate)(struct subprocess_info *info); int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; @@ -71,6 +73,8 @@ call_usermodehelper(char *path, char **argv, char **envp, int wait); extern struct subprocess_info * call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask, + void (*init_intermediate)(struct subprocess_info *info), + void (*cleanup_intermediate)(struct subprocess_info *info), int
(*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index a1000ca..59d11c9 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -72,7 +72,8 @@ static void __init handle_initrd(void) current->flags |= PF_FREEZER_SKIP; info = call_usermodehelper_setup("/linuxrc", argv, envp_init, -GFP_KERNEL, init_linuxrc, NULL, NULL); +GFP_KERNEL, NULL, NULL, init_linuxrc, +NULL, NULL); if (!info) return; call_usermodehelper_exec(info, UMH_WAIT_PROC); diff --git a/kernel/kmod.c b/kernel/kmod.c index 0277d12..42f5a74 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -91,7 +91,8 @@ static int call_modprobe(char *module_name, int wait) argv[4] = NULL; info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, -NULL, free_modprobe_argv, NULL); +NULL, NULL, NULL, free_modprobe_argv, + NULL); if (!info) goto free_module_name; @@ -301,6 +302,9 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); + if(sub_info->cleanup_intermediate) { + sub_info->cleanup_intermediate(sub_info); + } umh_complete(sub_info); } @@ -322,6 +326,9 @@ static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); + if(sub_info->init_intermediate) { + sub_info->init_intermediate(sub_info); + } if (sub_info->wait & UMH_WAIT_PROC) { call_usermodehelper_exec_sync(sub_info); @@ -334,6 +341,11 @@ static void call_usermodehelper_exec_work(struct work_struct *work) */ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, CLONE_PARENT | SIGCHLD); + + if(sub_info->cleanup_inter
[PATCH 0/3] Make core_pattern support namespace
This patchset includes the following function points: 1: Make it possible for the usermodehelper function to set the pid namespace done by: [PATCH v4 1/3] Make call_usermodehelper_exec possible to set pid namespace. 2: Let pipe_type core_pattern write the dump into the container's rootfs done by: [PATCH v4 2/3] Limit dump_pipe program's permission to init for container. 3: Make a separate core_pattern setting for each container done by: [PATCH v4 3/3] Make core_pattern support namespace 4: Compatibility with the current system also included in: [PATCH v4 3/3] Make core_pattern support namespace If a container hasn't changed its core_pattern setting, it keeps the same setting as the host. Test: 1: Pass a test script for each function of this patchset ## TEST IN HOST ## [root@kerneldev dumptest]# ./test_host Set file core_pattern: OK ./test_host: line 41: 2366 Segmentation fault (core dumped) "$SCRIPT_BASE_DIR"/make_dump Checking dumpfile: OK Set file core_pattern: OK ./test_host: line 41: 2369 Segmentation fault (core dumped) "$SCRIPT_BASE_DIR"/make_dump Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking capabilities: OK ## TEST IN GUEST ## # ./test Segmentation fault (core dumped) Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking cg pids: OK Checking capabilities: OK [ 64.940734] make_dump[2432]: segfault at 0 ip 0040049d sp 00007ffc4af025f0 error 6 in make_dump[40+a6000] # 2: Pass other tests (which are not easy to do in a script) by hand. Changelog v3.1-v4: 1. Remove extra fork pointed out by: Andrei Vagin 2: Rebase on top of v4.9-rc8. Changelog v3-v3.1: 1. Switch "pwd" of pipe program to container's root fs. 2. Rebase on top of v4.9-rc1. Changelog v2->v3: 1: Fix problem of setting pid namespace, pointed out by: Andrei Vagin Changelog v1(RFC)->v2: 1: Add [PATCH 2/2] which was todo in [RFC v1]. 2: Pass a test script for each function.
3: Rebase on top of v4.7. Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Zhao Lei Signed-off-by: Cao Shufeng Cao Shufeng (2): Make call_usermodehelper_exec possible to set namespaces Limit dump_pipe program's permission to init for container Zhao Lei (1): Make core_pattern support namespace fs/coredump.c | 150 +++--- include/linux/binfmts.h | 2 + include/linux/kmod.h | 4 ++ include/linux/pid_namespace.h | 3 + init/do_mounts_initrd.c | 3 +- kernel/kmod.c | 43 +--- kernel/pid.c | 2 + kernel/pid_namespace.c| 2 + kernel/sysctl.c | 50 -- lib/kobject_uevent.c | 3 +- security/keys/request_key.c | 4 +- 11 files changed, 241 insertions(+), 25 deletions(-) -- 2.7.4
[PATCH 2/3] Limit dump_pipe program's permission to init for container
Currently, when we set core_pattern to a pipe, the pipe program is forked by a kthread running with root's permissions, and writes the dumpfile into the host's filesystem. The same thing happens for a container: the dumper and the dumpfile are also in the host (not in the container). This has the following problems: 1: Not consistent with file_type core_pattern When we set core_pattern to a file, the container will write the dump into the container's filesystem instead of the host's. 2: Not safe for privileged containers In a privileged container, a user can destroy the host system with the following command: # # In a container # echo "|/bin/dd of=/boot/vmlinuz" >/proc/sys/kernel/core_pattern # make_dump This patch switches the dumper program's environment to that of the init task, so, for a container, the dumper program has the same environment as the init task in the container, which makes the dumper program live in the container's filesystem and write the coredump into the container's filesystem. The dumper's permissions are also limited to a subset of those of the container's init process. Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Cao ShuFeng --- fs/coredump.c | 126 +++- include/linux/binfmts.h | 2 + 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 9abf4e5..83282d7 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -505,6 +505,45 @@ static void wait_for_dump_helpers(struct file *file) } /* + * umh_ns_setup + * set the namespaces to the base task of a container. + * we need to switch back to the original namespaces + * so that the thread of workqueue is not influenced. + * + * this method runs in workqueue kernel thread.
+ */ +static void umh_ns_setup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task = cp->base_task; + + if (base_task) { + cp->current_task_nsproxy = current->nsproxy; + //prevent current namespace from being freed + get_nsproxy(current->nsproxy); + /* Set namespaces to base_task */ + get_nsproxy(base_task->nsproxy); + switch_task_namespaces(current, base_task->nsproxy); + } +} + +/* + * umh_ns_cleanup + * cleanup what we have done in umh_ns_setup. + * + * this method runs in workqueue kernel thread. + */ +static void umh_ns_cleanup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct nsproxy *current_task_nsproxy = cp->current_task_nsproxy; + if (current_task_nsproxy) { + /* switch workqueue's original namespace back */ + switch_task_namespaces(current, current_task_nsproxy); + } +} + +/* * umh_pipe_setup * helper function to customize the process used * to collect the core in userspace. 
Specifically @@ -519,6 +558,8 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task; + int err = create_pipe_files(files, 0); if (err) return err; @@ -527,10 +568,76 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) err = replace_fd(0, files[0], 0); fput(files[0]); + if (err) + return err; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return err; + base_task = cp->base_task; + if (base_task) { + const struct cred *base_cred; + + /* Set fs_root to base_task */ + spin_lock(&base_task->fs->lock); + set_fs_root(current->fs, &base_task->fs->root); + set_fs_pwd(current->fs, &base_task->fs->pwd); + spin_unlock(&base_task->fs->lock); + + /* Set cgroup to base_task */ + current->flags &= ~PF_NO_SETAFFINITY; + err = cgroup_attach_task_all(base_task, current); + if (err < 0) + return err; + + /* Set cred to base_task */ + base_cred = get_task_cred(base_task); + + new->uid = base_cred->uid; + new->gid = base_cred->gid; + new->suid = base_cred->suid; + new->sgid = base_cred->sgid; + new->euid = base_cred->euid; + new->egid = base_cred->egid; + new->fsuid = base_cred->fsuid; + new->fsgid = base_cred->fsgid; + + new->securebits = base_cred->securebits; + + new->cap_inheritabl
[PATCH v4 3/3] Make core_pattern support namespace
From: Zhao Lei Currently, each container shares one copy of the coredump setting with the host system; if the host system changes the setting, every running container is affected. The same happens when a container changes core_pattern: both the host and the other containers are affected. For containers based on the namespace design, it is good to allow each container to keep its own coredump setting. This brings the following benefits: 1: Each container can change its own coredump setting by operating on /proc/sys/kernel/core_pattern 2: A coredump setting changed in the host will not affect running containers. 3: Both cases are supported: "putting coredump in guest" and "putting coredump in host". Each namespace-based software (lxc, docker, ..) can use this function to customize its dump setting. This function also makes each container work as a separate system, which fits the design goal of namespaces. Test(in lxc): # In the host # # echo host_core >/proc/sys/kernel/core_pattern # cat /proc/sys/kernel/core_pattern host_core # ulimit -c 1024000 # ./make_dump Segmentation fault (core dumped) # ls -l -rw------- 1 root root 331776 Feb 4 18:02 host_core.2175 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # # In the container # # cat /proc/sys/kernel/core_pattern host_core # echo container_core >/proc/sys/kernel/core_pattern # ./make_dump Segmentation fault (core dumped) # ls -l -rwxr-xr-x 1 root root 759731 Feb 4 10:45 make_dump -rw------- 1 root root 331776 Feb 4 10:45 container_core.16 # # Return to host # # cat /proc/sys/kernel/core_pattern host_core # ls host_core.2175 make_dump make_dump.c # rm -f host_core.2175 # ./make_dump Segmentation fault (core dumped) # ls -l -rw------- 1 root root 331776 Feb 4 18:49 host_core.2351 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # Signed-off-by: Zhao Lei --- fs/coredump.c | 25 -- include/linux/pid_namespace.h | 3 +++ kernel/pid.c | 2 ++ kernel/pid_namespace.c| 2 ++ kernel/sysctl.c | 50 ++- 5 files changed, 70 insertions(+), 12 deletions(-) diff
--git a/fs/coredump.c b/fs/coredump.c index aa2ef6c..f97a987 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -49,7 +49,6 @@ int core_uses_pid; unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -57,8 +56,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -183,10 +180,10 @@ static int cn_print_exe_file(struct core_name *cn) * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(struct core_name *cn, struct coredump_params *cprm) +static int format_corename(struct core_name *cn, const char *pat_ptr, + struct coredump_params *cprm) { const struct cred *cred = current_cred(); - const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); int pid_in_pattern = 0; int err = 0; @@ -663,6 +660,8 @@ void do_coredump(const siginfo_t *siginfo) */ .mm_flags = mm->flags, }; + struct pid_namespace *pid_ns; + char core_pattern[CORENAME_MAX_SIZE]; audit_core_dumps(siginfo->si_signo); @@ -672,6 +671,18 @@ void do_coredump(const siginfo_t *siginfo) if (!__get_dumpable(cprm.mm_flags)) goto fail; + pid_ns = task_active_pid_ns(current); + spin_lock(&pid_ns->core_pattern_lock); + while (pid_ns != &init_pid_ns) { + if (pid_ns->core_pattern[0]) + break; + spin_unlock(&pid_ns->core_pattern_lock); + pid_ns = pid_ns->parent, + spin_lock(&pid_ns->core_pattern_lock); + } + strcpy(core_pattern, pid_ns->core_pattern); + spin_unlock(&pid_ns->core_pattern_lock); + cred = prepare_creds(); if (!cred) goto fail; @@ -693,7 +704,7 @@ void do_coredump(const siginfo_t *siginfo) old_cred = override_creds(cred); - ispipe = format_corename(&cn, &cprm); + ispipe = format_corename(&cn, core_pattern, &cprm); if (ispipe) { int 
dump_count; @@ -740,7 +751,7 @@ void do_coredump(const siginfo_t *siginfo) } rcu_read_lock(); - vinit_task = find_task_by_vpid(1); + vinit_task = find_task_by_pid_ns(1, pid_ns); rcu_read_unlock(); if (!vinit_task) { print
[PATCH v4 1/3] Make call_usermodehelper_exec possible to set namespaces
Currently, call_usermodehelper_exec_work() cannot set namespaces for the executed program. This patch adds that capability to call_usermodehelper_exec_work(). The init_intermediate callback is introduced for initialization work that must be done before fork(), which gives us a way to set namespaces for the children. The cleanup_intermediate callback is introduced for cleaning up what was done in init_intermediate, such as switching the namespaces back. This is helpful for coredump, which needs to run the pipe program in a specific container environment. Signed-off-by: Cao Shufeng Co-author-by: Zhao Lei --- fs/coredump.c | 3 ++- include/linux/kmod.h| 4 init/do_mounts_initrd.c | 3 ++- kernel/kmod.c | 43 +++ lib/kobject_uevent.c| 3 ++- security/keys/request_key.c | 4 ++-- 6 files changed, 47 insertions(+), 13 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 281b768..52f2ed6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -641,7 +641,8 @@ void do_coredump(const siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); + NULL, NULL, umh_pipe_setup, + NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/include/linux/kmod.h b/include/linux/kmod.h index fcfd2bf..994e429 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -61,6 +61,8 @@ struct subprocess_info { char **envp; int wait; int retval; + void (*init_intermediate)(struct subprocess_info *info); + void (*cleanup_intermediate)(struct subprocess_info *info); int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; @@ -71,6 +73,8 @@ call_usermodehelper(char *path, char **argv, char **envp, int wait); extern struct subprocess_info * call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask, + void (*init_intermediate)(struct subprocess_info *info), + void (*cleanup_intermediate)(struct subprocess_info *info), int 
(*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index a1000ca..59d11c9 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -72,7 +72,8 @@ static void __init handle_initrd(void) current->flags |= PF_FREEZER_SKIP; info = call_usermodehelper_setup("/linuxrc", argv, envp_init, -GFP_KERNEL, init_linuxrc, NULL, NULL); +GFP_KERNEL, NULL, NULL, init_linuxrc, +NULL, NULL); if (!info) return; call_usermodehelper_exec(info, UMH_WAIT_PROC); diff --git a/kernel/kmod.c b/kernel/kmod.c index 0277d12..42f5a74 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -91,7 +91,8 @@ static int call_modprobe(char *module_name, int wait) argv[4] = NULL; info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, -NULL, free_modprobe_argv, NULL); +NULL, NULL, NULL, free_modprobe_argv, + NULL); if (!info) goto free_module_name; @@ -301,6 +302,9 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); + if(sub_info->cleanup_intermediate) { + sub_info->cleanup_intermediate(sub_info); + } umh_complete(sub_info); } @@ -322,6 +326,9 @@ static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); + if(sub_info->init_intermediate) { + sub_info->init_intermediate(sub_info); + } if (sub_info->wait & UMH_WAIT_PROC) { call_usermodehelper_exec_sync(sub_info); @@ -334,6 +341,11 @@ static void call_usermodehelper_exec_work(struct work_struct *work) */ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, CLONE_PARENT | SIGCHLD); + + if(sub_info->cleanup_inter
[PATCH v4 0/3] Make core_pattern support namespace
This patchset includes the following function points: 1: Make it possible for the usermodehelper function to set the pid namespace, done by: [PATCH v4 1/3] Make call_usermodehelper_exec possible to set namespaces. 2: Let a pipe-type core_pattern write the dump into the container's rootfs, done by: [PATCH v4 2/3] Limit dump_pipe program's permission to init for container. 3: Make a separate core_pattern setting for each container, done by: [PATCH v4 3/3] Make core_pattern support namespace 4: Compatibility with the current system, also included in: [PATCH v4 3/3] Make core_pattern support namespace If a container hasn't changed its core_pattern setting, it keeps the same setting as the host. Test: 1: Passed a test script for each function of this patchset ## TEST IN HOST ## [root@kerneldev dumptest]# ./test_host Set file core_pattern: OK ./test_host: line 41: 2366 Segmentation fault (core dumped) "$SCRIPT_BASE_DIR"/make_dump Checking dumpfile: OK Set file core_pattern: OK ./test_host: line 41: 2369 Segmentation fault (core dumped) "$SCRIPT_BASE_DIR"/make_dump Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking capabilities: OK ## TEST IN GUEST ## # ./test Segmentation fault (core dumped) Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking cg pids: OK Checking capabilities: OK [ 64.940734] make_dump[2432]: segfault at 0 ip 0040049d sp 00007ffc4af025f0 error 6 in make_dump[40+a6000] # 2: Passed other tests (which are not easy to script) by hand. Changelog v3.1-v4: 1. Remove extra fork, pointed out by: Andrei Vagin Changelog v3-v3.1: 1. Switch "pwd" of pipe program to container's root fs. 2. Rebase on top of v4.9-rc1 Changelog v2->v3: 1: Fix problem of setting pid namespace, pointed out by: Andrei Vagin Changelog v1(RFC)->v2: 1: Add [PATCH 2/2] which was todo in [RFC v1]. 2: Pass a test script for each function. 3: Rebase on top of v4.7. 
Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Zhao Lei Signed-off-by: Cao Shufeng Cao Shufeng (2): Make call_usermodehelper_exec possible to set namespaces Limit dump_pipe program's permission to init for container Zhao Lei (1): Make core_pattern support namespace fs/coredump.c | 150 +++--- include/linux/binfmts.h | 2 + include/linux/kmod.h | 4 ++ include/linux/pid_namespace.h | 3 + init/do_mounts_initrd.c | 3 +- kernel/kmod.c | 43 +--- kernel/pid.c | 2 + kernel/pid_namespace.c| 2 + kernel/sysctl.c | 50 -- lib/kobject_uevent.c | 3 +- security/keys/request_key.c | 4 +- 11 files changed, 241 insertions(+), 25 deletions(-) -- 2.7.4 >From caosf.f...@cn.fujitsu.com Tue Oct 25 15:28:53 2016 Received: from localhost.localdomain (10.167.226.94) by G08CNEXCHPEKD01.g08.fujitsu.local (10.167.33.89) with Microsoft SMTP Server (TLS) id 14.3.279.2; Tue, 25 Oct 2016 15:28:53 +0800 From: Cao Shufeng To: CC: , , , , , , , , Subject: [PATCH v4 3/3] Make core_pattern support namespace Date: Tue, 25 Oct 2016 15:28:56 +0800 Message-ID: <1477380536-3307-4-git-send-email-caosf.f...@cn.fujitsu.com> X-Mailer: git-send-email 2.1.0 In-Reply-To: <1477380536-3307-1-git-send-email-caosf.f...@cn.fujitsu.com> References: <1477380536-3307-1-git-send-email-caosf.f...@cn.fujitsu.com> Content-Type: text/plain Return-Path: caosf.f...@cn.fujitsu.com X-MS-Exchange-Organization-AuthSource: G08CNEXCHPEKD01.g08.fujitsu.local X-MS-Exchange-Organization-AuthAs: Internal X-MS-Exchange-Organization-AuthMechanism: 06 X-Originating-IP: [10.167.226.94] X-MS-Exchange-Organization-AVStamp-Mailbox: SMEXw]nP;1285660;0;This mail has been scanned by Trend Micro ScanMail for Microsoft Exchange; X-MS-Exchange-Organization-SCL: 0 MIME-Version: 1.0 X-Evolution-POP3-UID: 24016 X-Evolution-Source: 1406508640.5943.5@localhost.localdomain Content-Transfer-Encoding: 8bit From: Zhao Lei Currently, each container shared one copy of coredump setting with the host system, if host system changed the 
setting, each running containers will be affected. Same story happened when container changed core_pattern, both host and other container will be affected. For container based on namespace design, it is good to allow each container keeping their own coredump setting. It will bring us following benefit: 1: Each container can change their own coredump setting based on operation on /proc/sys/kernel/core_pattern 2: Coredump setting changed in host will not affect running containers. 3: Support both case of "putting coredump in guest" and "pu
[PATCH v4 2/3] Limit dump_pipe program's permission to init for container
Currently, when we set core_pattern to a pipe, the pipe program is forked by a kthread running with root's permissions, and it writes the dumpfile into the host's filesystem. The same thing happens for a container: the dumper and the dumpfile are also in the host (not in the container). This has the following problems: 1: Not consistent with file_type core_pattern When we set core_pattern to a file, the container will write the dump into the container's filesystem instead of the host's. 2: Not safe for privileged container In a privileged container, a user can destroy the host system with the following command: # # In a container # echo "|/bin/dd of=/boot/vmlinuz" >/proc/sys/kernel/core_pattern # make_dump This patch switches the dumper program's environment to that of the init task, so that, for a container, the dumper program has the same environment as the container's init task; the dumper program therefore resides in the container's filesystem and writes the coredump into the container's filesystem. The dumper's permissions are also limited to a subset of those of the container's init process. Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Cao ShuFeng --- fs/coredump.c | 126 +++- include/linux/binfmts.h | 2 + 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 52f2ed6..aa2ef6c 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -502,6 +502,45 @@ static void wait_for_dump_helpers(struct file *file) } /* + * umh_ns_setup + * set the namesapces to the bask task of a container. + * we need to switch back to the original namespaces + * so that the thread of workqueue is not influlenced. + * + * this method runs in workqueue kernel thread. 
+ */ +static void umh_ns_setup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task = cp->base_task; + + if (base_task) { + cp->current_task_nsproxy = current->nsproxy; + //prevent current namespace from being freed + get_nsproxy(current->nsproxy); + /* Set namespaces to base_task */ + get_nsproxy(base_task->nsproxy); + switch_task_namespaces(current, base_task->nsproxy); + } +} + +/* + * umh_ns_cleanup + * cleanup what we have done in umh_ns_setup. + * + * this method runs in workqueue kernel thread. + */ +static void umh_ns_cleanup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct nsproxy *current_task_nsproxy = cp->current_task_nsproxy; + if (current_task_nsproxy) { + /* switch workqueue's original namespace back */ + switch_task_namespaces(current, current_task_nsproxy); + } +} + +/* * umh_pipe_setup * helper function to customize the process used * to collect the core in userspace. 
Specifically @@ -516,6 +555,8 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task; + int err = create_pipe_files(files, 0); if (err) return err; @@ -524,10 +565,76 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) err = replace_fd(0, files[0], 0); fput(files[0]); + if (err) + return err; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return err; + base_task = cp->base_task; + if (base_task) { + const struct cred *base_cred; + + /* Set fs_root to base_task */ + spin_lock(&base_task->fs->lock); + set_fs_root(current->fs, &base_task->fs->root); + set_fs_pwd(current->fs, &base_task->fs->pwd); + spin_unlock(&base_task->fs->lock); + + /* Set cgroup to base_task */ + current->flags &= ~PF_NO_SETAFFINITY; + err = cgroup_attach_task_all(base_task, current); + if (err < 0) + return err; + + /* Set cred to base_task */ + base_cred = get_task_cred(base_task); + + new->uid = base_cred->uid; + new->gid = base_cred->gid; + new->suid = base_cred->suid; + new->sgid = base_cred->sgid; + new->euid = base_cred->euid; + new->egid = base_cred->egid; + new->fsuid = base_cred->fsuid; + new->fsgid = base_cred->fsgid; + + new->securebits = base_cred->securebits; + + new->cap_inheritabl
Re: [PATCH v4 0/3] Make core_pattern support namespace
ping 在 2016-10-25二的 15:28 +0800,Cao Shufeng写道: > This patchset includes following function points: > 1: Let usermodehelper function possible to set pid namespace >done by: [PATCH v4 1/3] Make call_usermodehelper_exec possible >to set pid namespace. > 2: Let pipe_type core_pattern write dump into container's rootfs >done by: [PATCH v4 2/3] Limit dump_pipe program's permission to >init for container. > 2: Make separate core_pattern setting for each container >done by: [PATCH v4 3/3] Make core_pattern support namespace > 3: Compatibility with current system >also included in: [PATCH v4 3/3] Make core_pattern support namespace >If container hadn't change core_pattern setting, it will keep >same setting with host. > > Test: > 1: Pass a test script for each function of this patchset >## TEST IN HOST ## >[root@kerneldev dumptest]# ./test_host >Set file core_pattern: OK >./test_host: line 41: 2366 Segmentation fault (core dumped) "$SCRI= > PT_BASE_DIR"/make_dump >Checking dumpfile: OK >Set file core_pattern: OK >./test_host: line 41: 2369 Segmentation fault (core dumped) "$SCRI= > PT_BASE_DIR"/make_dump >Checking dump_pipe triggered: OK >Checking rootfs: OK >Checking dumpfile: OK >Checking namespace: OK >Checking process list: OK >Checking capabilities: OK > >## TEST IN GUEST ## ># ./test >Segmentation fault (core dumped) >Checking dump_pipe triggered: OK >Checking rootfs: OK >Checking dumpfile: OK >Checking namespace: OK >Checking process list: OK >Checking cg pids: OK >Checking capabilities: OK >[ 64.940734] make_dump[2432]: segfault at 0 ip 0040049d sp 000= > 07ffc4af025f0 error 6 in make_dump[40+a6000] ># > 2: Pass other test(which is not easy to do in script) by hand. > > Changelog v3.1-v4: > 1. remove extra fork pointed out by: >Andrei Vagin > > Changelog v3-v3.1: > 1. Switch "pwd" of pipe program to container's root fs. > 2. 
Rebase on top of v4.9-rc1 > > Changelog v2->v3: > 1: Fix problem of setting pid namespace, pointed out by: >Andrei Vagin > > Changelog v1(RFC)->v2: > 1: Add [PATCH 2/2] which was todo in [RFC v1]. > 2: Pass a test script for each function. > 3: Rebase on top of v4.7. > > Suggested-by: Eric W. Biederman > Suggested-by: KOSAKI Motohiro > Signed-off-by: Zhao Lei > Signed-off-by: Cao Shufeng > > Cao Shufeng (2): > Make call_usermodehelper_exec possible to set namespaces > Limit dump_pipe program's permission to init for container > > Zhao Lei (1): > Make core_pattern support namespace > > fs/coredump.c | 150 > +++--- > include/linux/binfmts.h | 2 + > include/linux/kmod.h | 4 ++ > include/linux/pid_namespace.h | 3 + > init/do_mounts_initrd.c | 3 +- > kernel/kmod.c | 43 +++++--- > kernel/pid.c | 2 + > kernel/pid_namespace.c| 2 + > kernel/sysctl.c | 50 -- > lib/kobject_uevent.c | 3 +- > security/keys/request_key.c | 4 +- > 11 files changed, 241 insertions(+), 25 deletions(-) > -- Best Regards, Cao Shufeng -- Cao Shufeng Development Dept.I Nanjing Fujitsu Nanda Software Tech. Co., Ltd.(FNST) No.6 Wenzhu Road, Nanjing, 210012, China TEL: +86+25-86630566-8552 FUJITSU INTERNAL: 7998-8552 EMail: caosf.f...@cn.fujitsu.com
[PATCH v4 3/3] Make core_pattern support namespace
From: Zhao Lei Currently, each container shared one copy of coredump setting with the host system, if host system changed the setting, each running containers will be affected. Same story happened when container changed core_pattern, both host and other container will be affected. For container based on namespace design, it is good to allow each container keeping their own coredump setting. It will bring us following benefit: 1: Each container can change their own coredump setting based on operation on /proc/sys/kernel/core_pattern 2: Coredump setting changed in host will not affect running containers. 3: Support both case of "putting coredump in guest" and "putting curedump in host". Each namespace-based software(lxc, docker, ..) can use this function to custom their dump setting. And this function makes each continer working as separate system, it fit for design goal of namespace. Test(in lxc): # In the host # # echo host_core >/proc/sys/kernel/core_pattern # cat /proc/sys/kernel/core_pattern host_core # ulimit -c 1024000 # ./make_dump Segmentation fault (core dumped) # ls -l -rw--- 1 root root 331776 Feb 4 18:02 host_core.2175 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # # In the container # # cat /proc/sys/kernel/core_pattern host_core # echo container_core >/proc/sys/kernel/core_pattern # ./make_dump Segmentation fault (core dumped) # ls -l -rwxr-xr-x1 root root 759731 Feb 4 10:45 make_dump -rw---1 root root 331776 Feb 4 10:45 container_core.16 # # Return to host # # cat /proc/sys/kernel/core_pattern host_core # ls host_core.2175 make_dump make_dump.c # rm -f host_core.2175 # ./make_dump Segmentation fault (core dumped) # ls -l -rw--- 1 root root 331776 Feb 4 18:49 host_core.2351 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # Signed-off-by: Zhao Lei --- fs/coredump.c | 25 -- include/linux/pid_namespace.h | 3 +++ kernel/pid.c | 2 ++ kernel/pid_namespace.c| 2 ++ kernel/sysctl.c | 50 ++- 5 files changed, 70 insertions(+), 12 deletions(-) diff 
--git a/fs/coredump.c b/fs/coredump.c index aa2ef6c..f97a987 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -49,7 +49,6 @@ int core_uses_pid; unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -57,8 +56,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -183,10 +180,10 @@ static int cn_print_exe_file(struct core_name *cn) * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(struct core_name *cn, struct coredump_params *cprm) +static int format_corename(struct core_name *cn, const char *pat_ptr, + struct coredump_params *cprm) { const struct cred *cred = current_cred(); - const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); int pid_in_pattern = 0; int err = 0; @@ -663,6 +660,8 @@ void do_coredump(const siginfo_t *siginfo) */ .mm_flags = mm->flags, }; + struct pid_namespace *pid_ns; + char core_pattern[CORENAME_MAX_SIZE]; audit_core_dumps(siginfo->si_signo); @@ -672,6 +671,18 @@ void do_coredump(const siginfo_t *siginfo) if (!__get_dumpable(cprm.mm_flags)) goto fail; + pid_ns = task_active_pid_ns(current); + spin_lock(&pid_ns->core_pattern_lock); + while (pid_ns != &init_pid_ns) { + if (pid_ns->core_pattern[0]) + break; + spin_unlock(&pid_ns->core_pattern_lock); + pid_ns = pid_ns->parent, + spin_lock(&pid_ns->core_pattern_lock); + } + strcpy(core_pattern, pid_ns->core_pattern); + spin_unlock(&pid_ns->core_pattern_lock); + cred = prepare_creds(); if (!cred) goto fail; @@ -693,7 +704,7 @@ void do_coredump(const siginfo_t *siginfo) old_cred = override_creds(cred); - ispipe = format_corename(&cn, &cprm); + ispipe = format_corename(&cn, core_pattern, &cprm); if (ispipe) { int 
dump_count; @@ -740,7 +751,7 @@ void do_coredump(const siginfo_t *siginfo) } rcu_read_lock(); - vinit_task = find_task_by_vpid(1); + vinit_task = find_task_by_pid_ns(1, pid_ns); rcu_read_unlock(); if (!vinit_task) { print
[PATCH v4 1/3] Make call_usermodehelper_exec possible to set namespaces
Current call_usermodehelper_work() can not set namespaces for the executed program. This patch add above function for call_usermodehelper_work(). The init_intermediate is introduced for init works which should be done before fork(). So that we get a method to set namespaces for children. The cleanup_intermediate is introduced for cleaning up what we have done in init_intermediate, like switching back the namespace. This function is helpful for coredump to run pipe_program in specific container environment. Signed-off-by: Cao Shufeng Co-author-by: Zhao Lei --- fs/coredump.c | 3 ++- include/linux/kmod.h| 4 init/do_mounts_initrd.c | 3 ++- kernel/kmod.c | 43 +++ lib/kobject_uevent.c| 3 ++- security/keys/request_key.c | 4 ++-- 6 files changed, 47 insertions(+), 13 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 281b768..52f2ed6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -641,7 +641,8 @@ void do_coredump(const siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); + NULL, NULL, umh_pipe_setup, + NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/include/linux/kmod.h b/include/linux/kmod.h index fcfd2bf..994e429 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -61,6 +61,8 @@ struct subprocess_info { char **envp; int wait; int retval; + void (*init_intermediate)(struct subprocess_info *info); + void (*cleanup_intermediate)(struct subprocess_info *info); int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; @@ -71,6 +73,8 @@ call_usermodehelper(char *path, char **argv, char **envp, int wait); extern struct subprocess_info * call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask, + void (*init_intermediate)(struct subprocess_info *info), + void (*cleanup_intermediate)(struct subprocess_info *info), int 
(*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index a1000ca..59d11c9 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -72,7 +72,8 @@ static void __init handle_initrd(void) current->flags |= PF_FREEZER_SKIP; info = call_usermodehelper_setup("/linuxrc", argv, envp_init, -GFP_KERNEL, init_linuxrc, NULL, NULL); +GFP_KERNEL, NULL, NULL, init_linuxrc, +NULL, NULL); if (!info) return; call_usermodehelper_exec(info, UMH_WAIT_PROC); diff --git a/kernel/kmod.c b/kernel/kmod.c index 0277d12..42f5a74 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -91,7 +91,8 @@ static int call_modprobe(char *module_name, int wait) argv[4] = NULL; info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, -NULL, free_modprobe_argv, NULL); +NULL, NULL, NULL, free_modprobe_argv, + NULL); if (!info) goto free_module_name; @@ -301,6 +302,9 @@ static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info) /* Restore default kernel sig handler */ kernel_sigaction(SIGCHLD, SIG_IGN); + if(sub_info->cleanup_intermediate) { + sub_info->cleanup_intermediate(sub_info); + } umh_complete(sub_info); } @@ -322,6 +326,9 @@ static void call_usermodehelper_exec_work(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); + if(sub_info->init_intermediate) { + sub_info->init_intermediate(sub_info); + } if (sub_info->wait & UMH_WAIT_PROC) { call_usermodehelper_exec_sync(sub_info); @@ -334,6 +341,11 @@ static void call_usermodehelper_exec_work(struct work_struct *work) */ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, CLONE_PARENT | SIGCHLD); + + if(sub_info->cleanup_inter
[PATCH v4 2/3] Limit dump_pipe program's permission to init for container
Currently when we set core_pattern to a pipe, the pipe program is forked by kthread running with root's permission, and write dumpfile into host's filesystem. Same thing happened for container, the dumper and dumpfile are also in host(not in container). It have following program: 1: Not consistent with file_type core_pattern When we set core_pattern to a file, the container will write dump into container's filesystem instead of host. 2: Not safe for privileged container In a privileged container, user can destroy host system by following command: # # In a container # echo "|/bin/dd of=/boot/vmlinuz" >/proc/sys/kernel/core_pattern # make_dump This patch switch dumper program's environment to init task, so, for container, dumper program have same environment with init task in container, which make dumper program put in container's filesystem, and write coredump into container's filesystem. The dumper's permission is also limited into subset of container's init process. Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Cao ShuFeng --- fs/coredump.c | 126 +++- include/linux/binfmts.h | 2 + 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 52f2ed6..aa2ef6c 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -502,6 +502,45 @@ static void wait_for_dump_helpers(struct file *file) } /* + * umh_ns_setup + * set the namesapces to the bask task of a container. + * we need to switch back to the original namespaces + * so that the thread of workqueue is not influlenced. + * + * this method runs in workqueue kernel thread. 
+ */ +static void umh_ns_setup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task = cp->base_task; + + if (base_task) { + cp->current_task_nsproxy = current->nsproxy; + //prevent current namespace from being freed + get_nsproxy(current->nsproxy); + /* Set namespaces to base_task */ + get_nsproxy(base_task->nsproxy); + switch_task_namespaces(current, base_task->nsproxy); + } +} + +/* + * umh_ns_cleanup + * cleanup what we have done in umh_ns_setup. + * + * this method runs in workqueue kernel thread. + */ +static void umh_ns_cleanup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct nsproxy *current_task_nsproxy = cp->current_task_nsproxy; + if (current_task_nsproxy) { + /* switch workqueue's original namespace back */ + switch_task_namespaces(current, current_task_nsproxy); + } +} + +/* * umh_pipe_setup * helper function to customize the process used * to collect the core in userspace. 
Specifically @@ -516,6 +555,8 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task; + int err = create_pipe_files(files, 0); if (err) return err; @@ -524,10 +565,76 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) err = replace_fd(0, files[0], 0); fput(files[0]); + if (err) + return err; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return err; + base_task = cp->base_task; + if (base_task) { + const struct cred *base_cred; + + /* Set fs_root to base_task */ + spin_lock(&base_task->fs->lock); + set_fs_root(current->fs, &base_task->fs->root); + set_fs_pwd(current->fs, &base_task->fs->pwd); + spin_unlock(&base_task->fs->lock); + + /* Set cgroup to base_task */ + current->flags &= ~PF_NO_SETAFFINITY; + err = cgroup_attach_task_all(base_task, current); + if (err < 0) + return err; + + /* Set cred to base_task */ + base_cred = get_task_cred(base_task); + + new->uid = base_cred->uid; + new->gid = base_cred->gid; + new->suid = base_cred->suid; + new->sgid = base_cred->sgid; + new->euid = base_cred->euid; + new->egid = base_cred->egid; + new->fsuid = base_cred->fsuid; + new->fsgid = base_cred->fsgid; + + new->securebits = base_cred->securebits; + + new->cap_inheritabl
[PATCH v4 0/3] Make core_pattern support namespace
This patchset includes following function points: 1: Let usermodehelper function possible to set pid namespace done by: [PATCH v4 1/3] Make call_usermodehelper_exec possible to set pid namespace. 2: Let pipe_type core_pattern write dump into container's rootfs done by: [PATCH v4 2/3] Limit dump_pipe program's permission to init for container. 2: Make separate core_pattern setting for each container done by: [PATCH v4 3/3] Make core_pattern support namespace 3: Compatibility with current system also included in: [PATCH v4 3/3] Make core_pattern support namespace If container hadn't change core_pattern setting, it will keep same setting with host. Test: 1: Pass a test script for each function of this patchset ## TEST IN HOST ## [root@kerneldev dumptest]# ./test_host Set file core_pattern: OK ./test_host: line 41: 2366 Segmentation fault (core dumped) "$SCRI= PT_BASE_DIR"/make_dump Checking dumpfile: OK Set file core_pattern: OK ./test_host: line 41: 2369 Segmentation fault (core dumped) "$SCRI= PT_BASE_DIR"/make_dump Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking capabilities: OK ## TEST IN GUEST ## # ./test Segmentation fault (core dumped) Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking cg pids: OK Checking capabilities: OK [ 64.940734] make_dump[2432]: segfault at 0 ip 0040049d sp 000= 07ffc4af025f0 error 6 in make_dump[40+a6000] # 2: Pass other test(which is not easy to do in script) by hand. Changelog v3.1-v4: 1. remove extra fork pointed out by: Andrei Vagin Changelog v3-v3.1: 1. Switch "pwd" of pipe program to container's root fs. 2. Rebase on top of v4.9-rc1 Changelog v2->v3: 1: Fix problem of setting pid namespace, pointed out by: Andrei Vagin Changelog v1(RFC)->v2: 1: Add [PATCH 2/2] which was todo in [RFC v1]. 2: Pass a test script for each function. 3: Rebase on top of v4.7. 
Suggested-by: Eric W. Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Zhao Lei Signed-off-by: Cao Shufeng Cao Shufeng (2): Make call_usermodehelper_exec possible to set namespaces Limit dump_pipe program's permission to init for container Zhao Lei (1): Make core_pattern support namespace fs/coredump.c | 150 +++--- include/linux/binfmts.h | 2 + include/linux/kmod.h | 4 ++ include/linux/pid_namespace.h | 3 + init/do_mounts_initrd.c | 3 +- kernel/kmod.c | 43 +--- kernel/pid.c | 2 + kernel/pid_namespace.c| 2 + kernel/sysctl.c | 50 -- lib/kobject_uevent.c | 3 +- security/keys/request_key.c | 4 +- 11 files changed, 241 insertions(+), 25 deletions(-) -- 2.7.4
[PATCH v3.1 3/3] Make core_pattern support namespace
From: Zhao Lei Currently, each container shares one copy of the coredump setting with the host system; if the host system changes the setting, every running container is affected. The same happens when a container changes core_pattern: both the host and the other containers are affected. For containers based on the namespace design, it is good to allow each container to keep its own coredump setting. This brings the following benefits: 1: Each container can change its own coredump setting by writing to /proc/sys/kernel/core_pattern 2: Coredump settings changed in the host will not affect running containers. 3: Support for both cases of "putting coredump in guest" and "putting coredump in host". Any namespace-based software (lxc, docker, ..) can use this function to customize its dump setting. This function also makes each container work as a separate system, which fits the design goal of namespaces. Test(in lxc): # In the host # # echo host_core >/proc/sys/kernel/core_pattern # cat /proc/sys/kernel/core_pattern host_core # ulimit -c 1024000 # ./make_dump Segmentation fault (core dumped) # ls -l -rw--- 1 root root 331776 Feb 4 18:02 host_core.2175 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # # In the container # # cat /proc/sys/kernel/core_pattern host_core # echo container_core >/proc/sys/kernel/core_pattern # ./make_dump Segmentation fault (core dumped) # ls -l -rwxr-xr-x1 root root 759731 Feb 4 10:45 make_dump -rw---1 root root 331776 Feb 4 10:45 container_core.16 # # Return to host # # cat /proc/sys/kernel/core_pattern host_core # ls host_core.2175 make_dump make_dump.c # rm -f host_core.2175 # ./make_dump Segmentation fault (core dumped) # ls -l -rw--- 1 root root 331776 Feb 4 18:49 host_core.2351 -rwxr-xr-x 1 root root 759731 Feb 4 18:01 make_dump # Signed-off-by: Zhao Lei --- fs/coredump.c | 25 -- include/linux/pid_namespace.h | 3 +++ kernel/pid.c | 2 ++ kernel/pid_namespace.c| 2 ++ kernel/sysctl.c | 50 ++- 5 files changed, 70 insertions(+), 12 deletions(-) diff
--git a/fs/coredump.c b/fs/coredump.c index 8ea8bc1..b854541 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -49,7 +49,6 @@ int core_uses_pid; unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -57,8 +56,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -183,10 +180,10 @@ static int cn_print_exe_file(struct core_name *cn) * name into corename, which must have space for at least * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. */ -static int format_corename(struct core_name *cn, struct coredump_params *cprm) +static int format_corename(struct core_name *cn, const char *pat_ptr, + struct coredump_params *cprm) { const struct cred *cred = current_cred(); - const char *pat_ptr = core_pattern; int ispipe = (*pat_ptr == '|'); int pid_in_pattern = 0; int err = 0; @@ -641,6 +638,8 @@ void do_coredump(const siginfo_t *siginfo) */ .mm_flags = mm->flags, }; + struct pid_namespace *pid_ns; + char core_pattern[CORENAME_MAX_SIZE]; audit_core_dumps(siginfo->si_signo); @@ -650,6 +649,18 @@ void do_coredump(const siginfo_t *siginfo) if (!__get_dumpable(cprm.mm_flags)) goto fail; + pid_ns = task_active_pid_ns(current); + spin_lock(&pid_ns->core_pattern_lock); + while (pid_ns != &init_pid_ns) { + if (pid_ns->core_pattern[0]) + break; + spin_unlock(&pid_ns->core_pattern_lock); + pid_ns = pid_ns->parent, + spin_lock(&pid_ns->core_pattern_lock); + } + strcpy(core_pattern, pid_ns->core_pattern); + spin_unlock(&pid_ns->core_pattern_lock); + cred = prepare_creds(); if (!cred) goto fail; @@ -671,7 +682,7 @@ void do_coredump(const siginfo_t *siginfo) old_cred = override_creds(cred); - ispipe = format_corename(&cn, &cprm); + ispipe = format_corename(&cn, core_pattern, &cprm); if (ispipe) { int 
dump_count; @@ -718,7 +729,7 @@ void do_coredump(const siginfo_t *siginfo) } rcu_read_lock(); - vinit_task = find_task_by_vpid(1); + vinit_task = find_task_by_pid_ns(1, pid_ns); rcu_read_unlock(); if (!vinit_task) { print
[PATCH v3.1 0/3] Make core_pattern support namespace
This patchset includes the following function points: 1: Make it possible for the usermodehelper function to set the pid namespace, done by: [PATCH v3.1 1/3] Make call_usermodehelper_exec possible to set pid namespace. 2: Let a pipe-type core_pattern write the dump into the container's rootfs, done by: [PATCH v3.1 2/3] Limit dump_pipe program's permission to init for container. 3: Provide a separate core_pattern setting for each container, done by: [PATCH v3.1 3/3] Make core_pattern support namespace 4: Compatibility with the current system is also included in: [PATCH v3.1 3/3] Make core_pattern support namespace If a container has not changed its core_pattern setting, it keeps the same setting as the host. Test: 1: Pass a test script for each function of this patchset ## TEST IN HOST ## [root@kerneldev dumptest]# ./test_host Set file core_pattern: OK ./test_host: line 41: 2366 Segmentation fault (core dumped) "$SCRIPT_BASE_DIR"/make_dump Checking dumpfile: OK Set file core_pattern: OK ./test_host: line 41: 2369 Segmentation fault (core dumped) "$SCRIPT_BASE_DIR"/make_dump Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking capabilities: OK ## TEST IN GUEST ## # ./test Segmentation fault (core dumped) Checking dump_pipe triggered: OK Checking rootfs: OK Checking dumpfile: OK Checking namespace: OK Checking process list: OK Checking cg pids: OK Checking capabilities: OK [ 64.940734] make_dump[2432]: segfault at 0 ip 0040049d sp 00007ffc4af025f0 error 6 in make_dump[40+a6000] # 2: Pass other tests (which are not easy to do in a script) by hand. Changelog v3-v3.1: 1. Switch "pwd" of pipe program to container's root fs. 2. Rebase on top of v4.9-rc1 Changelog v2->v3: 1: Fix problem of setting pid namespace, pointed out by: Andrei Vagin Changelog v1(RFC)->v2: 1: Add [PATCH 2/2] which was todo in [RFC v1]. 2: Pass a test script for each function. 3: Rebase on top of v4.7. Suggested-by: Eric W.
Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Zhao Lei Zhao Lei (3): Make call_usermodehelper_exec possible to set pid namespace Limit dump_pipe program's permission to init for container Make core_pattern support namespace fs/coredump.c | 127 +--- include/linux/binfmts.h | 1 + include/linux/kmod.h | 2 + include/linux/pid_namespace.h | 3 + init/do_mounts_initrd.c | 3 +- kernel/kmod.c | 133 +- kernel/pid.c | 2 + kernel/pid_namespace.c| 2 + kernel/sysctl.c | 50 ++-- lib/kobject_uevent.c | 3 +- security/keys/request_key.c | 4 +- 11 files changed, 297 insertions(+), 33 deletions(-) -- 2.7.4
[PATCH v3.1 2/3] Limit dump_pipe program's permission to init for container
From: Zhao Lei Currently, when we set core_pattern to a pipe, the pipe program is forked by a kthread running with root's permissions, and it writes the dumpfile into the host's filesystem. The same thing happens for a container: the dumper and the dumpfile are also in the host (not in the container). This has the following problems: 1: Not consistent with file-type core_pattern When we set core_pattern to a file, the container writes the dump into the container's filesystem instead of the host's. 2: Not safe for privileged containers In a privileged container, a user can destroy the host system with the following commands: # # In a container # echo "|/bin/dd of=/boot/vmlinuz" >/proc/sys/kernel/core_pattern # make_dump This patch switches the dumper program's environment to that of the init task, so, for a container, the dumper program has the same environment as the container's init task; the dumper program therefore resides in the container's filesystem and writes the coredump into the container's filesystem. The dumper's permissions are also limited to a subset of those of the container's init process. Suggested-by: Eric W.
Biederman Suggested-by: KOSAKI Motohiro Signed-off-by: Zhao Lei Co-Authored-By: Cao ShuFeng --- fs/coredump.c | 103 +++- include/linux/binfmts.h | 1 + 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index ceb0ee8..8ea8bc1 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -501,6 +501,23 @@ static void wait_for_dump_helpers(struct file *file) pipe_unlock(pipe); } +static int umh_ns_setup(struct subprocess_info *info) +{ + struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task = cp->base_task; + + if (base_task) { + /* Set namespaces to base_task */ + get_nsproxy(base_task->nsproxy); + switch_task_namespaces(current, base_task->nsproxy); + + /* Return -EAGAIN to notice caller to refork */ + return -EAGAIN; + } + + return 0; +} + /* * umh_pipe_setup * helper function to customize the process used @@ -516,6 +533,8 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; struct coredump_params *cp = (struct coredump_params *)info->data; + struct task_struct *base_task; + int err = create_pipe_files(files, 0); if (err) return err; @@ -524,10 +543,76 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) err = replace_fd(0, files[0], 0); fput(files[0]); + if (err) + return err; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return err; + base_task = cp->base_task; + if (base_task) { + const struct cred *base_cred; + + /* Set fs_root to base_task */ + spin_lock(&base_task->fs->lock); + set_fs_root(current->fs, &base_task->fs->root); + set_fs_pwd(current->fs, &base_task->fs->pwd); + spin_unlock(&base_task->fs->lock); + + /* Set cgroup to base_task */ + current->flags &= ~PF_NO_SETAFFINITY; + err = cgroup_attach_task_all(base_task, current); + if (err < 0) + return err; + + /* Set cred to base_task */ + base_cred = get_task_cred(base_task); + + new->uid = base_cred->uid; 
+ new->gid = base_cred->gid; + new->suid = base_cred->suid; + new->sgid = base_cred->sgid; + new->euid = base_cred->euid; + new->egid = base_cred->egid; + new->fsuid = base_cred->fsuid; + new->fsgid = base_cred->fsgid; + + new->securebits = base_cred->securebits; + + new->cap_inheritable = base_cred->cap_inheritable; + new->cap_permitted = base_cred->cap_permitted; + new->cap_effective = base_cred->cap_effective; + new->cap_bset= base_cred->cap_bset; + new->cap_ambient = base_cred->cap_ambient; + + security_cred_free(new); +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + err = security_prepare_creds(new, base_cred, GFP_KERNEL); + if (err < 0) { + put_cred(base_cred); + return err; + } + + free_uid(new->user); + new->user = base_cred->user; + get_uid(new->user); + + put_user_ns(new->user_ns); +
[PATCH v3.1 1/3] Make call_usermodehelper_exec possible to set pid namespace
From: Zhao Lei Currently, call_usermodehelper_exec() cannot set the pid namespace for the executed program, because an additional fork is needed to make the pid namespace active. This patch adds that capability to call_usermodehelper_exec(). When the init_intermediate callback returns -EAGAIN, the usermodehelper forks again to make the pid namespace active, and runs the program in the child process. This function is helpful for coredump to run pipe_program in a specific container environment. Signed-off-by: Zhao Lei --- fs/coredump.c | 3 +- include/linux/kmod.h| 2 + init/do_mounts_initrd.c | 3 +- kernel/kmod.c | 133 ++-- lib/kobject_uevent.c| 3 +- security/keys/request_key.c | 4 +- 6 files changed, 127 insertions(+), 21 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 281b768..ceb0ee8 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -641,7 +641,8 @@ void do_coredump(const siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); + NULL, umh_pipe_setup, + NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/include/linux/kmod.h b/include/linux/kmod.h index fcfd2bf..8fb8c0e 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -61,6 +61,7 @@ struct subprocess_info { char **envp; int wait; int retval; + int (*init_intermediate)(struct subprocess_info *info); int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; @@ -71,6 +72,7 @@ call_usermodehelper(char *path, char **argv, char **envp, int wait); extern struct subprocess_info * call_usermodehelper_setup(char *path, char **argv, char **envp, gfp_t gfp_mask, + int (*init_intermediate)(struct subprocess_info *info), int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data); diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index a1000ca..bb5dce5
100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -72,7 +72,8 @@ static void __init handle_initrd(void) current->flags |= PF_FREEZER_SKIP; info = call_usermodehelper_setup("/linuxrc", argv, envp_init, -GFP_KERNEL, init_linuxrc, NULL, NULL); +GFP_KERNEL, NULL, init_linuxrc, NULL, +NULL); if (!info) return; call_usermodehelper_exec(info, UMH_WAIT_PROC); diff --git a/kernel/kmod.c b/kernel/kmod.c index 0277d12..30a5802 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -91,7 +91,7 @@ static int call_modprobe(char *module_name, int wait) argv[4] = NULL; info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, -NULL, free_modprobe_argv, NULL); +NULL, NULL, free_modprobe_argv, NULL); if (!info) goto free_module_name; @@ -209,14 +209,11 @@ static void umh_complete(struct subprocess_info *sub_info) call_usermodehelper_freeinfo(sub_info); } -/* - * This is the task which runs the usermode application - */ -static int call_usermodehelper_exec_async(void *data) +static int __call_usermodehelper_exec_doexec(void *data) { struct subprocess_info *sub_info = data; struct cred *new; - int retval; + int retval = 0; spin_lock_irq(¤t->sighand->siglock); flush_signal_handlers(current, 1); @@ -228,10 +225,11 @@ static int call_usermodehelper_exec_async(void *data) */ set_user_nice(current, 0); - retval = -ENOMEM; new = prepare_kernel_cred(current); - if (!new) + if (!new) { + retval = -ENOMEM; goto out; + } spin_lock(&umh_sysctl_lock); new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); @@ -248,20 +246,121 @@ static int call_usermodehelper_exec_async(void *data) } commit_creds(new); - retval = do_execve(getname_kernel(sub_info->path), - (const char __user *const __user *)sub_info->argv, - (const char __user *const __user *)sub_info->envp); + (const char __user *const __user *)sub_info->argv, + (const char __u