The main idea is to make the ve of a task (->task_ve) not directly tied to the task's cgroup. In cgroup-v2, to enable the ve controller for some cgroup we would also need to enable it for all ancestor cgroups (and for the siblings of all ancestors too). So the ve controller would end up enabled for "/machine.slice", "/user.slice" and so on, and we don't really want to consider those as separate VEs (containers). To mitigate this, we can link a ve cgroup to a ve namespace to indicate that it is a real VE (container).
Another idea behind adding this namespace is to extend the ve lifetime so that it overlaps the lifetime of all ve processes. This way we can generalize ->task_ve accesses all over the code base. Note: Regular nsproxy-like namespaces get disconnected from the task in exit_task_namespaces() quite early, even though the task can live for a long time after that (e.g. in zombie state). Note: The same is true of cgroups: they are disconnected from processes in cgroup_exit(), at almost the same time as nsproxy-like namespaces. The ve namespace is somewhat similar to other non-hierarchical namespaces: you can clone3(), unshare() or setns() to it. (The old clone() syscall is not supported because no flag bits are available.) It takes a reference to the current ve through its css. Note that this reference does not prevent removal of the cgroup directory; it only keeps the ve_struct and the css under it alive. https://virtuozzo.atlassian.net/browse/VSTOR-118289 Signed-off-by: Pavel Tikhomirov <[email protected]> Feature: ve: ve generic structures --- fs/proc/namespaces.c | 3 + include/linux/init_task.h | 1 + include/linux/nsproxy.h | 2 + include/linux/proc_ns.h | 2 + include/linux/sched.h | 4 +- include/linux/user_namespace.h | 1 + include/linux/ve_namespace.h | 81 +++++++++++++ include/uapi/linux/sched.h | 1 + init/init_task.c | 3 + kernel/exit.c | 2 + kernel/fork.c | 24 +++- kernel/nsproxy.c | 39 +++++- kernel/ucount.c | 1 + kernel/ve/Makefile | 2 +- kernel/ve/ve_namespace.c | 209 +++++++++++++++++++++++++++++++++ 15 files changed, 369 insertions(+), 6 deletions(-) create mode 100644 include/linux/ve_namespace.h create mode 100644 kernel/ve/ve_namespace.c diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 8e159fc78c0a1..a9c20d2fca964 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -37,6 +37,9 @@ static const struct proc_ns_operations *ns_entries[] = { &timens_operations, &timens_for_children_operations, #endif +#ifdef CONFIG_VE + &ve_ns_operations, +#endif }; static const char *proc_ns_get_link(struct dentry *dentry, diff --git
a/include/linux/init_task.h b/include/linux/init_task.h index bccb3f1f62621..a30d9d256a061 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -10,6 +10,7 @@ #include <linux/ipc.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> +#include <linux/ve_namespace.h> #include <linux/securebits.h> #include <linux/seqlock.h> #include <linux/rbtree.h> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a226..4dc0fe036bb9c 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -11,6 +11,7 @@ struct uts_namespace; struct ipc_namespace; struct pid_namespace; struct cgroup_namespace; +struct ve_namespace; struct fs_struct; /* @@ -67,6 +68,7 @@ struct nsset { struct nsproxy *nsproxy; struct fs_struct *fs; const struct cred *cred; + struct ve_namespace *ve_ns; }; static inline struct cred *nsset_cred(struct nsset *set) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 5ea470eb4d768..6f24ea3bee59f 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -34,6 +34,7 @@ extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; extern const struct proc_ns_operations timens_operations; extern const struct proc_ns_operations timens_for_children_operations; +extern const struct proc_ns_operations ve_ns_operations; /* * We always define these enumerators @@ -46,6 +47,7 @@ enum { PROC_PID_INIT_INO = 0xEFFFFFFCU, PROC_CGROUP_INIT_INO = 0xEFFFFFFBU, PROC_TIME_INIT_INO = 0xEFFFFFFAU, + PROC_VE_INIT_INO = 0xEFFFFFF9U, }; #ifdef CONFIG_PROC_FS diff --git a/include/linux/sched.h b/include/linux/sched.h index b1be67e682255..0f7892c449d21 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -82,6 +82,7 @@ struct signal_struct; struct task_delay_info; struct task_group; struct ve_struct; +struct ve_namespace; struct task_struct; struct user_event_mm; struct kernel_cpustat; @@ -1030,7 +1031,8 @@ struct task_struct { 
unsigned in_thrashing:1; #endif #ifdef CONFIG_VE - struct ve_struct *task_ve; + struct ve_struct *task_ve; + struct ve_namespace *ve_ns; #endif #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 7af87135fe2f9..afffe09ed2115 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -51,6 +51,7 @@ enum ucount_type { UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, + UCOUNT_VE_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, diff --git a/include/linux/ve_namespace.h b/include/linux/ve_namespace.h new file mode 100644 index 0000000000000..7a16669d7bfcd --- /dev/null +++ b/include/linux/ve_namespace.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VE_NAMESPACE_H +#define _LINUX_VE_NAMESPACE_H + +#include <linux/sched.h> +#include <linux/ns_common.h> +#include <linux/user_namespace.h> +#include <linux/container_of.h> +#include <linux/ve.h> + +struct user_namespace; + +struct ve_namespace { + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct ns_common ns; + struct ve_struct *ve; +} __randomize_layout; + +extern struct ve_namespace init_ve_ns; + +#ifdef CONFIG_VE +int copy_ve_ns(unsigned long flags, struct task_struct *p); +int unshare_ve_namespace(unsigned long flags, struct ve_namespace **new_ve_ns, + struct cred *new_cred); +void free_ve_ns(struct ve_namespace *ns); +void switch_ve_namespace(struct task_struct *p, struct ve_namespace *new); +void exit_ve_namespace(struct task_struct *p); +struct ve_namespace *get_ve_ns(struct ve_namespace *ns); +void put_ve_ns(struct ve_namespace *ns); + +static inline struct ve_namespace *to_ve_ns(struct ns_common *ns) +{ + return container_of(ns, struct ve_namespace, ns); +} +#else +static inline int copy_ve_ns(unsigned long flags, struct task_struct *p) +{ + if (flags & CLONE_NEWVE) + return -EINVAL; + return 0; +} + 
+static inline int unshare_ve_namespace(unsigned long flags, + struct ve_namespace **new_ve_ns, + struct cred *new_cred) +{ + if (flags & CLONE_NEWVE) + return -EINVAL; + return 0; +} + +static inline void free_ve_ns(struct ve_namespace *ns) +{ +} + +static inline void switch_ve_namespace(struct task_struct *p, + struct ve_namespace *new) +{ +} + +static inline void exit_ve_namespace(struct task_struct *p) +{ +} + +static inline struct ve_namespace *get_ve_ns(struct ve_namespace *ns) +{ + return ns; +} + +static inline void put_ve_ns(struct ve_namespace *ns) +{ +} + +static inline struct ve_namespace *to_ve_ns(struct ns_common *ns) +{ + return NULL; +} +#endif + +#endif /* _LINUX_VE_NAMESPACE_H */ diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 359a14cc76a40..064a72b3285b6 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -42,6 +42,7 @@ * syscalls only: */ #define CLONE_NEWTIME 0x00000080 /* New time namespace */ +#define CLONE_NEWVE 0x00000040 /* New VE namespace */ #ifndef __ASSEMBLY__ /** diff --git a/init/init_task.c b/init/init_task.c index 9283a3a7fb493..e31c9297d2b3a 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -119,6 +119,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { }, #endif INIT_TASK_VE(tsk) +#ifdef CONFIG_VE + .ve_ns = &init_ve_ns, +#endif .ptraced = LIST_HEAD_INIT(init_task.ptraced), .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), .real_parent = &init_task, diff --git a/kernel/exit.c b/kernel/exit.c index 572887ad951f6..f5b40a392988e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -30,6 +30,7 @@ #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> +#include <linux/ve_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> @@ -288,6 +289,7 @@ void release_task(struct task_struct *p) proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); + exit_ve_namespace(p);
put_task_struct_rcu_user(p); p = leader; diff --git a/kernel/fork.c b/kernel/fork.c index f215fd4beec86..cfafff15bb856 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,7 @@ #include <linux/io_uring.h> #include <linux/bpf.h> #include <linux/ve.h> +#include <linux/ve_namespace.h> #include <linux/stackprotector.h> #include <linux/user_events.h> #include <linux/iommu.h> @@ -2387,9 +2388,12 @@ __latent_entropy struct task_struct *copy_process( retval = copy_namespaces(clone_flags, p); if (retval) goto bad_fork_cleanup_mm; - retval = copy_io(clone_flags, p); + retval = copy_ve_ns(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; + retval = copy_io(clone_flags, p); + if (retval) + goto bad_fork_cleanup_ve_namespace; retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io; @@ -2644,6 +2648,8 @@ __latent_entropy struct task_struct *copy_process( bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); +bad_fork_cleanup_ve_namespace: + exit_ve_namespace(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: @@ -3206,7 +3212,7 @@ static int check_unshare_flags(unsigned long unshare_flags) CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| - CLONE_NEWTIME)) + CLONE_NEWTIME|CLONE_NEWVE)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing @@ -3283,6 +3289,7 @@ int ksys_unshare(unsigned long unshare_flags) struct files_struct *new_fd = NULL; struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; + struct ve_namespace *new_ve_ns = NULL; int do_sysvsem = 0; int err; @@ -3331,17 +3338,25 @@ int ksys_unshare(unsigned long unshare_flags) new_cred, new_fs); if (err) goto bad_unshare_cleanup_cred; + err = unshare_ve_namespace(unshare_flags, &new_ve_ns, new_cred); + if (err) { + if (new_nsproxy) + free_nsproxy(new_nsproxy); + goto bad_unshare_cleanup_cred; + } if (new_cred) { err = set_cred_ucounts(new_cred); if (err) 
{ if (new_nsproxy) free_nsproxy(new_nsproxy); + if (new_ve_ns) + free_ve_ns(new_ve_ns); goto bad_unshare_cleanup_cred; } } - if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy || new_ve_ns) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -3357,6 +3372,9 @@ int ksys_unshare(unsigned long unshare_flags) if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); + if (new_ve_ns) + switch_ve_namespace(current, new_ve_ns); + task_lock(current); if (new_fs) { diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index dc952c3b05afd..ccf53bd9579d6 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -19,6 +19,7 @@ #include <net/net_namespace.h> #include <linux/ipc_namespace.h> #include <linux/time_namespace.h> +#include <linux/ve_namespace.h> #include <linux/fs_struct.h> #include <linux/proc_fs.h> #include <linux/proc_ns.h> @@ -277,7 +278,7 @@ static int check_setns_flags(unsigned long flags) { if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER | - CLONE_NEWPID | CLONE_NEWCGROUP))) + CLONE_NEWPID | CLONE_NEWCGROUP | CLONE_NEWVE))) return -EINVAL; #ifndef CONFIG_USER_NS @@ -308,6 +309,10 @@ static int check_setns_flags(unsigned long flags) if (flags & CLONE_NEWTIME) return -EINVAL; #endif +#ifndef CONFIG_VE + if (flags & CLONE_NEWVE) + return -EINVAL; +#endif return 0; } @@ -326,6 +331,8 @@ static void put_nsset(struct nsset *nsset) free_fs_struct(nsset->fs); if (nsset->nsproxy) free_nsproxy(nsset->nsproxy); + if (nsset->ve_ns) + put_ve_ns(nsset->ve_ns); } static int prepare_nsset(unsigned flags, struct nsset *nsset) @@ -378,6 +385,7 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid) unsigned flags = nsset->flags; struct user_namespace *user_ns = NULL; struct pid_namespace *pid_ns = NULL; + struct ve_namespace *ve_ns = NULL; struct nsproxy *nsp; struct task_struct *tsk; @@ -419,6 +427,20 @@ 
static int validate_nsset(struct nsset *nsset, struct pid *pid) #ifdef CONFIG_USER_NS if (flags & CLONE_NEWUSER) user_ns = get_user_ns(__task_cred(tsk)->user_ns); +#endif +#ifdef CONFIG_VE + if (flags & CLONE_NEWVE) { + task_lock(tsk); + ve_ns = tsk->ve_ns; + if (ve_ns) + get_ve_ns(ve_ns); + task_unlock(tsk); + if (!ve_ns) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + } #endif rcu_read_unlock(); @@ -490,7 +512,17 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid) } #endif +#ifdef CONFIG_VE + if (flags & CLONE_NEWVE) { + ret = validate_ns(nsset, &ve_ns->ns); + if (ret) + goto out; + } +#endif + out: + if (ve_ns) + put_ve_ns(ve_ns); if (pid_ns) put_pid_ns(pid_ns); if (nsp) @@ -541,6 +573,11 @@ static void commit_nsset(struct nsset *nsset) /* transfer ownership */ switch_task_namespaces(me, nsset->nsproxy); nsset->nsproxy = NULL; + + if (flags & CLONE_NEWVE) { + switch_ve_namespace(me, nsset->ve_ns); + nsset->ve_ns = NULL; + } } SYSCALL_DEFINE2(setns, int, fd, int, flags) diff --git a/kernel/ucount.c b/kernel/ucount.c index 696406939be55..2e5471839d588 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -79,6 +79,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_mnt_namespaces"), UCOUNT_ENTRY("max_cgroup_namespaces"), UCOUNT_ENTRY("max_time_namespaces"), + UCOUNT_ENTRY("max_ve_namespaces"), #ifdef CONFIG_INOTIFY_USER UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), diff --git a/kernel/ve/Makefile b/kernel/ve/Makefile index 6219357803fb6..d716f669755c5 100644 --- a/kernel/ve/Makefile +++ b/kernel/ve/Makefile @@ -6,7 +6,7 @@ # Copyright (c) 2017-2021 Virtuozzo International GmbH. All rights reserved. 
# -obj-$(CONFIG_VE) = ve.o hooks.o veowner.o vzstat_core.o +obj-$(CONFIG_VE) = ve.o ve_namespace.o hooks.o veowner.o vzstat_core.o obj-$(CONFIG_VZ_DEV) += vzdev.o obj-$(CONFIG_VZ_EVENT) += vzevent.o diff --git a/kernel/ve/ve_namespace.c b/kernel/ve/ve_namespace.c new file mode 100644 index 0000000000000..e5e5f540205ed --- /dev/null +++ b/kernel/ve/ve_namespace.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * VE Namespace implementation + * + * Copyright (c) 2025 Virtuozzo International GmbH. All rights reserved. + */ + +#include <linux/ve_namespace.h> +#include <linux/user_namespace.h> +#include <linux/proc_ns.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/cgroup.h> +#include <linux/ve.h> + +static struct ucounts *inc_ve_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_VE_NAMESPACES); +} + +static void dec_ve_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_VE_NAMESPACES); +} + +static struct ve_namespace *clone_ve_ns(struct user_namespace *user_ns, + struct ve_namespace *old_ns) +{ + struct ve_namespace *ns; + struct ucounts *ucounts; + int err; + + ucounts = inc_ve_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + err = -ENOMEM; + ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + if (!ns) + goto err_dec_ucount; + + refcount_set(&ns->ns.count, 1); + + err = ns_alloc_inum(&ns->ns); + if (err) + goto err_free_ns; + + ns->ucounts = ucounts; + ns->ns.ops = &ve_ns_operations; + ns->user_ns = get_user_ns(user_ns); + + /* + * VE namespace links to current ve cgroup; css is pinned by task_get_css() (assumes put_ve() == css_put(), TODO confirm) + * FIXME it should be a 1:1 link + */ + ns->ve = css_to_ve(task_get_css(current, ve_cgrp_id)); + + return ns; +err_free_ns: + kfree(ns); +err_dec_ucount: + dec_ve_namespaces(ucounts); + return ERR_PTR(err); +} + +int copy_ve_ns(unsigned long flags, struct task_struct *p) +{ + struct user_namespace *user_ns = task_cred_xxx(p, user_ns); + struct ve_namespace
*old_ve_ns = p->ve_ns; + struct ve_namespace *new_ve_ns; + + if (!(flags & CLONE_NEWVE)) { + get_ve_ns(old_ve_ns); + return 0; + } + + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + new_ve_ns = clone_ve_ns(user_ns, p->ve_ns); + if (IS_ERR(new_ve_ns)) + return PTR_ERR(new_ve_ns); + + p->ve_ns = new_ve_ns; + return 0; +} + +int unshare_ve_namespace(unsigned long flags, struct ve_namespace **new_ve_ns, + struct cred *new_cred) +{ + struct user_namespace *user_ns; + + if (!(flags & CLONE_NEWVE)) + return 0; + + user_ns = new_cred ? new_cred->user_ns : current_user_ns(); + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + *new_ve_ns = clone_ve_ns(user_ns, current->ve_ns); + if (IS_ERR(*new_ve_ns)) + return PTR_ERR(*new_ve_ns); + + return 0; +} + +void free_ve_ns(struct ve_namespace *ns) +{ + struct ucounts *ucounts = ns->ucounts; + + put_ve(ns->ve); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); + dec_ve_namespaces(ucounts); +} + +void switch_ve_namespace(struct task_struct *p, struct ve_namespace *new) +{ + struct ve_namespace *old; + + /* + * FIXME + * ve_ns is switched last, so we can verify all other namespaces of + * task p conform to the ve_ns + */ + + task_lock(p); + old = p->ve_ns; + p->ve_ns = new; + task_unlock(p); + + if (old) + put_ve_ns(old); +} + +void exit_ve_namespace(struct task_struct *p) +{ + switch_ve_namespace(p, NULL); +} + +struct ve_namespace *get_ve_ns(struct ve_namespace *ns) +{ + refcount_inc(&ns->ns.count); + return ns; +} + +void put_ve_ns(struct ve_namespace *ns) +{ + if (refcount_dec_and_test(&ns->ns.count)) + free_ve_ns(ns); +} + +static struct ns_common *ve_ns_get(struct task_struct *task) +{ + struct ve_namespace *ns = NULL; + + task_lock(task); + if (task->ve_ns) + ns = get_ve_ns(task->ve_ns); + task_unlock(task); + + return ns ?
&ns->ns : NULL; +} + +static void ve_ns_put(struct ns_common *ns) +{ + put_ve_ns(to_ve_ns(ns)); +} + +static int ve_ns_install(struct nsset *nsset, struct ns_common *new) +{ + struct ve_namespace *ve_ns = to_ve_ns(new); + + if (!ns_capable(ve_ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* + * Don't need to put_ve_ns(nsset->ve_ns) here, + * as at this point it is always zero. + */ + nsset->ve_ns = get_ve_ns(ve_ns); + return 0; +} + +static struct user_namespace *ve_ns_owner(struct ns_common *ns) +{ + return to_ve_ns(ns)->user_ns; +} + +const struct proc_ns_operations ve_ns_operations = { + .name = "ve", + .type = CLONE_NEWVE, + .get = ve_ns_get, + .put = ve_ns_put, + .install = ve_ns_install, + .owner = ve_ns_owner, +}; + +struct ve_namespace init_ve_ns = { + .ns.count = REFCOUNT_INIT(2), + .ve = &ve0, + .user_ns = &init_user_ns, + .ns.inum = PROC_VE_INIT_INO, + .ns.ops = &ve_ns_operations, +}; +EXPORT_SYMBOL(init_ve_ns); -- 2.51.1 _______________________________________________ Devel mailing list [email protected] https://lists.openvz.org/mailman/listinfo/devel
