On 11/24/25 12:20, Pavel Tikhomirov wrote:
...
+static struct ve_namespace *clone_ve_ns(struct user_namespace *user_ns,
+                                       struct ve_namespace *old_ns)
+{
+       struct ve_namespace *ns;
+       struct ucounts *ucounts;
+       int err;
+
+       ucounts = inc_ve_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
+       err = -ENOMEM;
+       ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
+       if (!ns)
+               goto err_dec_ucount;
+
+       refcount_set(&ns->ns.count, 1);
+
+       err = ns_alloc_inum(&ns->ns);
+       if (err)
+               goto err_free_ns;
+
+       ns->ucounts = ucounts;
+       ns->ns.ops = &ve_ns_operations;
+       ns->user_ns = get_user_ns(user_ns);
+
+       /*
+        * VE namespace links to current ve cgroup
+        * FIXME it should be a 1:1 link
+        */
+       ns->ve = get_ve(css_to_ve(current->cgroups->subsys[ve_cgrp_id]));

  Complete Scenario: When `current->cgroups->subsys[ve_cgrp_id]` Can Be NULL


  Step 1: Creating a cgroup hierarchy WITHOUT VE subsystem


     1 │# Administrator creates a new cgroup hierarchy without VE subsystem
     2 │mkdir /sys/fs/cgroup/myhierarchy
     3 │mount -t cgroup2 none /sys/fs/cgroup/myhierarchy
     4 │# VE subsystem is NOT enabled in this hierarchy

  Result: (root->subsys_mask & (1UL << ve_cgrp_id)) == 0

  Step 2: Process migrates into this cgroup


     1 │// kernel/cgroup/cgroup.c:find_existing_css_set()
     2 │static struct css_set *find_existing_css_set(...)
     3 │{
     4 │    for_each_subsys(ss, i) {
     5 │        if (root->subsys_mask & (1UL << i)) {
     6 │            // VE subsystem is NOT enabled, this block does NOT execute
     7 │            template[i] = cgroup_e_css_by_mask(cgrp, ss);
     8 │        } else {
     9 │            // Uses old css from old_cset
    10 │            template[i] = old_cset->subsys[i];
    11 │        }
    12 │    }
    13 │}

  If old_cset->subsys[ve_cgrp_id] was already NULL (e.g., the process was
  created in this hierarchy), then template[ve_cgrp_id] = NULL.

  Step 3: Creating a new css_set


     1 │// kernel/cgroup/cgroup.c:find_css_set() (line 1257)
     2 │cset = kzalloc(sizeof(*cset), GFP_KERNEL);  // Zeroes memory
     3 │// ...
     4 │memcpy(cset->subsys, template, sizeof(cset->subsys));  // Copies template

  Result: cset->subsys[ve_cgrp_id] = NULL

  Step 4: Process receives this css_set


     1 │// Process now has:
     2 │current->cgroups = cset;  // where cset->subsys[ve_cgrp_id] == NULL


  Step 5: Process calls `clone()` with `CLONE_NEWVE`


     1 │// Userspace:
     2 │pid = clone(child_func, stack, CLONE_NEWVE | SIGCHLD, NULL);


  Step 6: Kernel calls `copy_ve_ns()`


     1 │// kernel/fork.c:copy_process() (line 2391)
     2 │retval = copy_ve_ns(clone_flags, p);


     1 │// kernel/ve/ve_namespace.c:copy_ve_ns() (line 67)
     2 │int copy_ve_ns(unsigned long flags, struct task_struct *p)
     3 │{
     4 │    // ...
     5 │    if (!(flags & CLONE_NEWVE)) {
     6 │        get_ve_ns(old_ve_ns);
     7 │        return 0;
     8 │    }
     9 │
    10 │    if (!ns_capable(user_ns, CAP_SYS_ADMIN))
    11 │        return -EPERM;
    12 │
    13 │    new_ve_ns = clone_ve_ns(user_ns, p->ve_ns);  // <-- Call
    14 │    // ...
    15 │}


  Step 7: `clone_ve_ns()` reads the NULL css


     1 │// kernel/ve/ve_namespace.c:clone_ve_ns() (line 57)
     2 │ns->ve = get_ve(css_to_ve(current->cgroups->subsys[ve_cgrp_id]));
     3 │//                                      ^^^^^^^^^^^^^^^^^^^^^^^^^
     4 │//                                      THIS IS NULL!


  Step 8: Call breakdown


     1 │// include/linux/ve.h:css_to_ve() (line 198)
     2 │static inline struct ve_struct *css_to_ve(struct cgroup_subsys_state *css)
     3 │{
     4 │    return css ? container_of(css, struct ve_struct, css) : NULL;
     5 │    //     ^^^^
     6 │    //     css == NULL, returns NULL
     7 │}
     8 │
     9 │// kernel/ve/ve.c:get_ve() (line 115)
    10 │struct ve_struct *get_ve(struct ve_struct *ve)
    11 │{
    12 │    if (ve)  // ve == NULL, this block does NOT execute
    13 │        css_get(&ve->css);
    14 │    return ve;  // Returns NULL
    15 │}

  Result: ns->ve = NULL — this is a problem.
  ---

  When This Can Happen

  1. Process created in a cgroup hierarchy without VE subsystem:


     1 │   // When creating a process in such hierarchy:
     2 │   // init_css_set may not have ve_cgrp_id if VE subsystem was not initialized
     3 │   // or was disabled in this hierarchy

  2. Process migrated to a cgroup without VE subsystem:


     1 │   # Process was in VE cgroup, then migrated:
     2 │   echo $PID > /sys/fs/cgroup/myhierarchy/cgroup.procs
     3 │   # where myhierarchy does not have VE subsystem

  3. VE subsystem disabled in hierarchy:


     1 │   // kernel/cgroup/cgroup.c:cgroup_apply_control()
     2 │   // VE subsystem can be disabled dynamically
     3 │   root->subsys_mask &= ~(1 << ve_cgrp_id);

  ---

  Why This Is a Problem

  1. ns->ve = NULL — violates structure invariant.
  2. Subsequent accesses to ns->ve can cause NULL pointer dereference.
  3. Inconsistency: the process has a task_ve (via get_exec_env()), but its
     namespace is not linked to a VE.

  ---

+
+       return ns;
+err_free_ns:
+       kfree(ns);
+err_dec_ucount:
+       dec_ve_namespaces(ucounts);
+       return ERR_PTR(err);
+}
...
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to