On 2026/1/10 9:32, Waiman Long wrote:
> As stated in commit 1c09b195d37f ("cpuset: fix a regression in validating
> config change"), it is not allowed to clear masks of a cpuset if
> there're tasks in it. This is specific to v1 since empty "cpuset.cpus"
> or "cpuset.mems" will cause the v2 cpuset to inherit the effective CPUs
> or memory nodes from its parent. So it is OK to have empty cpus or mems
> even if there are tasks in the cpuset.
> 
> Move this empty cpus/mems check in validate_change() to
> cpuset1_validate_change() to allow more flexibility in setting
> cpus or mems in v2. cpuset_is_populated() needs to be moved into
> cpuset-internal.h as it is needed by the empty cpus/mems checking code.
> 
> Also add a test case to test_cpuset_prs.sh to verify that.
> 
> Reported-by: Chen Ridong <[email protected]>
> Closes: 
> https://lore.kernel.org/lkml/[email protected]/
> Signed-off-by: Waiman Long <[email protected]>
> ---
>  kernel/cgroup/cpuset-internal.h               |  9 ++++++++
>  kernel/cgroup/cpuset-v1.c                     | 14 +++++++++++
>  kernel/cgroup/cpuset.c                        | 23 -------------------
>  .../selftests/cgroup/test_cpuset_prs.sh       |  3 +++
>  4 files changed, 26 insertions(+), 23 deletions(-)
> 
> diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
> index e8e2683cb067..fd7d19842ded 100644
> --- a/kernel/cgroup/cpuset-internal.h
> +++ b/kernel/cgroup/cpuset-internal.h
> @@ -260,6 +260,15 @@ static inline int nr_cpusets(void)
>       return static_key_count(&cpusets_enabled_key.key) + 1;
>  }
>  
> +static inline bool cpuset_is_populated(struct cpuset *cs)
> +{
> +     lockdep_assert_cpuset_lock_held();
> +
> +     /* Cpusets in the process of attaching should be considered as 
> populated */
> +     return cgroup_is_populated(cs->css.cgroup) ||
> +             cs->attach_in_progress;
> +}
> +
>  /**
>   * cpuset_for_each_child - traverse online children of a cpuset
>   * @child_cs: loop cursor pointing to the current child
> diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
> index 04124c38a774..7a23b9e8778f 100644
> --- a/kernel/cgroup/cpuset-v1.c
> +++ b/kernel/cgroup/cpuset-v1.c
> @@ -368,6 +368,20 @@ int cpuset1_validate_change(struct cpuset *cur, struct 
> cpuset *trial)
>       if (par && !is_cpuset_subset(trial, par))
>               goto out;
>  
> +     /*
> +      * Cpusets with tasks - existing or newly being attached - can't
> +      * be changed to have empty cpus_allowed or mems_allowed.
> +      */
> +     ret = -ENOSPC;
> +     if (cpuset_is_populated(cur)) {
> +             if (!cpumask_empty(cur->cpus_allowed) &&
> +                 cpumask_empty(trial->cpus_allowed))
> +                     goto out;
> +             if (!nodes_empty(cur->mems_allowed) &&
> +                 nodes_empty(trial->mems_allowed))
> +                     goto out;
> +     }
> +
>       ret = 0;
>  out:
>       return ret;

The current implementation is sufficient.

However, I suggest we fully separate the validation logic for v1 and v2. While this
may introduce some code duplication, it would likely be minimal, and it would allow
the validate_change() logic for v2 to evolve in the future without having to account
for v1 compatibility. Given that v1 is unlikely to see further changes, this
separation would be a practical long-term decision. For example:

@@ -368,6 +368,48 @@ int cpuset1_validate_change(struct cpuset *cur, struct 
cpuset *trial)
        if (par && !is_cpuset_subset(trial, par))
                goto out;

+       /*
+        * Cpusets with tasks - existing or newly being attached - can't
+        * be changed to have empty cpus_allowed or mems_allowed.
+        */
+       ret = -ENOSPC;
+       if (cpuset_is_populated(cur)) {
+               if (!cpumask_empty(cur->cpus_allowed) &&
+                   cpumask_empty(trial->cpus_allowed))
+                       goto out;
+               if (!nodes_empty(cur->mems_allowed) &&
+                   nodes_empty(trial->mems_allowed))
+                       goto out;
+       }
+
+       /*
+        * We can't shrink if we won't have enough room for SCHED_DEADLINE
+        * tasks. This check is not done when scheduling is disabled as the
+        * users should know what they are doing.
+        *
+        * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
+        * cpus_allowed.
+        */
+       ret = -EBUSY;
+       if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
+           !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
+               goto out;
+
+       /*
+        * If either I or some sibling (!= me) is exclusive, we can't
+        * overlap. Exclusive CPU sets cannot overlap with one another
+        * when set.
+        */
+       ret = -EINVAL;
+       cpuset_for_each_child(c, css, par) {
+               if (c == cur)
+                       continue;
+               if (cpuset1_cpus_excl_conflict(trial, c))
+                       goto out;
+               if (mems_excl_conflict(trial, c))
+                       goto out;
+       }
+
        ret = 0;
 out:
        return ret;

> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 83fb83a86b4b..a3dbca125588 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -370,15 +370,6 @@ static inline bool is_in_v2_mode(void)
>             (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
>  }
>  
> -static inline bool cpuset_is_populated(struct cpuset *cs)
> -{
> -     lockdep_assert_held(&cpuset_mutex);
> -
> -     /* Cpusets in the process of attaching should be considered as 
> populated */
> -     return cgroup_is_populated(cs->css.cgroup) ||
> -             cs->attach_in_progress;
> -}
> -
>  /**
>   * partition_is_populated - check if partition has tasks
>   * @cs: partition root to be checked
> @@ -695,20 +686,6 @@ static int validate_change(struct cpuset *cur, struct 
> cpuset *trial)
>  
>       par = parent_cs(cur);
>  
> -     /*
> -      * Cpusets with tasks - existing or newly being attached - can't
> -      * be changed to have empty cpus_allowed or mems_allowed.
> -      */
> -     ret = -ENOSPC;
> -     if (cpuset_is_populated(cur)) {
> -             if (!cpumask_empty(cur->cpus_allowed) &&
> -                 cpumask_empty(trial->cpus_allowed))
> -                     goto out;
> -             if (!nodes_empty(cur->mems_allowed) &&
> -                 nodes_empty(trial->mems_allowed))
> -                     goto out;
> -     }
> -
>       /*
>        * We can't shrink if we won't have enough room for SCHED_DEADLINE
>        * tasks. This check is not done when scheduling is disabled as the
> diff --git a/tools/testing/selftests/cgroup/test_cpuset_prs.sh 
> b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
> index ff4540b0490e..5dff3ad53867 100755
> --- a/tools/testing/selftests/cgroup/test_cpuset_prs.sh
> +++ b/tools/testing/selftests/cgroup/test_cpuset_prs.sh
> @@ -425,6 +425,9 @@ TEST_MATRIX=(
>       # cpuset.cpus can be set to a subset of sibling's cpuset.cpus.exclusive
>       " C1-3:X1-3  .      .    C4-5      .     .      .     C1-2   0 
> A1:1-3|B1:1-2"
>  
> +     # cpuset.cpus can become empty with task in it as it inherits parent's 
> effective CPUs
> +     " C1-3:S+   C2      .      .       .    T:C     .      .     0 
> A1:1-3|A2:1-3"
> +
>       #  old-A1 old-A2 old-A3 old-B1 new-A1 new-A2 new-A3 new-B1 fail ECPUs 
> Pstate ISOLCPUS
>       #  ------ ------ ------ ------ ------ ------ ------ ------ ---- ----- 
> ------ --------
>       # Failure cases:

-- 
Best regards,
Ridong


Reply via email to