On 2026/2/7 4:37, Waiman Long wrote:
> Clarify the locking rules associated with file level internal variables
> inside the cpuset code. There is no functional change.
> 
> Signed-off-by: Waiman Long <[email protected]>
> ---
>  kernel/cgroup/cpuset.c | 105 ++++++++++++++++++++++++-----------------
>  1 file changed, 61 insertions(+), 44 deletions(-)
> 
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index c43efef7df71..a4c6386a594d 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -61,6 +61,58 @@ static const char * const perr_strings[] = {
>       [PERR_REMOTE]    = "Have remote partition underneath",
>  };
>  
> +/*
> + * CPUSET Locking Convention
> + * -------------------------
> + *
> + * Below are the three global locks guarding cpuset structures in lock
> + * acquisition order:
> + *  - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock)
> + *  - cpuset_mutex
> + *  - callback_lock (raw spinlock)
> + *
> + * A task must hold all three locks to modify externally visible or used
> + * fields of cpusets, though some of the internally used cpuset fields and
> + * internal variables can be modified without holding callback_lock. If only
> + * reliable read access to the externally used fields is needed, a task can
> + * hold either cpuset_mutex or callback_lock, both of which are exposed to
> + * other external subsystems.
> + *
> + * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others,
> + * ensuring that it is the only task able to also acquire callback_lock and
> + * be able to modify cpusets.  It can perform various checks on the cpuset
> + * structure first, knowing nothing will change. It can also allocate memory
> + * without holding callback_lock. While it is performing these checks, various
> + * callback routines can briefly acquire callback_lock to query cpusets.  Once
> + * it is ready to make the changes, it takes callback_lock, blocking everyone
> + * else.
> + *
> + * Calls to the kernel memory allocator cannot be made while holding
> + * callback_lock which is a spinlock, as the memory allocator may sleep or
> + * call back into cpuset code and acquire callback_lock.
> + *
> + * Now, the task_struct fields mems_allowed and mempolicy may be changed
> + * by other task, we use alloc_lock in the task_struct fields to protect
> + * them.
> + *
> + * The cpuset_common_seq_show() handlers only hold callback_lock across
> + * small pieces of code, such as when reading out possibly multi-word
> + * cpumasks and nodemasks.
> + */
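
The read-side part of this matches what cpuset_common_seq_show() already does
as far as I can tell, i.e. roughly the following (illustrative sketch only,
not taken from the patch; I am using spin_lock_irq() here, adjust to whatever
type callback_lock actually is):

	/*
	 * Hold callback_lock only across the short read of a possibly
	 * multi-word mask.
	 */
	spin_lock_irq(&callback_lock);
	seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
	spin_unlock_irq(&callback_lock);
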
> +
> +static DEFINE_MUTEX(cpuset_mutex);
> +
> +/*
> + * File level internal variables below follow one of the following exclusion
> + * rules.
> + *
> + * RWCS: Read/write-able by holding either cpus_write_lock or both
> + *       cpus_read_lock and cpuset_mutex.
> + *

Does this mean these variables can be read or written while holding only
cpus_write_lock?

I believe that to write cpuset variables, we must hold either (cpus_write_lock
and cpuset_mutex) or (cpus_read_lock and cpuset_mutex).
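
In other words, I would expect a writer of one of these RWCS variables to
look roughly like the sketch below (illustrative only, not taken from the
patch):

	/*
	 * Write side as I understand it: cpuset_mutex is always held for
	 * the write, together with either flavour of the hotplug lock,
	 * taken in the documented acquisition order.
	 */
	cpus_read_lock();		/* or cpus_write_lock() on hotplug paths */
	mutex_lock(&cpuset_mutex);

	force_sd_rebuild = true;	/* an RWCS variable from this patch */

	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();

If cpus_write_lock alone is really meant to be sufficient for writes, it may
be worth spelling that out explicitly.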

> + * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable
> + *    by holding both cpuset_mutex and callback_lock.
> + */
> +
>  /*
>   * For local partitions, update to subpartitions_cpus & isolated_cpus is done
>   * in update_parent_effective_cpumask(). For remote partitions, it is done in
> @@ -70,19 +122,18 @@ static const char * const perr_strings[] = {
>   * Exclusive CPUs distributed out to local or remote sub-partitions of
>   * top_cpuset
>   */
> -static cpumask_var_t subpartitions_cpus;
> +static cpumask_var_t subpartitions_cpus;     /* RWCS */
>  
>  /*
> - * Exclusive CPUs in isolated partitions
> + * Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated)
>   */
> -static cpumask_var_t isolated_cpus;
> +static cpumask_var_t isolated_cpus;          /* CSCB */
>  
>  /*
> - * isolated_cpus updating flag (protected by cpuset_mutex)
> - * Set if isolated_cpus is going to be updated in the current
> - * cpuset_mutex crtical section.
> + * Set if isolated_cpus is being updated in the current cpuset_mutex
> + * critical section.
>   */
> -static bool isolated_cpus_updating;
> +static bool          isolated_cpus_updating; /* RWCS */
>  
>  /*
>   * A flag to force sched domain rebuild at the end of an operation.
> @@ -98,7 +149,7 @@ static bool isolated_cpus_updating;
>   * Note that update_relax_domain_level() in cpuset-v1.c can still call
>   * rebuild_sched_domains_locked() directly without using this flag.
>   */
> -static bool force_sd_rebuild;
> +static bool force_sd_rebuild;                        /* RWCS */
>  
>  /*
>   * Partition root states:
> @@ -218,42 +269,6 @@ struct cpuset top_cpuset = {
>       .partition_root_state = PRS_ROOT,
>  };
>  
> -/*
> - * There are two global locks guarding cpuset structures - cpuset_mutex and
> - * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
> - * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
> - * structures. Note that cpuset_mutex needs to be a mutex as it is used in
> - * paths that rely on priority inheritance (e.g. scheduler - on RT) for
> - * correctness.
> - *
> - * A task must hold both locks to modify cpusets.  If a task holds
> - * cpuset_mutex, it blocks others, ensuring that it is the only task able to
> - * also acquire callback_lock and be able to modify cpusets.  It can perform
> - * various checks on the cpuset structure first, knowing nothing will change.
> - * It can also allocate memory while just holding cpuset_mutex.  While it is
> - * performing these checks, various callback routines can briefly acquire
> - * callback_lock to query cpusets.  Once it is ready to make the changes, it
> - * takes callback_lock, blocking everyone else.
> - *
> - * Calls to the kernel memory allocator can not be made while holding
> - * callback_lock, as that would risk double tripping on callback_lock
> - * from one of the callbacks into the cpuset code from within
> - * __alloc_pages().
> - *
> - * If a task is only holding callback_lock, then it has read-only
> - * access to cpusets.
> - *
> - * Now, the task_struct fields mems_allowed and mempolicy may be changed
> - * by other task, we use alloc_lock in the task_struct fields to protect
> - * them.
> - *
> - * The cpuset_common_seq_show() handlers only hold callback_lock across
> - * small pieces of code, such as when reading out possibly multi-word
> - * cpumasks and nodemasks.
> - */
> -
> -static DEFINE_MUTEX(cpuset_mutex);
> -
>  /**
>   * cpuset_lock - Acquire the global cpuset mutex
>   *
> @@ -1163,6 +1178,8 @@ static void reset_partition_data(struct cpuset *cs)
>  static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
>  {
>       WARN_ON_ONCE(old_prs == new_prs);
> +     lockdep_assert_held(&callback_lock);
> +     lockdep_assert_held(&cpuset_mutex);
>       if (new_prs == PRS_ISOLATED)
>               cpumask_or(isolated_cpus, isolated_cpus, xcpus);
>       else
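
The two lockdep asserts above match the CSCB write rule (both cpuset_mutex
and callback_lock held). For the read side, my understanding of CSCB is that
holding just one of the two locks is enough, e.g. something like the
following (sketch inside cpuset.c, not from the patch):

	/*
	 * Reader of a CSCB variable such as isolated_cpus: holding either
	 * cpuset_mutex or callback_lock alone gives a reliable read.
	 */
	bool isolated;

	mutex_lock(&cpuset_mutex);
	isolated = cpumask_test_cpu(cpu, isolated_cpus);
	mutex_unlock(&cpuset_mutex);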

-- 
Best regards,
Ridong

