On Mon, Jul 06, 2020 at 03:28:38PM +0100, Qais Yousef wrote:

> +static void __uclamp_sync_util_min_rt_default(struct task_struct *p)
> +{
> +     unsigned int default_util_min;
> +     struct uclamp_se *uc_se;
> +
> +     WARN_ON_ONCE(!rcu_read_lock_held());
> +
> +     if (!rt_task(p))
> +             return;
> +
> +     uc_se = &p->uclamp_req[UCLAMP_MIN];
> +
> +     /* Only sync if user didn't override the default */
> +     if (uc_se->user_defined)
> +             return;
> +
> +     /* Sync with smp_wmb() in uclamp_sync_util_min_rt_default() */
> +     smp_rmb();
> +     default_util_min = sysctl_sched_uclamp_util_min_rt_default;
> +     uclamp_se_set(uc_se, default_util_min, false);
> +}
> +
> +static void uclamp_sync_util_min_rt_default(void)
> +{
> +     struct task_struct *g, *p;
> +
> +     /*
> +      * Make sure the updated sysctl_sched_uclamp_util_min_rt_default which
> +      * was just written is synchronized against any future read on another
> +      * cpu.
> +      */
> +     smp_wmb();
> +
> +     /*
> +      * Wait for all updaters to observe the new change.
> +      *
> +      * There are 2 races to deal with here:
> +      *
> +      * 1. fork()->copy_process()
> +      *
> +      *      If a task was concurrently forking, for_each_process_thread()
> +      *      will not see it, hence it could have copied the old value and
> +      *      we missed the opportunity to update it.
> +      *
> +      *      This should be handled by sched_post_fork() where it'll ensure
> +      *      it performs the sync after the fork.
> +      *
> +      * 2. fork()->sched_post_fork()
> +      *    __setscheduler_uclamp()
> +      *
> +      *      Both of these functions could read the old value but then get
> +      *      preempted, during which a user might write new value to
> +      *      sysctl_sched_uclamp_util_min_rt_default.
> +      *
> +      *      // read sysctl_sched_uclamp_util_min_rt_default;
> +      *      // PREEMPT-OUT
> +      *      .
> +      *      .                  <-- sync happens here
> +      *      .
> +      *      // PREEMPT-IN
> +      *      // write p->uclamp_req[UCLAMP_MIN]
> +      *
> +      *      That section is protected with rcu_read_lock(), so
> +      *      synchronize_rcu() will guarantee it has finished before we
> +      *      perform the update. Hence ensure that this sync happens after
> +      *      any concurrent sync which should guarantee correctness.
> +      */
> +     synchronize_rcu();
> +
> +     rcu_read_lock();
> +     for_each_process_thread(g, p)
> +             __uclamp_sync_util_min_rt_default(p);
> +     rcu_read_unlock();
> +}

It's Monday, and I cannot get my brain working.. I cannot decipher the
comments you have with the smp_[rw]mb(); what actual ordering do they
enforce? A wmb/rmb pair only orders two accesses on each side against
one another, and here there is only the one load and the one store.

Also, your synchronize_rcu() relies on write_lock() being
non-preemptible, which isn't true on PREEMPT_RT: there rwlocks are
sleeping locks, so a forking task can be preempted while holding
tasklist_lock, outside of any RCU read-side critical section.
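
That is, as I read it, on RT you can end up with:

	copy_process()                      sysctl_uclamp
	  // copy old uclamp_min_rt           uclamp_min_rt = X;
	  write_lock(&tasklist_lock)
	  <preempted>                         synchronize_rcu(); // doesn't
	                                      // wait, the fork is not an
	                                      // RCU reader here
	                                      for_each_process_thread()
	                                      // child not linked -> missed
	  // link thread
	  write_unlock(&tasklist_lock)
	  sched_post_fork()
	    // nothing orders this load against
	    // the store; can (in theory) still
	    // see the old uclamp_min_rt

and the update is lost for the new task.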

The below seems simpler...

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1013,8 +1013,6 @@ static void __uclamp_sync_util_min_rt_de
        unsigned int default_util_min;
        struct uclamp_se *uc_se;
 
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
        if (!rt_task(p))
                return;
 
@@ -1024,8 +1022,6 @@ static void __uclamp_sync_util_min_rt_de
        if (uc_se->user_defined)
                return;
 
-       /* Sync with smp_wmb() in uclamp_sync_util_min_rt_default() */
-       smp_rmb();
        default_util_min = sysctl_sched_uclamp_util_min_rt_default;
        uclamp_se_set(uc_se, default_util_min, false);
 }
@@ -1035,47 +1031,21 @@ static void uclamp_sync_util_min_rt_defa
        struct task_struct *g, *p;
 
        /*
-        * Make sure the updated sysctl_sched_uclamp_util_min_rt_default which
-        * was just written is synchronized against any future read on another
-        * cpu.
-        */
-       smp_wmb();
-
-       /*
-        * Wait for all updaters to observe the new change.
-        *
-        * There are 2 races to deal with here:
-        *
-        * 1. fork()->copy_process()
-        *
-        *      If a task was concurrently forking, for_each_process_thread()
-        *      will not see it, hence it could have copied the old value and
-        *      we missed the opportunity to update it.
-        *
-        *      This should be handled by sched_post_fork() where it'll ensure
-        *      it performs the sync after the fork.
-        *
-        * 2. fork()->sched_post_fork()
-        *    __setscheduler_uclamp()
-        *
-        *      Both of these functions could read the old value but then get
-        *      preempted, during which a user might write new value to
-        *      sysctl_sched_uclamp_util_min_rt_default.
-        *
-        *      // read sysctl_sched_uclamp_util_min_rt_default;
-        *      // PREEMPT-OUT
-        *      .
-        *      .                  <-- sync happens here
-        *      .
-        *      // PREEMPT-IN
-        *      // write p->uclamp_req[UCLAMP_MIN]
-        *
-        *      That section is protected with rcu_read_lock(), so
-        *      synchronize_rcu() will guarantee it has finished before we
-        *      perform the update. Hence ensure that this sync happens after
-        *      any concurrent sync which should guarantee correctness.
-        */
-       synchronize_rcu();
+        * copy_process()                       sysctl_uclamp
+        *                                        uclamp_min_rt = X;
+        *   write_lock(&tasklist_lock)           read_lock(&tasklist_lock)
+        *   // link thread                       smp_mb__after_spinlock()
+        *   write_unlock(&tasklist_lock)         read_unlock(&tasklist_lock);
+        *   sched_post_fork()                    for_each_process_thread()
+        *     __uclamp_sync_rt()                   __uclamp_sync_rt()
+        *
+        * Ensures that either sched_post_fork() will observe the new
+        * uclamp_min_rt or for_each_process_thread() will observe the new
+        * task.
+        */
+       read_lock(&tasklist_lock);
+       smp_mb__after_spinlock();
+       read_unlock(&tasklist_lock);
 
        rcu_read_lock();
        for_each_process_thread(g, p)
@@ -1408,6 +1378,9 @@ int sysctl_sched_uclamp_handler(struct c
                uclamp_update_root_tg();
        }
 
+       if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default)
+               uclamp_sync_util_min_rt_default();
+
        /*
         * We update all RUNNABLE tasks only when task groups are in use.
         * Otherwise, keep it simple and do just a lazy update at each next
@@ -1466,9 +1439,7 @@ static void __setscheduler_uclamp(struct
                 * at runtime.
                 */
                if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) {
-                       rcu_read_lock();
                        __uclamp_sync_util_min_rt_default(p);
-                       rcu_read_unlock();
                } else {
                        uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
                }
@@ -1521,6 +1492,11 @@ static void __init init_uclamp_rq(struct
        rq->uclamp_flags = 0;
 }
 
+static void uclamp_post_fork(struct task_struct *p)
+{
+       __uclamp_sync_util_min_rt_default(p);
+}
+
 static void __init init_uclamp(void)
 {
        struct uclamp_se uc_max = {};
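
For completeness: the hunk wiring uclamp_post_fork() into
sched_post_fork() isn't shown above; the plumbing would look something
like this (sketch only, including the usual stub for
!CONFIG_UCLAMP_TASK):

	/* sketch -- not part of the diff above */
	#ifndef CONFIG_UCLAMP_TASK
	static inline void uclamp_post_fork(struct task_struct *p) { }
	#endif

	/* Runs once the new task is linked on the tasklist,
	 * cf. the ordering comment above. */
	void sched_post_fork(struct task_struct *p)
	{
		uclamp_post_fork(p);
	}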
