On Thu, 14 Jul 2022 14:47:37 GMT Mark Johnston <[email protected]> wrote:
> The branch main has been updated by markj:
> 
> URL: 
> https://cgit.FreeBSD.org/src/commit/?id=03f868b163ad46d6f7cb03dc46fb83ca01fb8f69
> 
> commit 03f868b163ad46d6f7cb03dc46fb83ca01fb8f69
> Author:     Mark Johnston <[email protected]>
> AuthorDate: 2022-07-14 14:24:25 +0000
> Commit:     Mark Johnston <[email protected]>
> CommitDate: 2022-07-14 14:28:01 +0000
> 
>     x86: Add a required store-load barrier in cpu_idle()
>     
>     ULE's tdq_notify() tries to avoid delivering IPIs to the idle thread.
>     In particular, it tries to detect whether the idle thread is running.
>     There are two mechanisms for this:
>     - tdq_cpu_idle, an MI flag which is set prior to calling cpu_idle().  If
>       tdq_cpu_idle == 0, then no IPI is needed;
>     - idle_state, an x86-specific state flag which is updated after
>       cpu_idleclock() is called.
>     
>     The implementation of the second mechanism is racy; the race can cause a
>     CPU to go to sleep with pending work.  Specifically, cpu_idle_*() set
>     idle_state = STATE_SLEEPING, then check for pending work by loading the
>     tdq_load field of the CPU's runqueue.  These operations can be reordered
>     so that the idle thread observes tdq_load == 0, and tdq_notify()
>     observes idle_state == STATE_RUNNING.
>     
>     Some counters indicate that the idle_state check in tdq_notify()
>     frequently elides an IPI.  So, fix the problem by inserting a fence
>     after the store to idle_state, immediately before idling the CPU.
>     
>     PR:             264867
>     Reviewed by:    mav, kib, jhb
>     MFC after:      1 month
>     Sponsored by:   The FreeBSD Foundation
>     Differential Revision:  https://reviews.freebsd.org/D35777
> ---
>  sys/x86/x86/cpu_machdep.c | 103 
> ++++++++++++++++++++++++++++------------------
>  1 file changed, 62 insertions(+), 41 deletions(-)
> 
> diff --git a/sys/x86/x86/cpu_machdep.c b/sys/x86/x86/cpu_machdep.c
> index fa11f64e2779..040438043c73 100644
> --- a/sys/x86/x86/cpu_machdep.c
> +++ b/sys/x86/x86/cpu_machdep.c
> @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
>  #include "opt_maxmem.h"
>  #include "opt_mp_watchdog.h"
>  #include "opt_platform.h"
> +#include "opt_sched.h"
>  #ifdef __i386__
>  #include "opt_apic.h"
>  #endif
> @@ -532,32 +533,25 @@ static int      idle_mwait = 1;         /* Use 
> MONITOR/MWAIT for short idle. */
>  SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
>      0, "Use MONITOR/MWAIT for short idle");
>  
> -static void
> -cpu_idle_acpi(sbintime_t sbt)
> +static bool
> +cpu_idle_enter(int *statep, int newstate)
>  {
> -     int *state;
> +     KASSERT(atomic_load_int(statep) == STATE_RUNNING,
> +         ("%s: state %d", __func__, atomic_load_int(statep)));
>  
> -     state = &PCPU_PTR(monitorbuf)->idle_state;
> -     atomic_store_int(state, STATE_SLEEPING);
> -
> -     /* See comments in cpu_idle_hlt(). */
> -     disable_intr();
> -     if (sched_runnable())
> -             enable_intr();
> -     else if (cpu_idle_hook)
> -             cpu_idle_hook(sbt);
> -     else
> -             acpi_cpu_c1();
> -     atomic_store_int(state, STATE_RUNNING);
> -}
> -
> -static void
> -cpu_idle_hlt(sbintime_t sbt)
> -{
> -     int *state;
> -
> -     state = &PCPU_PTR(monitorbuf)->idle_state;
> -     atomic_store_int(state, STATE_SLEEPING);
> +     /*
> +      * A fence is needed to prevent reordering of the load in
> +      * sched_runnable() with this store to the idle state word.  Without it,
> +      * cpu_idle_wakeup() can observe the state as STATE_RUNNING after having
> +      * added load to the queue, and elide an IPI.  Then, sched_runnable()
> +      * can observe tdq_load == 0, so the CPU ends up idling with pending
> +      * work.  tdq_notify() similarly ensures that a prior update to tdq_load
> +      * is visible before calling cpu_idle_wakeup().
> +      */
> +     atomic_store_int(statep, newstate);
> +#if defined(SCHED_ULE) && defined(SMP)
> +     atomic_thread_fence_seq_cst();
> +#endif

Can't you combine the store and the fence with
atomic_store_explicit(memory_order_seq_cst) or, since this is x86
specific code, atomic_swap_int?

#if defined(SCHED_ULE) && defined(SMP)
        atomic_swap_int(statep, newstate);
#else
        atomic_store_int(statep, newstate);
#endif

>       /*
>        * Since we may be in a critical section from cpu_idle(), if
> @@ -576,35 +570,62 @@ cpu_idle_hlt(sbintime_t sbt)
>        * interrupt.
>        */
>       disable_intr();
> -     if (sched_runnable())
> +     if (sched_runnable()) {
>               enable_intr();
> -     else
> -             acpi_cpu_c1();
> -     atomic_store_int(state, STATE_RUNNING);
> +             atomic_store_int(statep, STATE_RUNNING);
> +             return (false);
> +     } else {
> +             return (true);
> +     }
>  }
>  
>  static void
> -cpu_idle_mwait(sbintime_t sbt)
> +cpu_idle_exit(int *statep)
> +{
> +     atomic_store_int(statep, STATE_RUNNING);
> +}

> +static void
> +cpu_idle_hlt(sbintime_t sbt)
> +{
> +     int *state;
> +
> +     state = &PCPU_PTR(monitorbuf)->idle_state;
> +     if (cpu_idle_enter(state, STATE_SLEEPING)) {
> +             acpi_cpu_c1();
>               atomic_store_int(state, STATE_RUNNING);

Should this call be replaced with cpu_idle_exit(state), just for
consistency?

> -             enable_intr();
> -             return;
>       }
> +}

Reply via email to