On Thu, Jul 27, 2017 at 02:59:43PM -0400, Mathieu Desnoyers wrote:
> Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs using cpumask built
> from all runqueues for which current thread's mm is the same as our own.
> 
> Scheduler-wise, it requires that we add a memory barrier after context
> switching between processes (which have different mm).
> 
> It would be interesting to benchmark the overhead of this added barrier
> on the performance of context switching between processes. If the
> preexisting overhead of switching between mm is high enough, the
> overhead of adding this extra barrier may be insignificant.

I sent this along to the people asking for faster sys_membarrier(),
CCing you, thank you!

Please see other feedback inline below.

> [ Compile-tested only! ]
> 
> CC: Peter Zijlstra <[email protected]>
> CC: Paul E. McKenney <[email protected]>
> CC: Boqun Feng <[email protected]>
> Signed-off-by: Mathieu Desnoyers <[email protected]>
> ---
>  include/uapi/linux/membarrier.h |  8 +++--
>  kernel/membarrier.c             | 76 
> ++++++++++++++++++++++++++++++++++++++++-
>  kernel/sched/core.c             | 21 ++++++++++++
>  3 files changed, 102 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
> index e0b108bd2624..6a33c5852f6b 100644
> --- a/include/uapi/linux/membarrier.h
> +++ b/include/uapi/linux/membarrier.h
> @@ -40,14 +40,18 @@
>   *                          (non-running threads are de facto in such a
>   *                          state). This covers threads from all processes
>   *                          running on the system. This command returns 0.
> + * TODO: documentation.

How about something like this?

 * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:  IPI each CPU running a thread belonging
 *                          to the same process as the current thread, causing
 *                          the recipient CPU to execute a full memory-barrier
 *                          instruction.  The same-process determination is
 *                          made using the task_struct ->mm field:  If some
 *                          other CPU's currently running task has the same
 *                          value in its ->mm field as the requesting thread,
 *                          that CPU is IPIed.

>   *
>   * Command to be passed to the membarrier system call. The commands need to
>   * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
>   * the value 0.
>   */
>  enum membarrier_cmd {
> -     MEMBARRIER_CMD_QUERY = 0,
> -     MEMBARRIER_CMD_SHARED = (1 << 0),
> +     MEMBARRIER_CMD_QUERY                    = 0,
> +     MEMBARRIER_CMD_SHARED                   = (1 << 0),
> +     /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
> +     /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
> +     MEMBARRIER_CMD_PRIVATE_EXPEDITED        = (1 << 3),
>  };
> 
>  #endif /* _UAPI_LINUX_MEMBARRIER_H */
> diff --git a/kernel/membarrier.c b/kernel/membarrier.c
> index 9f9284f37f8d..8c6c0f96f617 100644
> --- a/kernel/membarrier.c
> +++ b/kernel/membarrier.c
> @@ -19,10 +19,81 @@
>  #include <linux/tick.h>
> 
>  /*
> + * XXX For cpu_rq(). Should we rather move
> + * membarrier_private_expedited() to sched/core.c or create
> + * sched/membarrier.c ?

Any reason not to move the entirety of membarrier.c there?  Assuming that
Peter is OK with this, of course.

> + */
> +#include "sched/sched.h"
> +
> +/*
>   * Bitmask made from a "or" of all commands within enum membarrier_cmd,
>   * except MEMBARRIER_CMD_QUERY.
>   */
> -#define MEMBARRIER_CMD_BITMASK       (MEMBARRIER_CMD_SHARED)
> +#define MEMBARRIER_CMD_BITMASK       \
> +     (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
> +
> +static void ipi_mb(void *info)
> +{
> +     smp_mb();       /* IPIs should be serializing but paranoid. */

I am good with paranoia!  ;-)

> +}
> +
> +static void membarrier_private_expedited_ipi_each(void)
> +{
> +     int cpu;
> +
> +     for_each_online_cpu(cpu) {
> +             struct task_struct *p;
> +
> +             rcu_read_lock();
> +             p = task_rcu_dereference(&cpu_rq(cpu)->curr);
> +             if (p && p->mm == current->mm)
> +                     smp_call_function_single(cpu, ipi_mb, NULL, 1);
> +             rcu_read_unlock();
> +     }
> +}
> +
> +static void membarrier_private_expedited(void)
> +{
> +     int cpu, this_cpu;
> +     cpumask_var_t tmpmask;
> +
> +     if (num_online_cpus() == 1)
> +             return;
> +
> +     /*
> +      * Matches memory barriers around rq->curr modification in
> +      * scheduler.
> +      */
> +     smp_mb();       /* system call entry is not a mb. */
> +
> +     if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
> +             /* Fallback for OOM. */
> +             membarrier_private_expedited_ipi_each();
> +             goto end;
> +     }
> +
> +     this_cpu = raw_smp_processor_id();
> +     for_each_online_cpu(cpu) {
> +             struct task_struct *p;
> +
> +             if (cpu == this_cpu)
> +                     continue;
> +             rcu_read_lock();
> +             p = task_rcu_dereference(&cpu_rq(cpu)->curr);
> +             if (p && p->mm == current->mm)
> +                     __cpumask_set_cpu(cpu, tmpmask);
> +             rcu_read_unlock();
> +     }
> +     smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
> +     free_cpumask_var(tmpmask);
> +end:
> +     /*
> +     * Memory barrier on the caller thread _after_ we finished
> +     * waiting for the last IPI. Matches memory barriers around
> +     * rq->curr modification in scheduler.
> +     */
> +     smp_mb();       /* exit from system call is not a mb */
> +}
> 
>  /**
>   * sys_membarrier - issue memory barriers on a set of threads
> @@ -64,6 +135,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
>               if (num_online_cpus() > 1)
>                       synchronize_sched();
>               return 0;
> +     case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
> +             membarrier_private_expedited();
> +             return 0;
>       default:
>               return -EINVAL;
>       }
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 17c667b427b4..f171d2aaaf82 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2724,6 +2724,26 @@ asmlinkage __visible void schedule_tail(struct 
> task_struct *prev)
>               put_user(task_pid_vnr(current), current->set_child_tid);
>  }
> 
> +#ifdef CONFIG_MEMBARRIER
> +static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
> +             struct mm_struct *oldmm)
> +{
> +     if (likely(mm == oldmm))
> +             return;         /* Thread context switch, same mm. */
> +     /*
> +      * When switching between processes, membarrier expedited
> +      * private requires a memory barrier after we set the current
> +      * task.
> +      */
> +     smp_mb();
> +}
> +#else /* #ifdef CONFIG_MEMBARRIER */
> +static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
> +             struct mm_struct *oldmm)
> +{
> +}
> +#endif /* #else #ifdef CONFIG_MEMBARRIER */

Why not something like this?  Shorter, easier to read, and should generate
the same code.

static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
                struct mm_struct *oldmm)
{
        if (!IS_ENABLED(CONFIG_MEMBARRIER))
                return;
        if (likely(mm == oldmm))
                return;         /* Thread context switch, same mm. */
        /*
         * When switching between processes, membarrier expedited
         * private requires a memory barrier after we set the current
         * task.
         */
        smp_mb();
}

> +
>  /*
>   * context_switch - switch to the new MM and the new thread's register state.
>   */
> @@ -2737,6 +2757,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
> 
>       mm = next->mm;
>       oldmm = prev->active_mm;
> +     membarrier_expedited_mb_after_set_current(mm, oldmm);
>       /*
>        * For paravirt, this is coupled with an exit in switch_to to
>        * combine the page table reload and the switch backend into
> -- 
> 2.11.0
> 

Reply via email to