On Thu, Jul 27, 2017 at 08:18:47PM +0000, Mathieu Desnoyers wrote: > > ----- On Jul 27, 2017, at 4:06 PM, Paul E. McKenney > [email protected] wrote: > > > On Thu, Jul 27, 2017 at 02:59:43PM -0400, Mathieu Desnoyers wrote: > >> Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs using cpumask built > >> from all runqueues for which current thread's mm is the same as our own. > >> > >> Scheduler-wise, it requires that we add a memory barrier after context > >> switching between processes (which have different mm). > >> > >> It would be interesting to benchmark the overhead of this added barrier > >> on the performance of context switching between processes. If the > >> preexisting overhead of switching between mm is high enough, the > >> overhead of adding this extra barrier may be insignificant. > > > > I sent this along to the people asking for faster sys_membarrier(), > > CCing you, thank you! > > Thanks! More below,
At at least one had it pass their initial testing. > > Please see other feedback inline below. > > > >> [ Compile-tested only! ] > >> > >> CC: Peter Zijlstra <[email protected]> > >> CC: Paul E. McKenney <[email protected]> > >> CC: Boqun Feng <[email protected]> > >> Signed-off-by: Mathieu Desnoyers <[email protected]> > >> --- > >> include/uapi/linux/membarrier.h | 8 +++-- > >> kernel/membarrier.c | 76 > >> ++++++++++++++++++++++++++++++++++++++++- > >> kernel/sched/core.c | 21 ++++++++++++ > >> 3 files changed, 102 insertions(+), 3 deletions(-) > >> > >> diff --git a/include/uapi/linux/membarrier.h > >> b/include/uapi/linux/membarrier.h > >> index e0b108bd2624..6a33c5852f6b 100644 > >> --- a/include/uapi/linux/membarrier.h > >> +++ b/include/uapi/linux/membarrier.h > >> @@ -40,14 +40,18 @@ > >> * (non-running threads are de facto in such a > >> * state). This covers threads from all processes > >> * running on the system. This command returns 0. > >> + * TODO: documentation. > > > > How about something like this? > > > > * @MEMBARRIER_CMD_PRIVATE_EXPEDITED: IPI each CPU running a thread > > belonging > > * to the same process as the current thread, > > causing > > * the recipient CPU to execute a full > > memory-barrier > > * instruction. The same-process determination is > > * made using the task_struct ->mm field: If some > > * other CPU's currently running task has the same > > * value in its ->mm field as the requesting thread, > > * that CPU is IPIed. > > > > I would prefer: > > * @MEMBARRIER_CMD_PRIVATE_EXPEDITED: > * Execute a memory barrier on each running > * thread belonging to the same process as the > current > * thread. Upon return from system call, the > * caller thread is ensured that all its running > * threads siblings have passed through a state > * where all memory accesses to user-space > * addresses match program order between entry > * to and return from the system call > * (non-running threads are de facto in such a > * state). This only covers threads from the > * same processes as the caller thread. This > * command returns 0. > > mainly because this is a uapi/ header, installed into distribution headers. So > I want to remove any implementation reference from that text. Fine by me, as long as there is documentation. ;-) > >> * > >> * Command to be passed to the membarrier system call. The commands need > >> to > >> * be a single bit each, except for MEMBARRIER_CMD_QUERY which is > >> assigned to > >> * the value 0. > >> */ > >> enum membarrier_cmd { > >> - MEMBARRIER_CMD_QUERY = 0, > >> - MEMBARRIER_CMD_SHARED = (1 << 0), > >> + MEMBARRIER_CMD_QUERY = 0, > >> + MEMBARRIER_CMD_SHARED = (1 << 0), > >> + /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */ > >> + /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */ > >> + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), > >> }; > >> > >> #endif /* _UAPI_LINUX_MEMBARRIER_H */ > >> diff --git a/kernel/membarrier.c b/kernel/membarrier.c > >> index 9f9284f37f8d..8c6c0f96f617 100644 > >> --- a/kernel/membarrier.c > >> +++ b/kernel/membarrier.c > >> @@ -19,10 +19,81 @@ > >> #include <linux/tick.h> > >> > >> /* > >> + * XXX For cpu_rq(). Should we rather move > >> + * membarrier_private_expedited() to sched/core.c or create > >> + * sched/membarrier.c ? > > > > Any reason not to move the entirety of membarrier.c there? Assuming that > > Peter is OK with this, of course. > > I'm planning to do exactly this for v2. Fair enough. > >> + */ > >> +#include "sched/sched.h" > >> + > >> +/* > >> * Bitmask made from a "or" of all commands within enum membarrier_cmd, > >> * except MEMBARRIER_CMD_QUERY. > >> */ > >> -#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) > >> +#define MEMBARRIER_CMD_BITMASK \ > >> + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) > >> + > >> +static void ipi_mb(void *info) > >> +{ > >> + smp_mb(); /* IPIs should be serializing but paranoid. */ > > > > I am good with paranoia! ;-) > > > >> +} > >> + > >> +static void membarrier_private_expedited_ipi_each(void) > >> +{ > >> + int cpu; > >> + > >> + for_each_online_cpu(cpu) { > >> + struct task_struct *p; > >> + > >> + rcu_read_lock(); > >> + p = task_rcu_dereference(&cpu_rq(cpu)->curr); > >> + if (p && p->mm == current->mm) > >> + smp_call_function_single(cpu, ipi_mb, NULL, 1); > >> + rcu_read_unlock(); > >> + } > >> +} > >> + > >> +static void membarrier_private_expedited(void) > >> +{ > >> + int cpu, this_cpu; > >> + cpumask_var_t tmpmask; > >> + > >> + if (num_online_cpus() == 1) > >> + return; > >> + > >> + /* > >> + * Matches memory barriers around rq->curr modification in > >> + * scheduler. > >> + */ > >> + smp_mb(); /* system call entry is not a mb. */ > >> + > >> + if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { > >> + /* Fallback for OOM. */ > >> + membarrier_private_expedited_ipi_each(); > >> + goto end; > >> + } > >> + > >> + this_cpu = raw_smp_processor_id(); > >> + for_each_online_cpu(cpu) { > >> + struct task_struct *p; > >> + > >> + if (cpu == this_cpu) > >> + continue; > >> + rcu_read_lock(); > >> + p = task_rcu_dereference(&cpu_rq(cpu)->curr); > >> + if (p && p->mm == current->mm) > >> + __cpumask_set_cpu(cpu, tmpmask); > >> + rcu_read_unlock(); > >> + } > >> + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); > >> + free_cpumask_var(tmpmask); > >> +end: > >> + /* > >> + * Memory barrier on the caller thread _after_ we finished > >> + * waiting for the last IPI. Matches memory barriers around > >> + * rq->curr modification in scheduler. > >> + */ > >> + smp_mb(); /* exit from system call is not a mb */ > >> +} > >> > >> /** > >> * sys_membarrier - issue memory barriers on a set of threads > >> @@ -64,6 +135,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) > >> if (num_online_cpus() > 1) > >> synchronize_sched(); > >> return 0; > >> + case MEMBARRIER_CMD_PRIVATE_EXPEDITED: > >> + membarrier_private_expedited(); > >> + return 0; > >> default: > >> return -EINVAL; > >> } > >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c > >> index 17c667b427b4..f171d2aaaf82 100644 > >> --- a/kernel/sched/core.c > >> +++ b/kernel/sched/core.c > >> @@ -2724,6 +2724,26 @@ asmlinkage __visible void schedule_tail(struct > >> task_struct *prev) > >> put_user(task_pid_vnr(current), current->set_child_tid); > >> } > >> > >> +#ifdef CONFIG_MEMBARRIER > >> +static void membarrier_expedited_mb_after_set_current(struct mm_struct > >> *mm, > >> + struct mm_struct *oldmm) > >> +{ > >> + if (likely(mm == oldmm)) > >> + return; /* Thread context switch, same mm. */ > >> + /* > >> + * When switching between processes, membarrier expedited > >> + * private requires a memory barrier after we set the current > >> + * task. > >> + */ > >> + smp_mb(); > >> +} > >> +#else /* #ifdef CONFIG_MEMBARRIER */ > >> +static void membarrier_expedited_mb_after_set_current(struct mm_struct > >> *mm, > >> + struct mm_struct *oldmm) > >> +{ > >> +} > >> +#endif /* #else #ifdef CONFIG_MEMBARRIER */ > > > > Why not something like this? Shorter, easier to read, and should generate > > the same code. > > Sure, I'll use this in my v2. Sounds good! Thanx, Paul > Thanks! > > Mathieu > > > > > static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm, > > struct mm_struct *oldmm) > > { > > if (!IS_ENABLED(CONFIG_MEMBARRIER)) > > return; > > if (likely(mm == oldmm)) > > return; /* Thread context switch, same mm. */ > > /* > > * When switching between processes, membarrier expedited > > * private requires a memory barrier after we set the current > > * task. > > */ > > smp_mb(); > > } > > > >> + > >> /* > >> * context_switch - switch to the new MM and the new thread's register > >> state. > >> */ > >> @@ -2737,6 +2757,7 @@ context_switch(struct rq *rq, struct task_struct > >> *prev, > >> > >> mm = next->mm; > >> oldmm = prev->active_mm; > >> + membarrier_expedited_mb_after_set_current(mm, oldmm); > >> /* > >> * For paravirt, this is coupled with an exit in switch_to to > >> * combine the page table reload and the switch backend into > >> -- > >> 2.11.0 > > -- > Mathieu Desnoyers > EfficiOS Inc. > http://www.efficios.com >

