Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs using cpumask built
from all runqueues for which current thread's mm is the same as the
thread calling sys_membarrier.

Scheduler-wise, it requires that we add a memory barrier after context
switching between processes (which have different mm). Interestingly,
there is already a memory barrier in mmdrop(), so we only need to add
a barrier when switching from a kernel thread to a userspace thread.
We also don't need to add the barrier when switching to a kernel thread,
because it has no userspace memory mapping, which makes ordering of
user-space memory accesses pretty much useless.

* Benchmark

A stress-test benchmark of sched pipe shows that it does not add
significant overhead to the scheduler switching between processes:

100 runs of:

taskset 01 ./perf bench sched pipe

Running 'sched/pipe' benchmark:
Executed 1000000 pipe operations between two processes

Hardware: CPU: Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz

A) With 4.13.0-rc2+
   at commit a97fb594bc7d ("virtio-net: fix module unloading")

avg.:     2.923 usecs/op
std.dev:  0.057 usecs/op

B) With this commit:

avg.:     2.916 usecs/op
std.dev:  0.043 usecs/op

Changes since v1:
- move membarrier code under kernel/sched/ because it uses the
  scheduler runqueue,
- only add the barrier when we switch from a kernel thread. The case
  where we switch from a user-space thread is already handled by
  the atomic_dec_and_test() in mmdrop().
- add a comment to mmdrop() documenting the requirement on the implicit
  memory barrier.

CC: Peter Zijlstra <[email protected]>
CC: Paul E. McKenney <[email protected]>
CC: Boqun Feng <[email protected]>
CC: Andrew Hunter <[email protected]>
CC: Maged Michael <[email protected]>
CC: [email protected]
CC: Avi Kivity <[email protected]>
Signed-off-by: Mathieu Desnoyers <[email protected]>
---
 MAINTAINERS                     |  2 +-
 include/linux/sched/mm.h        |  5 +++
 include/uapi/linux/membarrier.h | 23 +++++++++++--
 kernel/Makefile                 |  1 -
 kernel/sched/Makefile           |  1 +
 kernel/sched/core.c             | 27 ++++++++++++++++
 kernel/{ => sched}/membarrier.c | 72 ++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 126 insertions(+), 5 deletions(-)
 rename kernel/{ => sched}/membarrier.c (59%)

diff --git a/MAINTAINERS b/MAINTAINERS
index f66488dfdbc9..3b035584272f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8621,7 +8621,7 @@ M:        Mathieu Desnoyers 
<[email protected]>
 M:     "Paul E. McKenney" <[email protected]>
 L:     [email protected]
 S:     Supported
-F:     kernel/membarrier.c
+F:     kernel/sched/membarrier.c
 F:     include/uapi/linux/membarrier.h
 
 MEMORY MANAGEMENT
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 2b24a6974847..5c5384d9ae0f 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -38,6 +38,11 @@ static inline void mmgrab(struct mm_struct *mm)
 extern void __mmdrop(struct mm_struct *);
 static inline void mmdrop(struct mm_struct *mm)
 {
+       /*
+        * Implicit full memory barrier provided by
+        * atomic_dec_and_test() is required by membarrier. See comments
+        * around membarrier_expedited_mb_after_set_current().
+        */
        if (unlikely(atomic_dec_and_test(&mm->mm_count)))
                __mmdrop(mm);
 }
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108bd2624..6d47b3249d8a 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,14 +40,33 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ *                          Execute a memory barrier on each running
+ *                          thread belonging to the same process as the current
+ *                          thread. Upon return from system call, the
+ *                          caller thread is ensured that all its running
+ *                          threads siblings have passed through a state
+ *                          where all memory accesses to user-space
+ *                          addresses match program order between entry
+ *                          to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This only covers threads from the
+ *                          same processes as the caller thread. This
+ *                          command returns 0. The "expedited" commands
+ *                          complete faster than the non-expedited ones,
+ *                          they never block, but have the downside of
+ *                          causing extra overhead.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
  * the value 0.
  */
 enum membarrier_cmd {
-       MEMBARRIER_CMD_QUERY = 0,
-       MEMBARRIER_CMD_SHARED = (1 << 0),
+       MEMBARRIER_CMD_QUERY                    = 0,
+       MEMBARRIER_CMD_SHARED                   = (1 << 0),
+       /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
+       /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
+       MEMBARRIER_CMD_PRIVATE_EXPEDITED        = (1 << 3),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb8e8b23c6e..9c323a6daa46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
 
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164ed362..78f54932ea1d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17c667b427b4..01e3b881ab3a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2724,6 +2724,32 @@ asmlinkage __visible void schedule_tail(struct 
task_struct *prev)
                put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
+static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
+               struct mm_struct *oldmm)
+{
+       if (!IS_ENABLED(CONFIG_MEMBARRIER))
+               return;
+       /*
+        * __schedule()->
+        *   finish_task_switch()->
+        *    if (mm)
+        *      mmdrop(mm) ->
+        *        atomic_dec_and_test()
+        * takes care of issuing a memory barrier when oldmm is
+        * non-NULL. We also don't need the barrier when switching to a
+        * kernel thread, nor when we switch between threads belonging
+        * to the same process.
+        */
+       if (likely(oldmm || !mm || mm == oldmm))
+               return;
+       /*
+        * When switching between processes, membarrier expedited
+        * private requires a memory barrier after we set the current
+        * task.
+        */
+       smp_mb();
+}
+
 /*
  * context_switch - switch to the new MM and the new thread's register state.
  */
@@ -2737,6 +2763,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
        mm = next->mm;
        oldmm = prev->active_mm;
+       membarrier_expedited_mb_after_set_current(mm, oldmm);
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
diff --git a/kernel/membarrier.c b/kernel/sched/membarrier.c
similarity index 59%
rename from kernel/membarrier.c
rename to kernel/sched/membarrier.c
index 9f9284f37f8d..f80828b0b607 100644
--- a/kernel/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -17,12 +17,79 @@
 #include <linux/syscalls.h>
 #include <linux/membarrier.h>
 #include <linux/tick.h>
+#include <linux/cpumask.h>
+
+#include "sched.h"     /* for cpu_rq(). */
 
 /*
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
  */
-#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
+#define MEMBARRIER_CMD_BITMASK \
+       (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+
+static void ipi_mb(void *info)
+{
+       smp_mb();       /* IPIs should be serializing but paranoid. */
+}
+
+static void membarrier_private_expedited(void)
+{
+       int cpu, this_cpu;
+       bool fallback = false;
+       cpumask_var_t tmpmask;
+
+       if (num_online_cpus() == 1)
+               return;
+
+       /*
+        * Matches memory barriers around rq->curr modification in
+        * scheduler.
+        */
+       smp_mb();       /* system call entry is not a mb. */
+
+       if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+               /* Fallback for OOM. */
+               fallback = true;
+       }
+
+       /*
+        * Skipping the current CPU is OK even through we can be
+        * migrated at any point. The current CPU, at the point where we
+        * read raw_smp_processor_id(), is ensured to be in program
+        * order with respect to the caller thread. Therefore, we can
+        * skip this CPU from the iteration.
+        */
+       this_cpu = raw_smp_processor_id();
+       cpus_read_lock();
+       for_each_online_cpu(cpu) {
+               struct task_struct *p;
+
+               if (cpu == this_cpu)
+                       continue;
+               rcu_read_lock();
+               p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+               if (p && p->mm == current->mm) {
+                       if (!fallback)
+                               __cpumask_set_cpu(cpu, tmpmask);
+                       else
+                               smp_call_function_single(cpu, ipi_mb, NULL, 1);
+               }
+               rcu_read_unlock();
+       }
+       cpus_read_unlock();
+       if (!fallback) {
+               smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+               free_cpumask_var(tmpmask);
+       }
+
+       /*
+        * Memory barrier on the caller thread _after_ we finished
+        * waiting for the last IPI. Matches memory barriers around
+        * rq->curr modification in scheduler.
+        */
+       smp_mb();       /* exit from system call is not a mb */
+}
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
@@ -64,6 +131,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
                if (num_online_cpus() > 1)
                        synchronize_sched();
                return 0;
+       case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+               membarrier_private_expedited();
+               return 0;
        default:
                return -EINVAL;
        }
-- 
2.11.0

Reply via email to