[PATCH v2 04/10] x86/mm: Track the TLB's tlb_gen and update the flushing algorithm

2017-06-13 Thread Andy Lutomirski
There are two kernel features that would benefit from tracking
how up-to-date each CPU's TLB is in the case where IPIs aren't keeping
it up to date in real time:

 - Lazy mm switching currently works by switching to init_mm when
   it would otherwise flush.  This is wasteful: there isn't fundamentally
   any need to update CR3 at all when going lazy or when returning from
   lazy mode, nor is there any need to receive flush IPIs at all.  Instead,
   we should just stop trying to keep the TLB coherent when we go lazy and,
   when unlazying, check whether we missed any flushes.

 - PCID will let us keep recent user contexts alive in the TLB.  If we
   start doing this, we need a way to decide whether those contexts are
   up to date.

On some paravirt systems, remote TLBs can be flushed without IPIs.
This won't update the target CPUs' tlb_gens, which may cause
unnecessary local flushes later on.  We can address this if it becomes
a problem by carefully updating the target CPU's tlb_gen directly.

By itself, this patch is a very minor optimization that avoids
unnecessary flushes when multiple TLB flushes targeting the same CPU
race.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/tlbflush.h | 37 +++
 arch/x86/mm/tlb.c   | 79 +
 2 files changed, 109 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 1eb946c0507e..4f6c30d6ec39 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,11 @@ static inline u64 bump_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+struct tlb_context {
+   u64 ctx_id;
+   u64 tlb_gen;
+};
+
 struct tlb_state {
/*
 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -97,6 +102,21 @@ struct tlb_state {
 * disabling interrupts when modifying either one.
 */
unsigned long cr4;
+
+   /*
+* This is a list of all contexts that might exist in the TLB.
+* Since we don't yet use PCID, there is only one context.
+*
+* For each context, ctx_id indicates which mm the TLB's user
+* entries came from.  As an invariant, the TLB will never
+* contain entries that are out-of-date as when that mm reached
+* the tlb_gen in the list.
+*
+* To be clear, this means that it's legal for the TLB code to
+* flush the TLB without updating tlb_gen.  This can happen
+* (for now, at least) due to paravirt remote flushes.
+*/
+   struct tlb_context ctxs[1];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
@@ -248,9 +268,26 @@ static inline void __flush_tlb_one(unsigned long addr)
  * and page-granular flushes are available only on i486 and up.
  */
 struct flush_tlb_info {
+   /*
+* We support several kinds of flushes.
+*
+* - Fully flush a single mm.  flush_mm will be set, flush_end will be
+*   TLB_FLUSH_ALL, and new_tlb_gen will be the tlb_gen to which the
+*   IPI sender is trying to catch us up.
+*
+* - Partially flush a single mm.  flush_mm will be set, flush_start
+*   and flush_end will indicate the range, and new_tlb_gen will be
+*   set such that the changes between generation new_tlb_gen-1 and
+*   new_tlb_gen are entirely contained in the indicated range.
+*
+* - Fully flush all mms whose tlb_gens have been updated.  flush_mm
+*   will be NULL, flush_end will be TLB_FLUSH_ALL, and new_tlb_gen
+*   will be zero.
+*/
struct mm_struct *mm;
unsigned long start;
unsigned long end;
+   u64 new_tlb_gen;
 };
 
 #define local_flush_tlb() __flush_tlb()
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 7c99c50e8bc9..3b19ba748e92 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -105,6 +105,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
}
 
this_cpu_write(cpu_tlbstate.loaded_mm, next);
+   this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
+   this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
+  atomic64_read(&next->context.tlb_gen));
 
WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -194,17 +197,70 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
 static void flush_tlb_func_common(const struct flush_tlb_info *f,
  bool local, enum tlb_flush_reason reason)
 {
+   struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+
+   /*
+* Our memory ordering requirement is that any TLB fills that
+* happen after we flush the TLB are ordered after we read
+* active_mm's 

[PATCH v2 04/10] x86/mm: Track the TLB's tlb_gen and update the flushing algorithm

2017-06-13 Thread Andy Lutomirski
There are two kernel features that would benefit from tracking
how up-to-date each CPU's TLB is in the case where IPIs aren't keeping
it up to date in real time:

 - Lazy mm switching currently works by switching to init_mm when
   it would otherwise flush.  This is wasteful: there isn't fundamentally
   any need to update CR3 at all when going lazy or when returning from
   lazy mode, nor is there any need to receive flush IPIs at all.  Instead,
   we should just stop trying to keep the TLB coherent when we go lazy and,
   when unlazying, check whether we missed any flushes.

 - PCID will let us keep recent user contexts alive in the TLB.  If we
   start doing this, we need a way to decide whether those contexts are
   up to date.

On some paravirt systems, remote TLBs can be flushed without IPIs.
This won't update the target CPUs' tlb_gens, which may cause
unnecessary local flushes later on.  We can address this if it becomes
a problem by carefully updating the target CPU's tlb_gen directly.

By itself, this patch is a very minor optimization that avoids
unnecessary flushes when multiple TLB flushes targeting the same CPU
race.

Signed-off-by: Andy Lutomirski 
---
 arch/x86/include/asm/tlbflush.h | 37 +++
 arch/x86/mm/tlb.c   | 79 +
 2 files changed, 109 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 1eb946c0507e..4f6c30d6ec39 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,11 @@ static inline u64 bump_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+struct tlb_context {
+   u64 ctx_id;
+   u64 tlb_gen;
+};
+
 struct tlb_state {
/*
 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
@@ -97,6 +102,21 @@ struct tlb_state {
 * disabling interrupts when modifying either one.
 */
unsigned long cr4;
+
+   /*
+* This is a list of all contexts that might exist in the TLB.
+* Since we don't yet use PCID, there is only one context.
+*
+* For each context, ctx_id indicates which mm the TLB's user
+* entries came from.  As an invariant, the TLB will never
+* contain entries that are out-of-date as when that mm reached
+* the tlb_gen in the list.
+*
+* To be clear, this means that it's legal for the TLB code to
+* flush the TLB without updating tlb_gen.  This can happen
+* (for now, at least) due to paravirt remote flushes.
+*/
+   struct tlb_context ctxs[1];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
@@ -248,9 +268,26 @@ static inline void __flush_tlb_one(unsigned long addr)
  * and page-granular flushes are available only on i486 and up.
  */
 struct flush_tlb_info {
+   /*
+* We support several kinds of flushes.
+*
+* - Fully flush a single mm.  flush_mm will be set, flush_end will be
+*   TLB_FLUSH_ALL, and new_tlb_gen will be the tlb_gen to which the
+*   IPI sender is trying to catch us up.
+*
+* - Partially flush a single mm.  flush_mm will be set, flush_start
+*   and flush_end will indicate the range, and new_tlb_gen will be
+*   set such that the changes between generation new_tlb_gen-1 and
+*   new_tlb_gen are entirely contained in the indicated range.
+*
+* - Fully flush all mms whose tlb_gens have been updated.  flush_mm
+*   will be NULL, flush_end will be TLB_FLUSH_ALL, and new_tlb_gen
+*   will be zero.
+*/
struct mm_struct *mm;
unsigned long start;
unsigned long end;
+   u64 new_tlb_gen;
 };
 
 #define local_flush_tlb() __flush_tlb()
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 7c99c50e8bc9..3b19ba748e92 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -105,6 +105,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
}
 
this_cpu_write(cpu_tlbstate.loaded_mm, next);
+   this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
+   this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
+  atomic64_read(&next->context.tlb_gen));
 
WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -194,17 +197,70 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
 static void flush_tlb_func_common(const struct flush_tlb_info *f,
  bool local, enum tlb_flush_reason reason)
 {
+   struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+
+   /*
+* Our memory ordering requirement is that any TLB fills that
+* happen after we flush the TLB are ordered after we read
+* active_mm's tlb_gen.  We don't