flush_tlb_others_ipi depends on a lot of statics in tlb.c.  Replicate
flush_tlb_others_ipi as kvm_flush_tlb_others so that it can be further
adapted for paravirtualization.

Use the vcpu state information inside kvm_flush_tlb_others to avoid
sending IPIs to pre-empted vcpus:

* Do not send IPIs to offline (pre-empted) vcpus; set their flush_on_enter
  flag instead
* For online vcpus: send the flush IPI and wait for them to clear their
  bit in the flush cpumask

The approach was discussed here: https://lkml.org/lkml/2012/2/20/157

Suggested-by: Peter Zijlstra <a.p.zijls...@chello.nl>
Signed-off-by: Nikunj A. Dadhania <nik...@linux.vnet.ibm.com>

--
Pseudo Algo:

   Write()
   ======

           guest_exit()
                   flush_on_enter[i]=0;
                   running[i] = 0;

           guest_enter()
                   running[i] = 1;
                   if(flush_on_enter[i]) {
                           tlb_flush()
                           flush_on_enter[i]=0;
                   }
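
   A minimal C sketch of the Write() side above (it mirrors the
   kvm_set_vcpu_state()/kvm_clear_vcpu_state() changes in the patch below;
   vcpu_enter_guest_state(), vcpu_exit_guest_state() and flush_guest_tlb()
   are made-up names, the last one standing in for kvm_mmu_flush_tlb()):

        #include <linux/types.h>

        struct kvm_vcpu_state {
                __u32 state;            /* running[i] in the pseudo algo */
                __u32 flush_on_enter;   /* flush_on_enter[i]             */
                __u32 pad[14];
        };

        extern void flush_guest_tlb(void);      /* stand-in for kvm_mmu_flush_tlb() */

        static void vcpu_enter_guest_state(struct kvm_vcpu_state *vs)  /* guest_enter() */
        {
                vs->state = 1;
                if (vs->flush_on_enter) {       /* a flush was queued while pre-empted */
                        flush_guest_tlb();
                        vs->flush_on_enter = 0;
                }
        }

        static void vcpu_exit_guest_state(struct kvm_vcpu_state *vs)   /* guest_exit() */
        {
                vs->flush_on_enter = 0;
                vs->state = 0;
        }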


   Read()
   ======

           GUEST                                                KVM-HV

   f->flush_cpumask = cpumask - me;

again:
   for_each_cpu(i, f->flush_cpumask) {

           if (!running[i]) {
                                                   case 1:

                                                   running[n]=1

                                                    (cpuN does not see
                                                    flush_on_enter set,
                                                    the guest later finds
                                                    it running and sends
                                                    the IPI; we are fine
                                                    here, but the flag
                                                    must be cleared on
                                                    guest_exit)

                  flush_on_enter[i] = 1;
                                                    case 2:

                                                   running[n]=1
                                                    (cpuN will see
                                                    flush_on_enter set and
                                                    an IPI as well;
                                                    addressed in patch 4)

                   if (!running[i])
                      cpu_clear(i, f->flush_cpumask);
                                                    All is well; vm_enter
                                                    will do the fixup
           }
                                                   case 3:
                                                   running[n] = 0;

                                                    (cpuN went to sleep;
                                                    we saw it as awake and
                                                    sent the IPI, but the
                                                    wait will time out
                                                    without the mask going
                                                    to zero and the goto
                                                    again will take care
                                                    of it)

   }
   send_ipi(f->flush_cpumask)

   wait_a_while_for_zero_mask();

   if (!zero_mask)
           goto again;
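
   A minimal C sketch of the Read() side (the real implementation is
   kvm_flush_tlb_others() in arch/x86/mm/tlb.c below; vcpu_running(),
   queue_flush_on_enter(), send_flush_ipi() and wait_a_while_for_empty()
   are hypothetical helpers standing in for the per-cpu vcpu_state
   accesses, apic->send_IPI_mask() and the bounded cpu_relax() loop):

        #include <linux/cpumask.h>

        extern bool vcpu_running(int cpu);              /* reads vcpu_state[cpu].state         */
        extern void queue_flush_on_enter(int cpu);      /* sets vcpu_state[cpu].flush_on_enter */
        extern void send_flush_ipi(const struct cpumask *mask);
        extern bool wait_a_while_for_empty(const struct cpumask *mask);

        static void flush_others_sketch(struct cpumask *mask)
        {
                int cpu;

        again:
                for_each_cpu(cpu, mask) {
                        if (!vcpu_running(cpu)) {
                                queue_flush_on_enter(cpu);
                                smp_mb();       /* order the flag write against the re-read */
                                if (!vcpu_running(cpu))
                                        /* still sleeping: hypervisor flushes on entry */
                                        cpumask_clear_cpu(cpu, mask);
                        }
                }

                if (cpumask_empty(mask))
                        return;

                send_flush_ipi(mask);                   /* handlers clear their bit in mask */

                if (!wait_a_while_for_empty(mask))      /* case 3: target slept meanwhile */
                        goto again;
        }

   The smp_mb() mirrors the one in kvm_flush_tlb_others() below: it orders
   the flush_on_enter store against the re-read of the running state.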
---
 arch/x86/include/asm/kvm_para.h |    3 +-
 arch/x86/include/asm/tlbflush.h |    9 ++++++
 arch/x86/kernel/kvm.c           |    1 +
 arch/x86/kvm/x86.c              |    6 ++++
 arch/x86/mm/tlb.c               |   57 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 75 insertions(+), 1 deletions(-)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index f57b5cc..684a285 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -55,7 +55,8 @@ struct kvm_steal_time {
 
 struct kvm_vcpu_state {
        __u32 state;
-       __u32 pad[15];
+       __u32 flush_on_enter;
+       __u32 pad[14];
 };
 
 #define KVM_VCPU_STATE_ALIGN_BITS 5
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index c0e108e..29470bd 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -119,6 +119,12 @@ static inline void native_flush_tlb_others(const struct cpumask *cpumask,
 {
 }
 
+static inline void kvm_flush_tlb_others(const struct cpumask *cpumask,
+                                       struct mm_struct *mm,
+                                       unsigned long va)
+{
+}
+
 static inline void reset_lazy_tlbstate(void)
 {
 }
@@ -145,6 +151,9 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 void native_flush_tlb_others(const struct cpumask *cpumask,
                             struct mm_struct *mm, unsigned long va);
 
+void kvm_flush_tlb_others(const struct cpumask *cpumask,
+                         struct mm_struct *mm, unsigned long va);
+
 #define TLBSTATE_OK    1
 #define TLBSTATE_LAZY  2
 
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index bb686a6..66db54e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -465,6 +465,7 @@ void __init kvm_guest_init(void)
        }
 
        has_vcpu_state = 1;
+       pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
 
 #ifdef CONFIG_SMP
        smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 60546e9..6c42056 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1549,6 +1549,11 @@ static void kvm_set_vcpu_state(struct kvm_vcpu *vcpu)
                return;
 
        vs->state = 1;
+       if (vs->flush_on_enter) {
+               kvm_mmu_flush_tlb(vcpu);
+               vs->flush_on_enter = 0;
+       }
+
        kvm_write_guest_cached(vcpu->kvm, ghc, vs, 2*sizeof(__u32));
        smp_wmb();
 }
@@ -1561,6 +1566,7 @@ static void kvm_clear_vcpu_state(struct kvm_vcpu *vcpu)
        if (!(vcpu->arch.v_state.msr_val & KVM_MSR_ENABLED))
                return;
 
+       vs->flush_on_enter = 0;
        vs->state = 0;
        kvm_write_guest_cached(vcpu->kvm, ghc, vs, 2*sizeof(__u32));
        smp_wmb();
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d6c0418..91ae34e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,6 +6,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/cpu.h>
+#include <linux/kvm_para.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -69,6 +70,7 @@ void leave_mm(int cpu)
 }
 EXPORT_SYMBOL_GPL(leave_mm);
 
+DECLARE_PER_CPU(struct kvm_vcpu_state, vcpu_state) __aligned(64);
 /*
  *
  * The flush IPI assumes that a thread switch happens in this order:
@@ -202,6 +204,61 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
                raw_spin_unlock(&f->tlbstate_lock);
 }
 
+void kvm_flush_tlb_others(const struct cpumask *cpumask,
+                       struct mm_struct *mm, unsigned long va)
+{
+       unsigned int sender;
+       union smp_flush_state *f;
+       int cpu, loop;
+       struct kvm_vcpu_state *v_state;
+
+       /* Caller has disabled preemption */
+       sender = this_cpu_read(tlb_vector_offset);
+       f = &flush_state[sender];
+
+       if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
+               raw_spin_lock(&f->tlbstate_lock);
+
+       f->flush_mm = mm;
+       f->flush_va = va;
+       if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
+               /*
+                * Send the IPI only to the affected online vCPUs; for
+                * pre-empted vCPUs, queue the flush via flush_on_enter
+                * instead.
+                */
+again:
+               for_each_cpu(cpu, to_cpumask(f->flush_cpumask)) {
+                       v_state = &per_cpu(vcpu_state, cpu);
+
+                       if (!v_state->state) {
+                               v_state->flush_on_enter = 1;
+                               smp_mb();
+                               if (!v_state->state)
+                                       cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
+                       }
+               }
+
+               if (cpumask_empty(to_cpumask(f->flush_cpumask)))
+                       goto out;
+
+               apic->send_IPI_mask(to_cpumask(f->flush_cpumask),
+                                   INVALIDATE_TLB_VECTOR_START + sender);
+
+               loop = 1000;
+               while (!cpumask_empty(to_cpumask(f->flush_cpumask)) && --loop)
+                       cpu_relax();
+
+               if (!cpumask_empty(to_cpumask(f->flush_cpumask)))
+                       goto again;
+       }
+out:
+       f->flush_mm = NULL;
+       f->flush_va = 0;
+       if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
+               raw_spin_unlock(&f->tlbstate_lock);
+}
+
 void native_flush_tlb_others(const struct cpumask *cpumask,
                             struct mm_struct *mm, unsigned long va)
 {
