From: Rik van Riel <r...@redhat.com>

When the FPU register set has not been touched by anybody else
since the task's state was last loaded, we can lazily skip the
restore.

Intel has a number of clever optimizations (XSAVEOPT's init and
modified optimizations, for instance) to reduce the FPU restore
overhead, but those optimizations do not work across the guest/host
virtualization boundary, and even on bare metal it should be faster
to skip the restore entirely. A user-space sketch of the ownership
check this relies on follows below, after the '---' marker.

This code is still BROKEN. I am not yet sure why.

Signed-off-by: Rik van Riel <r...@redhat.com>
---
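To make the idea concrete outside of kernel context, here is a
minimal user-space sketch of the ownership check that
fpu_lazy_skip_restore() performs in the patch. All of the toy_*
names, and the two-element owner array standing in for the per-cpu
fpu_fpregs_owner_ctx, are invented for the example; only the
three-way fpstate_active / last_cpu / owner test mirrors the patch.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the kernel's per-task FPU bookkeeping; the fields
 * mirror struct fpu, but this is a user-space sketch, not kernel
 * code. */
struct toy_fpu {
	int last_cpu;		/* CPU this state was last loaded on */
	bool fpstate_active;	/* the task has valid FPU state at all */
	bool fpregs_active;	/* that state currently lives in registers */
};

static int current_cpu;			/* raw_smp_processor_id() stand-in */
static struct toy_fpu *owner[2];	/* fpu_fpregs_owner_ctx, per "CPU" */

/* The check from the patch: the restore may be skipped only when
 * this task's state was the last thing loaded on this CPU *and*
 * this CPU still considers this task the owner of the registers. */
static bool lazy_skip_restore(struct toy_fpu *fpu)
{
	bool still_loaded = fpu->fpstate_active &&
			    fpu->last_cpu == current_cpu &&
			    owner[current_cpu] == fpu;

	fpu->fpregs_active = still_loaded;
	return still_loaded;
}

static void load_fpregs(struct toy_fpu *fpu)
{
	/* Stands in for the expensive __copy_kernel_to_fpregs(). */
	fpu->last_cpu = current_cpu;
	owner[current_cpu] = fpu;
	fpu->fpstate_active = true;
	fpu->fpregs_active = true;
}

int main(void)
{
	struct toy_fpu a = { .last_cpu = -1 }, b = { .last_cpu = -1 };

	load_fpregs(&a);
	printf("a, untouched since: skip=%d\n", lazy_skip_restore(&a)); /* 1 */

	load_fpregs(&b);	/* someone else used the registers */
	printf("a after b ran:      skip=%d\n", lazy_skip_restore(&a)); /* 0 */

	load_fpregs(&a);
	current_cpu = 1;	/* "a" migrates to another CPU */
	printf("a after migration:  skip=%d\n", lazy_skip_restore(&a)); /* 0 */

	return 0;
}

The double bookkeeping (per-task last_cpu plus per-CPU owner pointer)
is what lets either side invalidate the other: migration makes
last_cpu stale, while another task loading its state flips the
owner slot.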
 arch/x86/include/asm/fpu/internal.h | 13 +++++++++++++
 arch/x86/kernel/process.c           |  3 +++
 arch/x86/kvm/x86.c                  |  8 +++++++-
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index b5accb35e434..f69960e9aea1 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -575,6 +575,19 @@ static inline void fpregs_deactivate(struct fpu *fpu)
 }
 
 /*
+ * Check whether an FPU's register set is still loaded in the CPU.
+ */
+static inline bool fpu_lazy_skip_restore(struct fpu *fpu)
+{
+       bool still_loaded = (fpu->fpstate_active &&
+                            fpu->last_cpu == raw_smp_processor_id() &&
+                            __this_cpu_read(fpu_fpregs_owner_ctx) == fpu);
+
+       fpu->fpregs_active = still_loaded;
+       return still_loaded;
+}
+
+/*
  * FPU state switching for scheduling.
  *
  * This is a three-stage process:
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 087413be39cf..6b72415e400f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -208,6 +208,9 @@ void switch_fpu_return(void)
                  (use_eager_fpu() || fpu->counter > 5);
 
        if (preload) {
+               if (fpu_lazy_skip_restore(fpu))
+                       return;
+
                prefetch(&fpu->state);
                fpu->counter++;
                __fpregs_activate(fpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 55c82d066d3a..16ebcd12edf7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7346,7 +7346,12 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 
        vcpu->guest_fpu_loaded = 1;
        __kernel_fpu_begin(fpu);
-       __copy_kernel_to_fpregs(&fpu->state);
+
+       if (!fpu_lazy_skip_restore(fpu)) {
+               fpu->last_cpu = raw_smp_processor_id();
+               __copy_kernel_to_fpregs(&fpu->state);
+       }
+
        trace_kvm_fpu(1);
 }
 
@@ -7358,6 +7363,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
        }
 
        vcpu->guest_fpu_loaded = 0;
+       vcpu->arch.guest_fpu.fpregs_active = 0;
        copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
        __kernel_fpu_end();
        ++vcpu->stat.fpu_reload;
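
As a sanity check on the intent (not on the still-broken details),
the KVM path can be modeled with the same toy bookkeeping; again,
every name below is invented for the example. Five back-to-back
vcpu load/put cycles on one CPU pay for a single restore instead
of five:

#include <stdbool.h>
#include <stdio.h>

struct toy_fpu {
	int last_cpu;
	bool fpstate_active;
	bool fpregs_active;
};

static int current_cpu;		/* a single "CPU" is enough here */
static struct toy_fpu *owner;	/* fpu_fpregs_owner_ctx stand-in */
static int restores;		/* counts the expensive reloads */

static bool lazy_skip_restore(struct toy_fpu *fpu)
{
	bool still_loaded = fpu->fpstate_active &&
			    fpu->last_cpu == current_cpu &&
			    owner == fpu;
	fpu->fpregs_active = still_loaded;
	return still_loaded;
}

/* Shape of kvm_load_guest_fpu() with the patch applied. */
static void load_guest_fpu(struct toy_fpu *guest)
{
	if (!lazy_skip_restore(guest)) {
		guest->last_cpu = current_cpu;
		owner = guest;
		guest->fpstate_active = true;
		guest->fpregs_active = true;
		restores++;	/* __copy_kernel_to_fpregs() stand-in */
	}
}

/* Shape of kvm_put_guest_fpu(): the state is saved back to memory
 * (elided here) and fpregs_active is cleared, but last_cpu and the
 * owner pointer are left alone, so the next load can still skip. */
static void put_guest_fpu(struct toy_fpu *guest)
{
	guest->fpregs_active = false;
}

int main(void)
{
	struct toy_fpu guest = { .last_cpu = -1 };
	int i;

	for (i = 0; i < 5; i++) {	/* five VM entry/exit cycles */
		load_guest_fpu(&guest);
		put_guest_fpu(&guest);
	}
	printf("restores across 5 entries: %d\n", restores); /* prints 1 */
	return 0;
}

This only models the happy path; the interactions with
__kernel_fpu_begin()/__kernel_fpu_end() and the host context-switch
code are not captured here.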
-- 
2.7.4
