Lately tsc page was implemented but filled with empty
values. This patch setup tsc page scale and offset based
on vcpu tsc, tsc_khz and  HV_X64_MSR_TIME_REF_COUNT value.

The valid tsc page drops HV_X64_MSR_TIME_REF_COUNT msr
reads count to zero which potentially improves performance.

Signed-off-by: Andrey Smetanin <asmeta...@virtuozzo.com>
Reviewed-by: Peter Hornyack <peterhorny...@google.com>
CC: Paolo Bonzini <pbonz...@redhat.com>
CC: Radim Krčmář <rkrc...@redhat.com>
CC: Roman Kagan <rka...@virtuozzo.com>
CC: Denis V. Lunev <d...@openvz.org>
[Computation of TSC page parameters rewritten to use the Linux timekeeper
 parameters. - Paolo]
Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
---
 arch/x86/kvm/hyperv.c | 141 ++++++++++++++++++++++++++++++++++++++++++++------
 arch/x86/kvm/hyperv.h |   3 ++
 arch/x86/kvm/x86.c    |   8 +--
 3 files changed, 133 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index ed5b77f39ffb..e089d1f52dc0 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -756,6 +756,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu 
*vcpu,
        return 0;
 }
 
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas.  Because the KVM
+ * one is more precise, it is a little more complex.  However, converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ *    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *
+ * Hyper-V formula:
+ *    nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ *    ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale / 2^64 =         tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale        =         tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ *    nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *    nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ *    nsec/100 = ticks * scale / 2^64
+ *               - tsc_timestamp * scale / 2^64
+ *               + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ *    offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info 
*hv_clock,
+                                       HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+       u64 max_mul;
+
+       if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+               return false;
+
+       /*
+        * check if scale would overflow, if so we use the time ref counter
+        *    tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+        *    tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+        *    tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+        */
+       max_mul = 100ull << (32 - hv_clock->tsc_shift);
+       if (hv_clock->tsc_to_system_mul >= max_mul)
+               return false;
+
+       /*
+        * Otherwise compute the scale and offset according to the formulas
+        * derived above.
+        */
+       tsc_ref->tsc_scale =
+               mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+                               hv_clock->tsc_to_system_mul,
+                               100);
+
+       tsc_ref->tsc_offset = hv_clock->system_time;
+       do_div(tsc_ref->tsc_offset, 100);
+       tsc_ref->tsc_offset -=
+               mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 
64);
+       return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+                          struct pvclock_vcpu_time_info *hv_clock)
+{
+       struct kvm_hv *hv = &kvm->arch.hyperv;
+       HV_REFERENCE_TSC_PAGE tsc_ref = { 0 };
+       u32 tsc_seq;
+       u64 gfn;
+
+       BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(tsc_ref.tsc_sequence));
+       BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+       if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+               return;
+
+       gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+       /*
+        * Because the TSC parameters only vary when there is a
+        * change in the master clock, do not bother with caching.
+        */
+       if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+                                   &tsc_seq, sizeof(tsc_seq))))
+               return;
+
+       /*
+        * While we're computing and writing the parameters, force the
+        * guest to use the time reference count MSR.
+        */
+       if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+                           &tsc_ref, sizeof(tsc_ref.tsc_sequence)))
+               return;
+
+       if (!compute_tsc_page_parameters(hv_clock, &tsc_ref))
+               return;
+
+       /* Ensure sequence is zero before writing the rest of the struct.  */
+       smp_wmb();
+       if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &tsc_ref, sizeof(tsc_ref)))
+               return;
+
+       /*
+        * Now switch to the TSC page mechanism by writing the sequence.
+        */
+       if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+               tsc_seq = 1;
+       else
+               tsc_seq++;
+
+       /* Write the struct entirely before the non-zero sequence.  */
+       smp_wmb();
+
+       kvm_write_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq));
+}
+
 static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
                             bool host)
 {
@@ -793,23 +916,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 
msr, u64 data,
                mark_page_dirty(kvm, gfn);
                break;
        }
-       case HV_X64_MSR_REFERENCE_TSC: {
-               u64 gfn;
-               HV_REFERENCE_TSC_PAGE tsc_ref;
-
-               memset(&tsc_ref, 0, sizeof(tsc_ref));
+       case HV_X64_MSR_REFERENCE_TSC:
                hv->hv_tsc_page = data;
-               if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-                       break;
-               gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-               if (kvm_write_guest(
-                               kvm,
-                               gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-                               &tsc_ref, sizeof(tsc_ref)))
-                       return 1;
-               mark_page_dirty(kvm, gfn);
+               if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+                       kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
                break;
-       }
        case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
                return kvm_hv_msr_set_crash_data(vcpu,
                                                 msr - HV_X64_MSR_CRASH_P0,
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60eccd4bd1d3..cd1119538add 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu 
*vcpu)
 
 void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
 
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+                          struct pvclock_vcpu_time_info *hv_clock);
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2edcfa054cbe..2e792bf80912 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1881,10 +1881,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
        vcpu->hv_clock.flags = pvclock_flags;
 
-       if (!vcpu->pv_time_enabled)
-               return 0;
-
-       kvm_setup_pvclock_page(v);
+       if (vcpu->pv_time_enabled)
+               kvm_setup_pvclock_page(v);
+       if (v == kvm_get_vcpu(v->kvm, 0))
+               kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
        return 0;
 }
 
-- 
1.8.3.1

Reply via email to