Add the framework for a preliminary way to cope with CPUs which have
different TSC offsets from each other.  The TSC delta is measured
(in cross-cache-directions for highest accuracy) and stored.

Signed-off-by: Zachary Amsden <[email protected]>
---
 arch/x86/kvm/x86.c |  202 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 202 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index faa467d..95e43b9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -730,6 +730,196 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct 
pvclock_vcpu_time_info *
 }
 
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
+static DEFINE_PER_CPU(unsigned long, cpu_tsc_multiplier);
+static DEFINE_PER_CPU(int, cpu_tsc_shift);
+static DEFINE_PER_CPU(s64, cpu_tsc_offset);
+static DEFINE_PER_CPU(u64, cpu_tsc_measure_base);
+static int tsc_base_cpu = -1;
+static unsigned long ref_tsc_khz;
+
+static inline unsigned long div_precise(unsigned long hi, unsigned long lo,
+       unsigned long divisor, unsigned long *rptr)
+{
+       unsigned long quotient, remainder;
+       __asm__ ( "div %4"
+               : "=a" (quotient), "=d" (remainder)
+               : "0" (lo), "1" (hi), "rm" (divisor));
+       *rptr = remainder;
+       return quotient;
+}
+
+/*
+ * compute the best multiplier and shift pair m,s, such that for n,
+ *   n * a / b  = (n * m) >> s
+ */
+static void compute_best_multiplier(unsigned long a, unsigned long b,
+       unsigned long *m, int *s)
+{
+       int shift, bit;
+       unsigned long lo, hi, remainder, mult;
+
+       /*
+        * By pre-shifting and using maximum machine width, we get the most
+        * bits of precision.
+        */
+       shift = BITS_PER_LONG + fls(b) - fls(a) - 1;
+       if (shift > BITS_PER_LONG)
+               shift = BITS_PER_LONG;
+       if (shift < 0)
+               shift = 0;
+       lo = a << shift;
+       hi = a >> (BITS_PER_LONG - shift);
+       mult = div_precise(hi, lo, b, &remainder);
+
+       /* See if it can be further simplified */
+       bit = __ffs(mult);
+       if (bit > shift)
+               bit = shift;
+       mult >>= bit;
+       shift -= bit;
+       *m = mult;
+       *s = shift;
+}
+
+static inline unsigned long mult_precise(unsigned long val, unsigned long mult,
+       int shift)
+{
+       unsigned long top, bot;
+
+       __asm__ ( "mul %3; shrd %1, %0" :
+                "=&a" (bot), "=&d" (top) :
+                "0" (mult), "rm" (val), "c" (shift));
+       return bot;
+}
+
+static inline u64 compute_ref_tsc(int cpu)
+{
+       u64 tsc = native_read_tsc() - per_cpu(cpu_tsc_measure_base, cpu);
+       tsc = mult_precise(tsc, per_cpu(cpu_tsc_multiplier, cpu),
+                               per_cpu(cpu_tsc_shift, cpu));
+       return tsc + per_cpu(cpu_tsc_offset, cpu);
+}
+
+#define SYNC_TRIES 64
+
+/*
+ * sync_tsc_helper is a dual-entry coroutine meant to be run by only
+ * two CPUs at a time, one of which is a measuring CPU.  Both CPUs
+ * synchronize entry and exit as well as the central recording loop
+ * using only memory barriers and atomic variables to avoid lock latency.
+ *
+ * To discount cache latency effects, this routine will be called
+ * twice, once with the measure / recording CPUs reversed.  In addition,
+ * the first 4 and last 2 results will be discarded to allow branch
+ * prediction to become established (and to discount effects from
+ * a potentially early predicted loop exit).
+ *
+ * Because of this, we must be extra careful to guard the entrance
+ * and exit against the CPU switch.  I chose to use atomic instructions
+ * only at the end of the measure loop and use the same routine for
+ * both CPUs, with symmetric comparisons, and a statically placed
+ * recording array, hopefully maximizing the branch prediction and
+ * cache locality.  The results appear quite good; on known to be
+ * synchronized CPUs, I typically get < 10 TSC delta measured, with
+ * maximum observed error on the order of 100 cycles.
+ *
+ * This doesn't account for NUMA cache effects, and could potentially
+ * be improved there by moving the delta[] array to the stack of the
+ * measuring CPU.  In fact, this modification might be worth trying
+ * for non-NUMA systems as well, but this appears to be fine for now.
+ */
+static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
+{
+       int tries;
+       static u64 tsc_other;
+       int junk = 0;
+       u64 tsc;
+       int cpu = raw_smp_processor_id();
+
+       if (cpu == measure_cpu) {
+               atomic_set(ready, 0);
+               while (!atomic_read(ready))
+                       /* wait */;
+       } else {
+               while (atomic_read(ready))
+                       /* wait */;
+               atomic_set(ready, 1);
+       }
+       for (tries = 0; tries < SYNC_TRIES; tries++) {
+               mb();
+               if (cpu == measure_cpu) {
+                       atomic_set(ready, 0);
+               } else {
+                       while (atomic_read(ready))
+                               /* wait */;
+               }
+               native_cpuid(&junk, &junk, &junk, &junk);
+               tsc = compute_ref_tsc(cpu);
+               rdtsc_barrier();
+               if (cpu == measure_cpu) {
+                       while (!atomic_read(ready))
+                               /* wait */;
+                       rmb();
+                       delta[tries] = tsc - tsc_other;
+               } else {
+                       tsc_other = tsc;
+                       wmb();
+                       atomic_set(ready, 1);
+               }
+       }
+       if (cpu == measure_cpu)
+               atomic_dec(ready);
+       else
+               atomic_inc(ready);
+       while (atomic_read(ready) != 1)
+               /* wait */;
+       mb();
+}
+
+static void kvm_sync_tsc(void *cpup)
+{
+       int new_cpu = *(int *)cpup;
+       unsigned long flags;
+       static s64 delta[SYNC_TRIES*2];
+       static atomic_t ready = ATOMIC_INIT(1);
+
+       BUG_ON(tsc_base_cpu == -1);
+       pr_debug("%s: IN, cpu = %d, freq = %ldkHz, tsc_base_cpu = %d\n", 
__func__, raw_smp_processor_id(), per_cpu(cpu_tsc_khz, raw_smp_processor_id()) 
, tsc_base_cpu);
+       local_irq_save(flags);
+       if (raw_smp_processor_id() == new_cpu) {
+               per_cpu(cpu_tsc_measure_base, new_cpu) = native_read_tsc();
+               per_cpu(cpu_tsc_offset, new_cpu) = 0;
+               compute_best_multiplier(ref_tsc_khz,
+                       per_cpu(cpu_tsc_khz, new_cpu),
+                       &per_cpu(cpu_tsc_multiplier, new_cpu),
+                       &per_cpu(cpu_tsc_shift, new_cpu));
+       }
+       sync_tsc_helper(tsc_base_cpu, delta, &ready);
+       sync_tsc_helper(new_cpu, &delta[SYNC_TRIES], &ready);
+       if (raw_smp_processor_id() == new_cpu) {
+               int i;
+               s64 accumulator = 0;
+
+               /*
+                * accumulate entries [4, SYNC_TRIES-2) of tsc{base} - tsc{new}
+                * subtract   entries [4, SYNC_TRIES-2) of tsc{new} - tsc{base}
+                *
+                * this allows instruction cycle and cache differences to
+                * cancel each other out and drops warm up/cool down variation
+                *
+                * Note the arithmetic must be signed because of the divide
+                */
+
+               for (i = 4; i < SYNC_TRIES - 2; i++)
+                       accumulator += delta[i];
+               for (i = 4; i < SYNC_TRIES - 2; i++)
+                       accumulator -= delta[i+SYNC_TRIES];
+               accumulator = accumulator / (SYNC_TRIES*2-12);
+               per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
+               pr_debug("%s: OUT, cpu = %d, cpu_tsc_offset = %lld, 
cpu_tsc_multiplier=%ld, cpu_tsc_shift=%d\n", __func__, raw_smp_processor_id(), 
per_cpu(cpu_tsc_offset, new_cpu), per_cpu(cpu_tsc_multiplier, new_cpu), 
per_cpu(cpu_tsc_shift, new_cpu));
+       }
+       local_irq_restore(flags);
+}
 
 static void kvm_write_guest_time(struct kvm_vcpu *v)
 {
@@ -3352,6 +3542,18 @@ static void kvm_timer_init(void)
                for_each_possible_cpu(cpu)
                        per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
        }
+       tsc_base_cpu = get_cpu();
+       ref_tsc_khz = per_cpu(cpu_tsc_khz, tsc_base_cpu);
+       per_cpu(cpu_tsc_multiplier, tsc_base_cpu) = 1;
+       per_cpu(cpu_tsc_shift, tsc_base_cpu) = 0;
+       per_cpu(cpu_tsc_offset, tsc_base_cpu) = 0;
+       for_each_online_cpu(cpu)
+               if (cpu != tsc_base_cpu) {
+                       smp_call_function_single(cpu, kvm_sync_tsc,
+                                                (void *)&cpu, 0);
+                       kvm_sync_tsc((void *)&cpu);
+               }
+       put_cpu();
 }
 
 int kvm_arch_init(void *opaque)
-- 
1.6.5.2

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to