Use per-cpu delta counters and statistically eliminate outliers which
might happen due to platform anomalies such as NMI.

Signed-off-by: Zachary Amsden <zams...@redhat.com>
---
 arch/x86/kvm/x86.c |  100 +++++++++++++++++++++++++++++++++++++---------------
 1 files changed, 71 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4266f..c66dede 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -840,6 +840,11 @@ again:
 EXPORT_SYMBOL_GPL(kvm_get_ref_tsc);
 
 #define SYNC_TRIES 64
+struct cpu_delta_array {
+       s64 delta[SYNC_TRIES];
+};
+static DEFINE_PER_CPU(struct cpu_delta_array, delta_array);
+
 
 /*
  * sync_tsc_helper is a dual-entry coroutine meant to be run by only
@@ -849,25 +854,16 @@ EXPORT_SYMBOL_GPL(kvm_get_ref_tsc);
  *
  * To discount cache latency effects, this routine will be called
  * twice, one with the measure / recording CPUs reversed.  In addition,
- * the first 4 and last 2 results will be discarded to allow branch
- * predicition to become established (and to discount effects from
- * a potentially early predicted loop exit).
+ * the first and last results will be discarded to allow branch prediction
+ * to become established (and to discount effects from a potentially early
+ * predicted loop exit).
  *
- * Because of this, we must be extra careful to guard the entrance
- * and exit against the CPU switch.  I chose to use atomic instructions
- * only at the end of the measure loop and use the same routine for
- * both CPUs, with symmetric comparisons, and a statically placed
- * recording array, hopefully maximizing the branch predicition and
- * cache locality.  The results appear quite good; on known to be
- * synchronized CPUs, I typically get < 10 TSC delta measured, with
- * maximum observed error on the order of 100 cycles.
- *
- * This doesn't account for NUMA cache effects, and could potentially
- * be improved there by moving the delta[] array to the stack of the
- * measuring CPU.  In fact, this modification might be worth trying
- * for non-NUMA systems as well, but this appears to be fine for now.
+ * We allocate the delta array as a per-cpu variable to get local cache
+ * allocation, and also take steps to statistically ignore outliers which
+ * might be caused by SMIs.  It may seem like overkill, but it is very
+ * accurate, which is what we're aiming for.
  */
-static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
+static void sync_tsc_helper(int measure_cpu, s64 *delta, atomic_t *ready)
 {
        int tries;
        static u64 tsc_other;
@@ -915,12 +911,59 @@ static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
        mb();
 }
 
+/*
+ * Average and trim the samples of any outliers; we use > 2 x sigma
+ */
+static s64 average_samples(s64 *samples, unsigned num_samples)
+{
+       unsigned i, j;
+       s64 mean;
+       u64 stddev;
+       s64 average;
+
+       /* First get the mean */
+       mean = 0;
+       for (i = 0; i < num_samples; i++)
+               mean += samples[i];
+       mean = mean / num_samples;
+
+       /* Now the deviation */
+       stddev = 0;
+       for (i = 0; i < num_samples; i++) {
+               s64 dev = samples[i] - mean;
+               stddev += dev * dev;
+       }
+       stddev = stddev / (num_samples - 1);
+       stddev = int_sqrt(stddev);
+
+       /* Throw out anything outside 2x stddev */
+       stddev <<= 1;
+       average = 0, j = 0;
+       for (i = 0; i < num_samples; i++) {
+               s64 dev = samples[i] - mean;
+               if (dev < 0)
+                       dev = -dev;
+               if (dev <= stddev) {
+                       average += samples[i];
+                       j++;
+               }
+       }
+       if (j > 0)
+               average = average / j;
+       else
+               printk(KERN_ERR "kvm: TSC samples failed to converge!\n");
+       pr_debug("%s: mean = %lld, stddev = %llu, average = %lld\n",
+                __func__, mean, stddev, average);
+
+       return average;
+}
+
 static void kvm_sync_tsc(void *cpup)
 {
        int new_cpu = *(int *)cpup;
        unsigned long flags;
-       static s64 delta[SYNC_TRIES*2];
-       static atomic_t ready = ATOMIC_INIT(1);
+       s64 *delta1, *delta2;
+       static atomic_t ready ____cacheline_aligned = ATOMIC_INIT(1);
 
        BUG_ON(tsc_base_cpu == -1);
        pr_debug("%s: IN, cpu = %d, freq = %ldkHz, tsc_base_cpu = %d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_khz, raw_smp_processor_id()), tsc_base_cpu);
@@ -933,27 +976,26 @@ static void kvm_sync_tsc(void *cpup)
                        &per_cpu(cpu_tsc_multiplier, new_cpu),
                        &per_cpu(cpu_tsc_shift, new_cpu));
        }
-       sync_tsc_helper(tsc_base_cpu, delta, &ready);
-       sync_tsc_helper(new_cpu, &delta[SYNC_TRIES], &ready);
+       delta1 = per_cpu(delta_array, tsc_base_cpu).delta;
+       delta2 = per_cpu(delta_array, new_cpu).delta;
+       sync_tsc_helper(tsc_base_cpu, delta1, &ready);
+       sync_tsc_helper(new_cpu, delta2, &ready);
        if (raw_smp_processor_id() == new_cpu) {
-               int i;
                s64 accumulator = 0;
 
                /*
-                * accumulate [SYNC_TRIES+4,-2) of tsc{base} - tsc{new}
-                * subtract   [SYNC_TRIES+4,-2) of tsc{new} - tsc{base}
+                * accumulate [2,SYNC_TRIES-1) of tsc{base} - tsc{new}
+                * subtract   [2,SYNC_TRIES-1) of tsc{new} - tsc{base}
                 *
                 * this allows instruction cycle and cache differences to
                 * cancel each other out and drops warm up/cool down variation
                 *
+                * Note the arithmetic must be signed because of the divide
                 */
+               accumulator += average_samples(&delta1[2], SYNC_TRIES-3);
+               accumulator -= average_samples(&delta2[2], SYNC_TRIES-3);
+               accumulator /= 2;
 
-               for (i = 4; i < SYNC_TRIES - 2; i++)
-                       accumulator += delta[i];
-               for (i = 4; i < SYNC_TRIES - 2; i++)
-                       accumulator -= delta[i+SYNC_TRIES];
-               accumulator = accumulator / (SYNC_TRIES*2-12);
                per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
                ++per_cpu(cpu_tsc_generation, new_cpu);
                atomic_set(&per_cpu(cpu_tsc_synchronized, new_cpu), 1);
-- 
1.6.5.2

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to