Recently I updated the kernel lock profiling code I've been working on, as
it had been rotting a bit since witness was introduced.  Running my diff on a
KVM VM, I found there was a pretty huge performance impact (10 minutes to
build a kernel instead of 4), which turned out to be because reading the
emulated HPET in KVM is slow, and lock profiling involves a lot of extra
clock reads.  The diff below adds a new TSC-based timecounter implementation
for KVM and Xen to remedy this.

KVM and Xen provide frequently-updated views of system time from the host to
each vcpu in a way that lets the VM get accurate high resolution time without
much work.  Linux calls this mechanism 'pvclock' so I'm doing the same.

The pvclock structure gives you a system time (in nanoseconds), the TSC
reading from when the time was updated, and scaling factors for converting TSC
values to nanoseconds.  Usually you subtract the TSC reading in the pvclock
structure from a current reading, convert that to nanoseconds, and add it to
the system time.  I decided to go the other way, converting the system time
to TSC ticks and adding the raw TSC delta to that, in order to keep all the
available resolution.

Using pvclock as the timecounter reduces the overhead of lock profiling to
almost nothing.  Even without the extra clock reads for lock profiling,
it cuts a few seconds off kernel compile time on a 2-vcpu VM.  I've run it
for ~12 hours without ntpd and the clock keeps time accurately.

One wrinkle here is that the KVM pvclock mechanism requires setup on each vcpu,
so I added a new pvbus function that gets called from cpu_hatch, allowing any
hypervisor-specific setup to happen there.

I still need to try this on Xen, but comments at this stage are welcome.

Index: arch/i386/i386/cpu.c
===================================================================
RCS file: /cvs/src/sys/arch/i386/i386/cpu.c,v
retrieving revision 1.84
diff -u -p -u -p -r1.84 cpu.c
--- arch/i386/i386/cpu.c        30 May 2017 15:11:32 -0000      1.84
+++ arch/i386/i386/cpu.c        16 Jun 2017 06:07:16 -0000
@@ -67,6 +67,7 @@
 #include "lapic.h"
 #include "ioapic.h"
 #include "vmm.h"
+#include "pvbus.h"
 
 #include <sys/param.h>
 #include <sys/timeout.h>
@@ -104,6 +105,10 @@
 #include <machine/i82093var.h>
 #endif
 
+#if NPVBUS > 0
+#include <dev/pv/pvvar.h>
+#endif
+
 #include <dev/ic/mc146818reg.h>
 #include <i386/isa/nvram.h>
 #include <dev/isa/isareg.h>
@@ -626,6 +631,9 @@ cpu_hatch(void *v)
 
        ci->ci_curpmap = pmap_kernel();
        cpu_init(ci);
+#if NPVBUS > 0
+       pvbus_init_vcpu();
+#endif
 
        /* Re-initialise memory range handling on AP */
        if (mem_range_softc.mr_op != NULL)
Index: arch/i386/include/cpufunc.h
===================================================================
RCS file: /cvs/src/sys/arch/i386/include/cpufunc.h,v
retrieving revision 1.25
diff -u -p -u -p -r1.25 cpufunc.h
--- arch/i386/include/cpufunc.h 27 May 2017 12:21:50 -0000      1.25
+++ arch/i386/include/cpufunc.h 16 Jun 2017 06:07:16 -0000
@@ -217,6 +217,19 @@ mfence(void)
        __asm volatile("mfence" : : : "memory");
 }
 
+/*
+ * Read the time stamp counter.  RDTSC returns the upper half in %edx and
+ * the lower half in %eax; combine them into one 64-bit value.
+ */
+static __inline u_int64_t
+rdtsc(void)
+{
+       uint32_t edx, eax;
+
+       __asm volatile("rdtsc" : "=d" (edx), "=a" (eax));
+       return (((u_int64_t)edx << 32) | eax);
+}
+
 static __inline void
 wrmsr(u_int msr, u_int64_t newval)
 {
Index: arch/amd64/amd64/cpu.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
retrieving revision 1.105
diff -u -p -u -p -r1.105 cpu.c
--- arch/amd64/amd64/cpu.c      30 May 2017 15:11:32 -0000      1.105
+++ arch/amd64/amd64/cpu.c      16 Jun 2017 06:07:16 -0000
@@ -67,6 +67,7 @@
 #include "lapic.h"
 #include "ioapic.h"
 #include "vmm.h"
+#include "pvbus.h"
 
 #include <sys/param.h>
 #include <sys/timeout.h>
@@ -103,6 +104,10 @@
 #include <machine/i82093var.h>
 #endif
 
+#if NPVBUS > 0
+#include <dev/pv/pvvar.h>
+#endif
+
 #include <dev/ic/mc146818reg.h>
 #include <amd64/isa/nvram.h>
 #include <dev/isa/isareg.h>
@@ -728,6 +733,9 @@ cpu_hatch(void *v)
        lldt(0);
 
        cpu_init(ci);
+#if NPVBUS > 0
+       pvbus_init_vcpu();
+#endif
 
        /* Re-initialise memory range handling on AP */
        if (mem_range_softc.mr_op != NULL)
Index: dev/pv/files.pv
===================================================================
RCS file: /cvs/src/sys/dev/pv/files.pv,v
retrieving revision 1.13
diff -u -p -u -p -r1.13 files.pv
--- dev/pv/files.pv     14 Jun 2017 10:25:40 -0000      1.13
+++ dev/pv/files.pv     16 Jun 2017 06:07:16 -0000
@@ -75,3 +75,6 @@ file  dev/pv/vioscsi.c                vioscsi
 device vmmci
 attach vmmci at virtio
 file   dev/pv/vmmci.c                  vmmci
+
+# paravirtualized clock, used by kvm and xen
+file    dev/pv/pvclock.c
Index: dev/pv/pvbus.c
===================================================================
RCS file: /cvs/src/sys/dev/pv/pvbus.c,v
retrieving revision 1.16
diff -u -p -u -p -r1.16 pvbus.c
--- dev/pv/pvbus.c      10 Jan 2017 17:16:39 -0000      1.16
+++ dev/pv/pvbus.c      16 Jun 2017 06:07:16 -0000
@@ -57,6 +57,7 @@ int    pvbus_print(void *, const char *);
 int     pvbus_search(struct device *, void *, void *);
 
 void    pvbus_kvm(struct pvbus_hv *);
+void    pvbus_kvm_init_vcpu(struct pvbus_hv *);
 void    pvbus_hyperv(struct pvbus_hv *);
 void    pvbus_hyperv_print(struct pvbus_hv *);
 void    pvbus_xen(struct pvbus_hv *);
@@ -84,8 +85,9 @@ struct pvbus_type {
        const char      *name;
        void            (*init)(struct pvbus_hv *);
        void            (*print)(struct pvbus_hv *);
+       void            (*init_vcpu)(struct pvbus_hv *);
 } pvbus_types[PVBUS_MAX] = {
-       { "KVMKVMKVM\0\0\0",    "KVM",  pvbus_kvm },
+       { "KVMKVMKVM\0\0\0",    "KVM",  pvbus_kvm, NULL, pvbus_kvm_init_vcpu },
        { "Microsoft Hv",       "Hyper-V", pvbus_hyperv, pvbus_hyperv_print },
        { "VMwareVMware",       "VMware" },
        { "XenVMMXenVMM",       "Xen",  pvbus_xen, pvbus_xen_print },
@@ -210,6 +212,23 @@ pvbus_identify(void)
                 has_hv_cpuid = 1;
 }
 
+/*
+ * Run per-cpu hypervisor setup on the calling cpu: invoke the init_vcpu
+ * hook, if any, of every hypervisor type whose hv_base is non-zero.
+ */
+void
+pvbus_init_vcpu(void)
+{
+       int i;
+
+       for (i = 0; i < PVBUS_MAX; i++) {
+               if (pvbus_hv[i].hv_base == 0)
+                       continue;
+               if (pvbus_types[i].init_vcpu != NULL)
+                       (pvbus_types[i].init_vcpu)(&pvbus_hv[i]);
+       }
+}
+
 int
 pvbus_activate(struct device *self, int act)
 {
@@ -287,6 +306,24 @@ pvbus_kvm(struct pvbus_hv *hv)
        CPUID(hv->hv_base + CPUID_OFFSET_KVM_FEATURES,
            regs[0], regs[1], regs[2], regs[3]);
        hv->hv_features = regs[0];
+
+       /*
+        * NOTE(review): the KVM_FEATURE_* constants in pvreg.h look like bit
+        * numbers (e.g. PV_UNHALT is 7), so build a mask before testing.
+        */
+       if (hv->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2))
+               pvclock_kvm_init();
+}
+
+/*
+ * Per-cpu KVM setup, called through pvbus_init_vcpu(): set up the calling
+ * cpu's pvclock area when the host advertises the clocksource feature.
+ */
+void
+pvbus_kvm_init_vcpu(struct pvbus_hv *hv)
+{
+       if (hv->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2))
+               pvclock_kvm_init_vcpu();
 }
 
 void
@@ -328,6 +365,8 @@ pvbus_xen(struct pvbus_hv *hv)
                /* Remove CPU flag for x2apic */
                cpu_ecxfeature &= ~CPUIDECX_X2APIC;
        }
+
+       pvclock_xen_init();
 }
 
 void
Index: dev/pv/pvclock.c
===================================================================
RCS file: dev/pv/pvclock.c
diff -N dev/pv/pvclock.c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ dev/pv/pvclock.c    16 Jun 2017 06:07:16 -0000
@@ -0,0 +1,248 @@
+/*     $OpenBSD$ */
+/*
+ * Copyright (c) 2017 Jonathan Matthew <jmatt...@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/timetc.h>
+#include <sys/atomic.h>
+#include <sys/tree.h>
+#include <sys/task.h>
+
+#include <machine/bus.h>
+#include <machine/cpufunc.h>
+
+#include <uvm/uvm_extern.h>
+
+#include "xen.h"
+
+#include <dev/pv/pvreg.h>
+#include <dev/pv/xenreg.h>
+#include <dev/pv/xenvar.h>
+
+u_int pvclock_kvm_get_timecount(struct timecounter *tc);
+u_int pvclock_xen_get_timecount(struct timecounter *tc);
+
+/* KVM timecounter; pvclock_kvm_init() points tc_priv at the time records. */
+struct timecounter pvclock_kvm_timecounter = {
+       pvclock_kvm_get_timecount, NULL, ~0u, 0, "kvmclock", 2000, NULL
+};
+
+/* Xen timecounter; reads time records from the xen(4) shared info page. */
+struct timecounter pvclock_xen_timecounter = {
+       pvclock_xen_get_timecount, NULL, ~0u, 0, "xenclock", 2000, NULL
+};
+
+/*
+ * Convert nanoseconds to TSC ticks by inverting the pvclock scaling
+ * ns = ((ticks << shift) * mul) >> 32, where a negative shift means a
+ * right shift.  Returns 0 when mul is 0 rather than dividing by zero.
+ */
+uint64_t
+pvclock_ns_to_tsc(uint64_t ns, uint32_t mul, int shift)
+{
+       uint64_t quot, rem, ticks;
+
+       if (mul == 0)
+               return (0);
+
+       /*
+        * (ns << 32) / mul overflows 64 bits once ns reaches 2^32 (about
+        * 4.3 seconds), so divide first and scale quotient and remainder
+        * separately.  rem < mul < 2^32, so (rem << 32) cannot overflow;
+        * the sum equals the full-width quotient modulo 2^64.
+        */
+       quot = ns / mul;
+       rem = ns % mul;
+       ticks = (quot << 32) + ((rem << 32) / mul);
+
+       if (shift >= 0)
+               ticks >>= shift;
+       else
+               ticks <<= -shift;
+
+       return (ticks);
+}
+
+/*
+ * Read the current time, in TSC ticks, from a pvclock time record: the
+ * record's system time converted to ticks plus the TSC ticks elapsed
+ * since the record was last updated.
+ */
+uint64_t
+pvclock_read(struct vcpu_time_info *time_info)
+{
+       uint32_t v, mul;
+       uint64_t systime, tsc, delta;
+       int shift;
+
+       /*
+        * Take a consistent snapshot: retry if the version changed while
+        * reading, or if it is odd, which the pvclock protocol uses to
+        * mark an update in progress.
+        */
+       do {
+               v = time_info->version;
+               virtio_membar_sync();   /* probably too much */
+               tsc = time_info->tsc_timestamp;
+               systime = time_info->system_time;
+               mul = time_info->tsc_to_system_mul;
+               shift = time_info->tsc_shift;
+               virtio_membar_sync();
+       } while ((v & 1) != 0 || v != time_info->version);
+
+       delta = rdtsc() - tsc;
+       return (pvclock_ns_to_tsc(systime, mul, shift) + delta);
+}
+
+/*
+ * Derive the TSC frequency in Hz from a time record's scaling factors,
+ * i.e. the number of TSC ticks in one second (10^9 ns).
+ */
+uint64_t
+pvclock_get_freq(struct vcpu_time_info *time_info)
+{
+       return (pvclock_ns_to_tsc(1000000000, time_info->tsc_to_system_mul,
+           time_info->tsc_shift));
+}
+
+/*
+ * Timecounter read routine for KVM: reads the calling cpu's slot in the
+ * array of time records.  Returns 0 if pvclock_kvm_init() set nothing up.
+ */
+u_int
+pvclock_kvm_get_timecount(struct timecounter *tc)
+{
+       struct vcpu_time_info *vcpus;
+
+       vcpus = pvclock_kvm_timecounter.tc_priv;
+       if (vcpus == NULL)
+               return (0);
+
+       return (pvclock_read(vcpus + cpu_number()));
+}
+
+/*
+ * Per-cpu setup, reached from cpu_hatch() through pvbus_init_vcpu(): point
+ * KVM at the calling cpu's slot in the time record array, with the low bit
+ * of the MSR value set as in pvclock_kvm_init().
+ */
+void
+pvclock_kvm_init_vcpu(void)
+{
+       struct vcpu_time_info *vcpus;
+       paddr_t vcpu_pa;
+
+       vcpus = pvclock_kvm_timecounter.tc_priv;
+       if (vcpus == NULL)
+               return;
+
+       if (pmap_extract(pmap_kernel(), (vaddr_t)vcpus, &vcpu_pa) == FALSE)
+               return;
+
+       wrmsr(KVM_MSR_SYS_TIME,
+           (vcpu_pa + (cpu_number() * sizeof(*vcpus))) | 1);
+}
+
+/*
+ * Allocate one page of time records, register the boot cpu's record with
+ * KVM and, if the host flags the TSC as stable, install the "kvmclock"
+ * timecounter.  On any failure nothing is registered.
+ */
+void
+pvclock_kvm_init(void)
+{
+       struct vcpu_time_info *vcpus;
+       paddr_t vcpu_pa;
+
+       /* Zeroed so that slots of cpus not yet set up read as empty. */
+       vcpus = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT | M_ZERO);
+       if (vcpus == NULL)
+               return;
+
+       if (pmap_extract(pmap_kernel(), (vaddr_t)vcpus, &vcpu_pa) == FALSE) {
+               free(vcpus, M_DEVBUF, PAGE_SIZE);
+               return;
+       }
+
+       wrmsr(KVM_MSR_SYS_TIME, vcpu_pa | 1);
+       if ((vcpus[0].flags & PVCLOCK_FLAG_TSC_STABLE_BIT) == 0) {
+               /* Stop host updates before the page can be reused. */
+               wrmsr(KVM_MSR_SYS_TIME, 0);
+               free(vcpus, M_DEVBUF, PAGE_SIZE);
+               return;
+       }
+
+       pvclock_kvm_timecounter.tc_frequency = pvclock_get_freq(vcpus);
+       pvclock_kvm_timecounter.tc_priv = vcpus;
+       tc_init(&pvclock_kvm_timecounter);
+}
+
+/*
+ * Timecounter read routine for Xen: reads the calling cpu's time record
+ * from the shared info page.  Returns 0 when built without xen(4) or when
+ * the shared info page is not available.
+ */
+u_int
+pvclock_xen_get_timecount(struct timecounter *tc)
+{
+#if NXEN > 0
+       struct xen_softc *sc = xen_sc;
+       struct shared_info *s;
+       struct vcpu_info *v;
+
+       if (sc == NULL || sc->sc_ipg == NULL)
+               return (0);
+       s = sc->sc_ipg;
+       v = &s->vcpu_info[cpu_number()];
+
+       return (pvclock_read(&v->time));
+#else
+       return 0;
+#endif
+}
+
+/*
+ * Install the "xenclock" timecounter, using the calling cpu's time record
+ * to determine the TSC frequency.  Does nothing when built without xen(4).
+ */
+void
+pvclock_xen_init(void)
+{
+#if NXEN > 0
+       struct xen_softc *sc = xen_sc;
+       struct shared_info *s;
+       struct vcpu_info *v;
+
+       /*
+        * NOTE(review): this is called from pvbus_xen(); bail out rather
+        * than dereference NULL if xen(4) has not set up xen_sc and its
+        * shared info page by then.
+        */
+       if (sc == NULL || sc->sc_ipg == NULL)
+               return;
+       s = sc->sc_ipg;
+       v = &s->vcpu_info[cpu_number()];
+
+       /* Without scaling factors no frequency can be derived. */
+       if (v->time.tsc_to_system_mul == 0)
+               return;
+
+       pvclock_xen_timecounter.tc_frequency = pvclock_get_freq(&v->time);
+       tc_init(&pvclock_xen_timecounter);
+#endif
+}
Index: dev/pv/pvreg.h
===================================================================
RCS file: /cvs/src/sys/dev/pv/pvreg.h,v
retrieving revision 1.4
diff -u -p -u -p -r1.4 pvreg.h
--- dev/pv/pvreg.h      12 Dec 2015 12:33:49 -0000      1.4
+++ dev/pv/pvreg.h      16 Jun 2017 06:07:16 -0000
@@ -40,8 +40,13 @@
 #define        KVM_FEATURE_PV_UNHALT                   7
 #define        KVM_FEATURE_CLOCKSOURCE_STABLE_BIT      24
 
+#define KVM_MSR_SYS_TIME                       0x4b564d01
+
 #define        KVM_MSR_EOI_EN                          0x4b564d04
 #define KVM_PV_EOI_BIT                         0
+
+#define PVCLOCK_FLAG_TSC_STABLE_BIT            (1 << 0)
+#define PVCLOCK_FLAG_GUEST_STOPPED             (1 << 1)
 
 /*
  * Hyper-V
Index: dev/pv/pvvar.h
===================================================================
RCS file: /cvs/src/sys/dev/pv/pvvar.h,v
retrieving revision 1.9
diff -u -p -u -p -r1.9 pvvar.h
--- dev/pv/pvvar.h      10 Jan 2017 17:16:39 -0000      1.9
+++ dev/pv/pvvar.h      16 Jun 2017 06:07:16 -0000
@@ -77,8 +77,13 @@ struct pv_attach_args {
 
 void    pvbus_identify(void);
 int     pvbus_probe(void);
+void    pvbus_init_vcpu(void);
 void    pvbus_reboot(struct device *);
 void    pvbus_shutdown(struct device *);
+
+void    pvclock_kvm_init(void);
+void    pvclock_kvm_init_vcpu(void);
+void    pvclock_xen_init(void);
 
 #endif /* _KERNEL */
 #endif /* _DEV_PV_PVBUS_H_ */
Index: dev/pv/xenreg.h
===================================================================
RCS file: /cvs/src/sys/dev/pv/xenreg.h,v
retrieving revision 1.10
diff -u -p -u -p -r1.10 xenreg.h
--- dev/pv/xenreg.h     14 Sep 2016 17:48:28 -0000      1.10
+++ dev/pv/xenreg.h     16 Jun 2017 06:07:16 -0000
@@ -156,7 +156,8 @@ struct vcpu_time_info {
         */
        uint32_t tsc_to_system_mul;
        int8_t tsc_shift;
-       int8_t pad1[3];
+       uint8_t flags;
+       int8_t pad1[2];
 } __packed; /* 32 bytes */
 
 struct vcpu_info {

Reply via email to