Recently I updated the kernel lock profiling stuff I've been working on, which had been rotting a bit since witness was introduced. Running my diff on a KVM VM, I found there was a pretty huge performance impact (10 minutes to build a kernel instead of 4), which turned out to be because reading the emulated HPET in KVM is slow, and lock profiling involves a lot of extra clock reads. The diff below adds a new TSC-based timecounter implementation for KVM and Xen to remedy this.
KVM and Xen provide frequently-updated views of system time from the host to each vcpu in a way that lets the VM get accurate high-resolution time without much work. Linux calls this mechanism 'pvclock', so I'm doing the same. The pvclock structure gives you a system time (in nanoseconds), the TSC reading from when the time was updated, and scaling factors for converting TSC values to nanoseconds. Usually you subtract the TSC reading in the pvclock structure from a current reading, convert that to nanoseconds, and add it to the system time. I decided to go the other way, converting the system time to TSC units and adding the raw TSC delta to it, in order to keep all the available resolution. Using pvclock as the timecounter reduces the overhead of lock profiling to almost nothing. Even without the extra clock reads for lock profiling, it cuts a few seconds off kernel compile time on a 2 vcpu VM. I've run it for ~12 hours without ntpd and the clock keeps time accurately. One wrinkle here is that the KVM pvclock mechanism requires setup on each vcpu, so I added a new pvbus function that gets called from cpu_hatch, allowing any hypervisor-specific setup to happen there. I still need to try this on Xen, but comments at this stage are welcome.
Index: arch/i386/i386/cpu.c =================================================================== RCS file: /cvs/src/sys/arch/i386/i386/cpu.c,v retrieving revision 1.84 diff -u -p -u -p -r1.84 cpu.c --- arch/i386/i386/cpu.c 30 May 2017 15:11:32 -0000 1.84 +++ arch/i386/i386/cpu.c 16 Jun 2017 06:07:16 -0000 @@ -67,6 +67,7 @@ #include "lapic.h" #include "ioapic.h" #include "vmm.h" +#include "pvbus.h" #include <sys/param.h> #include <sys/timeout.h> @@ -104,6 +105,10 @@ #include <machine/i82093var.h> #endif +#if NPVBUS > 0 +#include <dev/pv/pvvar.h> +#endif + #include <dev/ic/mc146818reg.h> #include <i386/isa/nvram.h> #include <dev/isa/isareg.h> @@ -626,6 +631,9 @@ cpu_hatch(void *v) ci->ci_curpmap = pmap_kernel(); cpu_init(ci); +#if NPVBUS > 0 + pvbus_init_vcpu(); +#endif /* Re-initialise memory range handling on AP */ if (mem_range_softc.mr_op != NULL) Index: arch/i386/include/cpufunc.h =================================================================== RCS file: /cvs/src/sys/arch/i386/include/cpufunc.h,v retrieving revision 1.25 diff -u -p -u -p -r1.25 cpufunc.h --- arch/i386/include/cpufunc.h 27 May 2017 12:21:50 -0000 1.25 +++ arch/i386/include/cpufunc.h 16 Jun 2017 06:07:16 -0000 @@ -217,6 +217,15 @@ mfence(void) __asm volatile("mfence" : : : "memory"); } +static __inline u_int64_t +rdtsc(void) +{ + uint32_t hi, lo; + + __asm volatile("rdtsc" : "=d" (hi), "=a" (lo)); + return (((uint64_t)hi << 32) | (uint64_t) lo); +} + static __inline void wrmsr(u_int msr, u_int64_t newval) { Index: arch/amd64/amd64/cpu.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v retrieving revision 1.105 diff -u -p -u -p -r1.105 cpu.c --- arch/amd64/amd64/cpu.c 30 May 2017 15:11:32 -0000 1.105 +++ arch/amd64/amd64/cpu.c 16 Jun 2017 06:07:16 -0000 @@ -67,6 +67,7 @@ #include "lapic.h" #include "ioapic.h" #include "vmm.h" +#include "pvbus.h" #include <sys/param.h> #include <sys/timeout.h> @@ -103,6 +104,10 @@ #include 
<machine/i82093var.h> #endif +#if NPVBUS > 0 +#include <dev/pv/pvvar.h> +#endif + #include <dev/ic/mc146818reg.h> #include <amd64/isa/nvram.h> #include <dev/isa/isareg.h> @@ -728,6 +733,9 @@ cpu_hatch(void *v) lldt(0); cpu_init(ci); +#if NPVBUS > 0 + pvbus_init_vcpu(); +#endif /* Re-initialise memory range handling on AP */ if (mem_range_softc.mr_op != NULL) Index: dev/pv/files.pv =================================================================== RCS file: /cvs/src/sys/dev/pv/files.pv,v retrieving revision 1.13 diff -u -p -u -p -r1.13 files.pv --- dev/pv/files.pv 14 Jun 2017 10:25:40 -0000 1.13 +++ dev/pv/files.pv 16 Jun 2017 06:07:16 -0000 @@ -75,3 +75,6 @@ file dev/pv/vioscsi.c vioscsi device vmmci attach vmmci at virtio file dev/pv/vmmci.c vmmci + +# paravirtualized clock, used by kvm and xen +file dev/pv/pvclock.c Index: dev/pv/pvbus.c =================================================================== RCS file: /cvs/src/sys/dev/pv/pvbus.c,v retrieving revision 1.16 diff -u -p -u -p -r1.16 pvbus.c --- dev/pv/pvbus.c 10 Jan 2017 17:16:39 -0000 1.16 +++ dev/pv/pvbus.c 16 Jun 2017 06:07:16 -0000 @@ -57,6 +57,7 @@ int pvbus_print(void *, const char *); int pvbus_search(struct device *, void *, void *); void pvbus_kvm(struct pvbus_hv *); +void pvbus_kvm_init_vcpu(struct pvbus_hv *); void pvbus_hyperv(struct pvbus_hv *); void pvbus_hyperv_print(struct pvbus_hv *); void pvbus_xen(struct pvbus_hv *); @@ -84,8 +85,9 @@ struct pvbus_type { const char *name; void (*init)(struct pvbus_hv *); void (*print)(struct pvbus_hv *); + void (*init_vcpu)(struct pvbus_hv *); } pvbus_types[PVBUS_MAX] = { - { "KVMKVMKVM\0\0\0", "KVM", pvbus_kvm }, + { "KVMKVMKVM\0\0\0", "KVM", pvbus_kvm, NULL, pvbus_kvm_init_vcpu }, { "Microsoft Hv", "Hyper-V", pvbus_hyperv, pvbus_hyperv_print }, { "VMwareVMware", "VMware" }, { "XenVMMXenVMM", "Xen", pvbus_xen, pvbus_xen_print }, @@ -210,6 +212,19 @@ pvbus_identify(void) has_hv_cpuid = 1; } +void +pvbus_init_vcpu(void) +{ + int i; + + for (i = 0; i < 
PVBUS_MAX; i++) { + if (pvbus_hv[i].hv_base == 0) + continue; + if (pvbus_types[i].init_vcpu != NULL) + (pvbus_types[i].init_vcpu)(&pvbus_hv[i]); + } +} + int pvbus_activate(struct device *self, int act) { @@ -287,6 +302,16 @@ pvbus_kvm(struct pvbus_hv *hv) CPUID(hv->hv_base + CPUID_OFFSET_KVM_FEATURES, regs[0], regs[1], regs[2], regs[3]); hv->hv_features = regs[0]; + + if (hv->hv_features & KVM_FEATURE_CLOCKSOURCE2) + pvclock_kvm_init(); +} + +void +pvbus_kvm_init_vcpu(struct pvbus_hv *hv) +{ + if (hv->hv_features & KVM_FEATURE_CLOCKSOURCE2) + pvclock_kvm_init_vcpu(); } void @@ -328,6 +353,8 @@ pvbus_xen(struct pvbus_hv *hv) /* Remove CPU flag for x2apic */ cpu_ecxfeature &= ~CPUIDECX_X2APIC; } + + pvclock_xen_init(); } void Index: dev/pv/pvclock.c =================================================================== RCS file: dev/pv/pvclock.c diff -N dev/pv/pvclock.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ dev/pv/pvclock.c 16 Jun 2017 06:07:16 -0000 @@ -0,0 +1,168 @@ +/* $OpenBSD$ */ +/* + * Copyright (c) 2017 Jonathan Matthew <jmatt...@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/timetc.h> +#include <sys/atomic.h> +#include <sys/tree.h> +#include <sys/task.h> + +#include <machine/bus.h> +#include <machine/cpufunc.h> + +#include <uvm/uvm_extern.h> + +#include "xen.h" + +#include <dev/pv/pvreg.h> +#include <dev/pv/xenreg.h> +#include <dev/pv/xenvar.h> + +u_int pvclock_kvm_get_timecount(struct timecounter *tc); +u_int pvclock_xen_get_timecount(struct timecounter *tc); + +struct timecounter pvclock_kvm_timecounter = { + pvclock_kvm_get_timecount, NULL, ~0u, 0, "kvmclock", 2000, NULL +}; + +struct timecounter pvclock_xen_timecounter = { + pvclock_xen_get_timecount, NULL, ~0u, 0, "xenclock", 2000, NULL +}; + +uint64_t +pvclock_ns_to_tsc(uint64_t ns, uint32_t mul, int shift) +{ + ns <<= 32; + ns = ns / mul; + if (shift >= 0) + ns >>= shift; + else + ns <<= -shift; + + return (ns); +} + +uint64_t +pvclock_read(struct vcpu_time_info *time_info) +{ + uint32_t v, mul; + uint64_t systime, tsc, delta; + int shift; + + do { + v = time_info->version; + virtio_membar_sync(); /* probably too much */ + tsc = time_info->tsc_timestamp; + systime = time_info->system_time; + mul = time_info->tsc_to_system_mul; + shift = time_info->tsc_shift; + virtio_membar_sync(); + } while (v != time_info->version); + + delta = rdtsc() - tsc; + return (pvclock_ns_to_tsc(systime, mul, shift) + delta); +} + +uint64_t +pvclock_get_freq(struct vcpu_time_info *time_info) +{ + return (pvclock_ns_to_tsc(1000000000, time_info->tsc_to_system_mul, + time_info->tsc_shift)); +} + +u_int +pvclock_kvm_get_timecount(struct timecounter *tc) +{ + struct vcpu_time_info *vcpus; + vcpus = pvclock_kvm_timecounter.tc_priv; + if (vcpus == NULL) + return (0); + + return (pvclock_read(vcpus + cpu_number())); +} + +void +pvclock_kvm_init_vcpu(void) +{ + struct vcpu_time_info *vcpus; + paddr_t vcpu_pa; + + vcpus = pvclock_kvm_timecounter.tc_priv; + if (vcpus == NULL) + return; + + if 
(pmap_extract(pmap_kernel(), (vaddr_t)vcpus, &vcpu_pa) == FALSE) + return; + + wrmsr(KVM_MSR_SYS_TIME, (vcpu_pa + (cpu_number() * sizeof(*vcpus))) | 1); +} + +void +pvclock_kvm_init(void) +{ + struct vcpu_time_info *vcpus; + paddr_t vcpu_pa; + + vcpus = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); + if (vcpus == NULL) + return; + + if (pmap_extract(pmap_kernel(), (vaddr_t)vcpus, &vcpu_pa) == FALSE) { + free(vcpus, M_DEVBUF, PAGE_SIZE); + return; + } + + wrmsr(KVM_MSR_SYS_TIME, vcpu_pa | 1); + if ((vcpus[0].flags & PVCLOCK_FLAG_TSC_STABLE_BIT) == 0) { + free(vcpus, M_DEVBUF, PAGE_SIZE); + wrmsr(KVM_MSR_SYS_TIME, 0); + return; + } + + pvclock_kvm_timecounter.tc_frequency = pvclock_get_freq(vcpus); + pvclock_kvm_timecounter.tc_priv = vcpus; + tc_init(&pvclock_kvm_timecounter); +} + +u_int +pvclock_xen_get_timecount(struct timecounter *tc) +{ +#if NXEN > 0 + struct xen_softc *sc = xen_sc; + struct shared_info *s = sc->sc_ipg; + struct vcpu_info *v = &s->vcpu_info[cpu_number()]; + + return (pvclock_read(&v->time)); +#else + return 0; +#endif +} + +void +pvclock_xen_init(void) +{ +#if NXEN > 0 + struct xen_softc *sc = xen_sc; + struct shared_info *s = sc->sc_ipg; + struct vcpu_info *v = &s->vcpu_info[cpu_number()]; + + pvclock_xen_timecounter.tc_frequency = pvclock_get_freq(&v->time); + tc_init(&pvclock_xen_timecounter); +#endif +} Index: dev/pv/pvreg.h =================================================================== RCS file: /cvs/src/sys/dev/pv/pvreg.h,v retrieving revision 1.4 diff -u -p -u -p -r1.4 pvreg.h --- dev/pv/pvreg.h 12 Dec 2015 12:33:49 -0000 1.4 +++ dev/pv/pvreg.h 16 Jun 2017 06:07:16 -0000 @@ -40,8 +40,13 @@ #define KVM_FEATURE_PV_UNHALT 7 #define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 +#define KVM_MSR_SYS_TIME 0x4b564d01 + #define KVM_MSR_EOI_EN 0x4b564d04 #define KVM_PV_EOI_BIT 0 + +#define PVCLOCK_FLAG_TSC_STABLE_BIT (1 << 0) +#define PVCLOCK_FLAG_GUEST_STOPPED (1 << 1) /* * Hyper-V Index: dev/pv/pvvar.h 
=================================================================== RCS file: /cvs/src/sys/dev/pv/pvvar.h,v retrieving revision 1.9 diff -u -p -u -p -r1.9 pvvar.h --- dev/pv/pvvar.h 10 Jan 2017 17:16:39 -0000 1.9 +++ dev/pv/pvvar.h 16 Jun 2017 06:07:16 -0000 @@ -77,8 +77,13 @@ struct pv_attach_args { void pvbus_identify(void); int pvbus_probe(void); +void pvbus_init_vcpu(void); void pvbus_reboot(struct device *); void pvbus_shutdown(struct device *); + +void pvclock_kvm_init(void); +void pvclock_kvm_init_vcpu(void); +void pvclock_xen_init(void); #endif /* _KERNEL */ #endif /* _DEV_PV_PVBUS_H_ */ Index: dev/pv/xenreg.h =================================================================== RCS file: /cvs/src/sys/dev/pv/xenreg.h,v retrieving revision 1.10 diff -u -p -u -p -r1.10 xenreg.h --- dev/pv/xenreg.h 14 Sep 2016 17:48:28 -0000 1.10 +++ dev/pv/xenreg.h 16 Jun 2017 06:07:16 -0000 @@ -156,7 +156,8 @@ struct vcpu_time_info { */ uint32_t tsc_to_system_mul; int8_t tsc_shift; - int8_t pad1[3]; + uint8_t flags; + int8_t pad1[2]; } __packed; /* 32 bytes */ struct vcpu_info {