This diff (most of which has been around for a while) changes delay_func
when running in KVM to use pvclock (effectively the TSC) to determine when to
stop spinning.  Since this is done in a KVM-specific driver, it won't have
any effect anywhere other than in KVM guests.

Using pvclock rather than the lapic avoids the hangs caused by KVM's use of
the VMX preemption timer to emulate a lapic.  To summarise that problem:
occasionally KVM fails to restart the lapic timer until it gets an exit from
the guest, and if we're busy polling the lapic, we'll never generate such an
exit, and the lapic counter will never reach the value we're waiting for, at
which point we're stuck.

It also adds a timecounter based on pvclock, with its priority set below
acpihpet, so it won't be used for now.  Later on I'd like to make this
the preferred timecounter as it's much faster to read than the emulated
acpihpet.

A couple of testers have confirmed that this does *not* fix the time slowdowns
seen on KVM guests.  I believe fixing that will require more invasive changes.

Since this fixes a concrete problem, rather than just making me feel better
about doing lots of clock reads in some other work I'm doing, I'd like to get
this in now.

ok?


Index: arch/amd64/conf/GENERIC
===================================================================
RCS file: /cvs/src/sys/arch/amd64/conf/GENERIC,v
retrieving revision 1.450
diff -u -p -u -p -r1.450 GENERIC
--- arch/amd64/conf/GENERIC     24 Dec 2017 19:50:56 -0000      1.450
+++ arch/amd64/conf/GENERIC     8 Feb 2018 02:32:49 -0000
@@ -81,6 +81,8 @@ hyperv0       at pvbus?               # Hyper-V guest
 hvn*   at hyperv?              # Hyper-V NetVSC
 hvs*   at hyperv?              # Hyper-V StorVSC
 
+kvmclock0 at pvbus?            # KVM clock
+
 option         PCIVERBOSE
 option         USBVERBOSE
 
Index: arch/amd64/conf/RAMDISK_CD
===================================================================
RCS file: /cvs/src/sys/arch/amd64/conf/RAMDISK_CD,v
retrieving revision 1.169
diff -u -p -u -p -r1.169 RAMDISK_CD
--- arch/amd64/conf/RAMDISK_CD  16 Nov 2017 18:12:27 -0000      1.169
+++ arch/amd64/conf/RAMDISK_CD  8 Feb 2018 02:32:49 -0000
@@ -64,6 +64,8 @@ hyperv0               at pvbus?               # Hyper-V guest
 hvn*           at hyperv?              # Hyper-V NetVSC
 hvs*           at hyperv?              # Hyper-V StorVSC
 
+kvmclock0      at pvbus?               # KVM clock
+
 pchb*          at pci?                 # PCI-Host bridges
 aapic*         at pci?                 # AMD 8131 IO apic
 ppb*           at pci?                 # PCI-PCI bridges
Index: arch/i386/conf/GENERIC
===================================================================
RCS file: /cvs/src/sys/arch/i386/conf/GENERIC,v
retrieving revision 1.829
diff -u -p -u -p -r1.829 GENERIC
--- arch/i386/conf/GENERIC      28 Aug 2017 19:32:53 -0000      1.829
+++ arch/i386/conf/GENERIC      8 Feb 2018 02:32:49 -0000
@@ -40,6 +40,7 @@ amdmsr0       at mainbus?             # MSR access for AM
 
 pvbus0 at mainbus0             # Paravirtual device bus
 vmt0   at pvbus?               # VMware Tools
+kvmclock0 at pvbus?            # KVM clock
 
 acpitimer*     at acpi?
 acpihpet*      at acpi?
Index: dev/pv/files.pv
===================================================================
RCS file: /cvs/src/sys/dev/pv/files.pv,v
retrieving revision 1.13
diff -u -p -u -p -r1.13 files.pv
--- dev/pv/files.pv     14 Jun 2017 10:25:40 -0000      1.13
+++ dev/pv/files.pv     8 Feb 2018 02:32:49 -0000
@@ -75,3 +75,8 @@ file  dev/pv/vioscsi.c                vioscsi
 device vmmci
 attach vmmci at virtio
 file   dev/pv/vmmci.c                  vmmci
+
+device kvmclock
+attach kvmclock at pvbus
+file   dev/pv/kvmclock.c               kvmclock
+file   dev/pv/pvclock.c                kvmclock
Index: dev/pv/kvmclock.c
===================================================================
RCS file: dev/pv/kvmclock.c
diff -N dev/pv/kvmclock.c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ dev/pv/kvmclock.c   8 Feb 2018 02:32:49 -0000
@@ -0,0 +1,151 @@
+/*     $OpenBSD$       */
+
+/*
+ * Copyright (c) 2017 Jonathan Matthew
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/device.h>
+#include <sys/timetc.h>
+#include <sys/malloc.h>
+
+#include <machine/cpu.h>
+#include <machine/cpufunc.h>
+
+#include <uvm/uvm_extern.h>
+
+#include <dev/pv/pvreg.h>
+#include <dev/pv/pvvar.h>
+#include <dev/pv/pvclockreg.h>
+
+struct kvmclock_softc {
+       struct device                   sc_dev;
+       struct pvclock_vcpu_time_info   *sc_pvclock;
+       paddr_t                         sc_pvclock_pa;
+};
+
+int    kvmclock_match(struct device *, void *, void *);
+void   kvmclock_attach(struct device *, struct device *, void *);
+
+u_int  kvmclock_get_timecount(struct timecounter *tc);
+
+struct timecounter kvmclock_timecounter = {
+       kvmclock_get_timecount, NULL, ~0u, 0, NULL, 999, NULL
+};
+
+struct cfdriver kvmclock_cd = {
+       NULL, "kvmclock", DV_DULL
+};
+
+const struct cfattach kvmclock_ca = {
+       sizeof(struct kvmclock_softc), kvmclock_match, kvmclock_attach, NULL /* 
activate? */
+};
+
+int
+kvmclock_match(struct device *parent, void *match, void *aux)
+{
+       struct pv_attach_args *pva = aux;
+       struct pvbus_hv *hv = &pva->pva_hv[PVBUS_KVM];
+
+       if (hv->hv_base == 0)
+               return (0);
+
+       if ((hv->hv_features & KVM_FEATURE_CLOCKSOURCE2) == 0)
+               return (0);
+
+       return (1);
+}
+
+u_int
+kvmclock_get_timecount(struct timecounter *tc)
+{
+       struct kvmclock_softc *sc;
+       sc = kvmclock_timecounter.tc_priv;
+       if (sc->sc_pvclock == NULL)
+               return (0);
+
+       return (pvclock_read_tsc(sc->sc_pvclock + cpu_number()));
+}
+
+void
+kvmclock_init_cpu(struct pvbus_hv *hv)
+{
+       struct kvmclock_softc *sc;
+
+       sc = kvmclock_timecounter.tc_priv;
+       if (sc == NULL)
+               return;
+
+       wrmsr(KVM_MSR_SYS_TIME, (sc->sc_pvclock_pa + (cpu_number() *
+           sizeof(*sc->sc_pvclock))) | 1);
+}
+
+void
+kvmclock_delay(int usec)
+{
+       struct kvmclock_softc *sc;
+       struct pvclock_vcpu_time_info *pvclock;
+       uint64_t tscticks;
+       uint64_t target;
+
+       sc = kvmclock_timecounter.tc_priv;
+       pvclock = sc->sc_pvclock + cpu_number();
+
+       tscticks = (usec * pvclock_get_freq(pvclock)) / 1000000;
+       target = pvclock_read_tsc(pvclock) + tscticks;
+       while (pvclock_read_tsc(pvclock) < target)
+               CPU_BUSY_CYCLE();
+}
+
+void
+kvmclock_attach(struct device *parent, struct device *self, void *aux)
+{
+       struct pv_attach_args *pva = (struct pv_attach_args *)aux;
+       struct pvbus_hv *hv = &pva->pva_hv[PVBUS_KVM];
+       struct kvmclock_softc *sc = (struct kvmclock_softc *)self;
+
+       printf(": ");
+       sc->sc_pvclock = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
+       if (sc->sc_pvclock == NULL) {
+               printf("could not allocate memory\n");
+               return;
+       }
+
+       if (pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_pvclock,
+           &sc->sc_pvclock_pa) == FALSE) {
+               printf("could not map memory\n");
+               free(sc->sc_pvclock, M_DEVBUF, PAGE_SIZE);
+               sc->sc_pvclock = NULL;
+       }
+
+       wrmsr(KVM_MSR_SYS_TIME, sc->sc_pvclock_pa | 1);
+
+       if (sc->sc_pvclock[0].flags & PVCLOCK_FLAG_TSC_STABLE_BIT) {
+               kvmclock_timecounter.tc_frequency =
+                   pvclock_get_freq(sc->sc_pvclock);
+               kvmclock_timecounter.tc_name = sc->sc_dev.dv_xname;
+               kvmclock_timecounter.tc_priv = sc;
+               tc_init(&kvmclock_timecounter);
+               printf("%lld Hz", kvmclock_timecounter.tc_frequency);
+
+               delay_func = kvmclock_delay;
+       } else {
+               printf("TSC is not stable");
+       }
+
+       hv->hv_init_cpu = kvmclock_init_cpu;
+       printf("\n");
+}
Index: dev/pv/pvclock.c
===================================================================
RCS file: dev/pv/pvclock.c
diff -N dev/pv/pvclock.c
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ dev/pv/pvclock.c    8 Feb 2018 02:32:49 -0000
@@ -0,0 +1,122 @@
+/*     $OpenBSD$ */
+/*
+ * Copyright (c) 2017 Jonathan Matthew <jmatt...@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/atomic.h>
+#include <sys/systm.h>
+
+#include <machine/cpufunc.h>
+
+#include <dev/pv/pvreg.h>
+#include <dev/pv/pvclockreg.h>
+
+#define NSEC_PER_SEC   1000000000LL
+
+uint64_t
+pvclock_ns_to_tsc(uint64_t ns, uint32_t mul, int shift)
+{
+       uint64_t hns;
+
+       hns = ns & 0xffffffff00000000ull;
+       ns <<= 32;
+       hns = hns / mul;
+       ns = (ns / mul) + (hns << 32);
+       if (shift >= 0)
+               ns >>= shift;
+       else
+               ns <<= -shift;
+
+       return (ns);
+}
+
+uint64_t
+pvclock_tsc_to_ns(uint64_t tsc, uint32_t mul, int shift)
+{
+       uint64_t otsc, htsc, nsec;
+
+       otsc = tsc;
+       if (shift >= 0)
+               tsc <<= shift;
+       else
+               tsc >>= -shift;
+
+       htsc = tsc >> 32;
+       tsc &= 0xffffffffll;
+       nsec = ((tsc * mul) >> 32) + (htsc * mul);
+       return (nsec);
+}
+
+void
+pvclock_read(struct pvclock_vcpu_time_info *time_info, uint64_t *systime,
+    uint64_t *tsc, uint32_t *mul, int *shift)
+{
+       uint32_t v;
+       int r = 0;
+
+       do {
+               v = time_info->version;
+               virtio_membar_sync();
+               *tsc = time_info->tsc_timestamp;
+               *systime = time_info->system_time;
+               *mul = time_info->tsc_to_system_mul;
+               *shift = time_info->tsc_shift;
+               virtio_membar_sync();
+               r++;
+       } while ((v != time_info->version) && (r < 100));
+
+       if (r == 100) {
+               printf("pvclock spun out\n");
+               *mul = 0;
+       }
+}
+
+uint64_t
+pvclock_read_tsc(struct pvclock_vcpu_time_info *time_info)
+{
+       uint32_t mul;
+       uint64_t systime, tsc, delta;
+       int shift;
+
+       pvclock_read(time_info, &systime, &tsc, &mul, &shift);
+       delta = rdtsc() - tsc;
+       return (pvclock_ns_to_tsc(systime, mul, shift) + delta);
+}
+
+void
+pvclock_read_nano(struct pvclock_vcpu_time_info *time_info, struct timespec 
*ts)
+{
+       uint32_t mul;
+       uint64_t systime, tsc;
+       int shift;
+
+       pvclock_read(time_info, &systime, &tsc, &mul, &shift);
+       if (mul == 0)
+               systime = 0;
+       else
+               systime += pvclock_tsc_to_ns(rdtsc() - tsc, mul, shift);
+
+       ts->tv_sec = systime / NSEC_PER_SEC;
+       ts->tv_nsec = systime % NSEC_PER_SEC;
+}
+
+uint64_t
+pvclock_get_freq(struct pvclock_vcpu_time_info *time_info)
+{
+       return (pvclock_ns_to_tsc(NSEC_PER_SEC, time_info->tsc_to_system_mul,
+           time_info->tsc_shift));
+}
Index: dev/pv/pvclockreg.h
===================================================================
RCS file: dev/pv/pvclockreg.h
diff -N dev/pv/pvclockreg.h
--- /dev/null   1 Jan 1970 00:00:00 -0000
+++ dev/pv/pvclockreg.h 8 Feb 2018 02:32:49 -0000
@@ -0,0 +1,24 @@
+#ifndef _PV_PVCLOCKREG_H_
+#define _PV_PVCLOCKREG_H_
+
+#define PVCLOCK_FLAG_TSC_STABLE_BIT             (1 << 0)
+#define PVCLOCK_FLAG_GUEST_STOPPED              (1 << 1)
+
+struct pvclock_vcpu_time_info {
+       volatile uint32_t       version;
+       volatile uint32_t       pad1;
+       volatile uint64_t       tsc_timestamp;
+       volatile uint64_t       system_time;
+       volatile uint32_t       tsc_to_system_mul;
+       volatile int8_t         tsc_shift;
+       volatile uint8_t        flags;
+       volatile uint8_t        pad2[2];
+} __packed;
+
+struct pvclock_wall_clock {
+       volatile uint32_t       version;
+       volatile uint32_t       sec;
+       volatile uint32_t       nsec;
+} __packed;
+
+#endif  /* _PV_PVCLOCKREG_H_ */
Index: dev/pv/pvreg.h
===================================================================
RCS file: /cvs/src/sys/dev/pv/pvreg.h,v
retrieving revision 1.4
diff -u -p -u -p -r1.4 pvreg.h
--- dev/pv/pvreg.h      12 Dec 2015 12:33:49 -0000      1.4
+++ dev/pv/pvreg.h      8 Feb 2018 02:32:49 -0000
@@ -40,6 +40,9 @@
 #define        KVM_FEATURE_PV_UNHALT                   7
 #define        KVM_FEATURE_CLOCKSOURCE_STABLE_BIT      24
 
+#define KVM_MSR_WALL_CLOCK                     0x4b564d00
+#define KVM_MSR_SYS_TIME                       0x4b564d01
+
 #define        KVM_MSR_EOI_EN                          0x4b564d04
 #define KVM_PV_EOI_BIT                         0
 
Index: dev/pv/pvvar.h
===================================================================
RCS file: /cvs/src/sys/dev/pv/pvvar.h,v
retrieving revision 1.10
diff -u -p -u -p -r1.10 pvvar.h
--- dev/pv/pvvar.h      22 Jun 2017 06:21:12 -0000      1.10
+++ dev/pv/pvvar.h      8 Feb 2018 02:32:49 -0000
@@ -82,5 +82,12 @@ void  pvbus_init_cpu(void);
 void    pvbus_reboot(struct device *);
 void    pvbus_shutdown(struct device *);
 
+/* pvclock.c */
+struct pvclock_vcpu_time_info;
+
+uint64_t pvclock_read_tsc(struct pvclock_vcpu_time_info *);
+void     pvclock_read_nano(struct pvclock_vcpu_time_info *, struct timespec *);
+uint64_t pvclock_get_freq(struct pvclock_vcpu_time_info *);
+
 #endif /* _KERNEL */
 #endif /* _DEV_PV_PVBUS_H_ */

Reply via email to