On Thu, Feb 08, 2018 at 04:32:29PM +1300, Jonathan Matthew wrote:
> This diff (most of which has been around for a while) changes delay_func
> when running in KVM to use pvclock (effectively the TSC) to determine when to
> stop spinning. Since this is done in a KVM-specific driver, it won't have
> any effect anywhere other than in KVM guests.
>
> Using pvclock rather than the lapic avoids the hangs caused by KVM's use of
> the VMX preemption timer to emulate a lapic. To summarise that problem:
> occasionally KVM fails to restart the lapic timer until it gets an exit from
> the guest, and if we're busy polling the lapic, we'll never generate such an
> exit, and the lapic counter will never reach the value we're waiting for, at
> which point we're stuck.
>
> It also adds a timecounter based on pvclock, with its priority set below
> acpihpet, so it won't be used for now. Later on I'd like to make this
> the preferred timecounter as it's much faster to read than the emulated
> acpihpet.
>
> A couple of testers have confirmed that this does *not* fix the time slowdowns
> seen on KVM guests. I believe fixing that will require more invasive changes.
>
> Since this fixes a concrete problem, rather than just making me feel better
> about doing lots of clock reads in some other work I'm doing, I'd like to get
> this in now.
>
> ok?
This diff needs a bit more work. It introduces a race in MP kernels that often
causes the machine to crash shortly after pvbus attaches.
While we're attaching mainbus etc., secondary cpus are waiting in a delay loop
for the primary cpu to tell them it's ok to run. pvclock requires some per-cpu
intialization that only happens after this delay loop finishes, so it's not safe
for kvmclock to change the delay_func pointer until all cpus have finished
hatching.
Another exciting discovery that came out of this is that you can't enter ddb on
a cpu that isn't meant to be running yet.
>
>
> Index: arch/amd64/conf/GENERIC
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/conf/GENERIC,v
> retrieving revision 1.450
> diff -u -p -u -p -r1.450 GENERIC
> --- arch/amd64/conf/GENERIC 24 Dec 2017 19:50:56 -0000 1.450
> +++ arch/amd64/conf/GENERIC 8 Feb 2018 02:32:49 -0000
> @@ -81,6 +81,8 @@ hyperv0 at pvbus? # Hyper-V guest
> hvn* at hyperv? # Hyper-V NetVSC
> hvs* at hyperv? # Hyper-V StorVSC
>
> +kvmclock0 at pvbus? # KVM clock
> +
> option PCIVERBOSE
> option USBVERBOSE
>
> Index: arch/amd64/conf/RAMDISK_CD
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/conf/RAMDISK_CD,v
> retrieving revision 1.169
> diff -u -p -u -p -r1.169 RAMDISK_CD
> --- arch/amd64/conf/RAMDISK_CD 16 Nov 2017 18:12:27 -0000 1.169
> +++ arch/amd64/conf/RAMDISK_CD 8 Feb 2018 02:32:49 -0000
> @@ -64,6 +64,8 @@ hyperv0 at pvbus? # Hyper-V guest
> hvn* at hyperv? # Hyper-V NetVSC
> hvs* at hyperv? # Hyper-V StorVSC
>
> +kvmclock0 at pvbus? # KVM clock
> +
> pchb* at pci? # PCI-Host bridges
> aapic* at pci? # AMD 8131 IO apic
> ppb* at pci? # PCI-PCI bridges
> Index: arch/i386/conf/GENERIC
> ===================================================================
> RCS file: /cvs/src/sys/arch/i386/conf/GENERIC,v
> retrieving revision 1.829
> diff -u -p -u -p -r1.829 GENERIC
> --- arch/i386/conf/GENERIC 28 Aug 2017 19:32:53 -0000 1.829
> +++ arch/i386/conf/GENERIC 8 Feb 2018 02:32:49 -0000
> @@ -40,6 +40,7 @@ amdmsr0 at mainbus? # MSR access for AM
>
> pvbus0 at mainbus0 # Paravirtual device bus
> vmt0 at pvbus? # VMware Tools
> +kvmclock0 at pvbus? # KVM clock
>
> acpitimer* at acpi?
> acpihpet* at acpi?
> Index: dev/pv/files.pv
> ===================================================================
> RCS file: /cvs/src/sys/dev/pv/files.pv,v
> retrieving revision 1.13
> diff -u -p -u -p -r1.13 files.pv
> --- dev/pv/files.pv 14 Jun 2017 10:25:40 -0000 1.13
> +++ dev/pv/files.pv 8 Feb 2018 02:32:49 -0000
> @@ -75,3 +75,8 @@ file dev/pv/vioscsi.c vioscsi
> device vmmci
> attach vmmci at virtio
> file dev/pv/vmmci.c vmmci
> +
> +device kvmclock
> +attach kvmclock at pvbus
> +file dev/pv/kvmclock.c kvmclock
> +file dev/pv/pvclock.c kvmclock
> Index: dev/pv/kvmclock.c
> ===================================================================
> RCS file: dev/pv/kvmclock.c
> diff -N dev/pv/kvmclock.c
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ dev/pv/kvmclock.c 8 Feb 2018 02:32:49 -0000
> @@ -0,0 +1,151 @@
> +/* $OpenBSD$ */
> +
> +/*
> + * Copyright (c) 2017 Jonathan Matthew
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/device.h>
> +#include <sys/timetc.h>
> +#include <sys/malloc.h>
> +
> +#include <machine/cpu.h>
> +#include <machine/cpufunc.h>
> +
> +#include <uvm/uvm_extern.h>
> +
> +#include <dev/pv/pvreg.h>
> +#include <dev/pv/pvvar.h>
> +#include <dev/pv/pvclockreg.h>
> +
> +struct kvmclock_softc {
> + struct device sc_dev;
> + struct pvclock_vcpu_time_info *sc_pvclock;
> + paddr_t sc_pvclock_pa;
> +};
> +
> +int kvmclock_match(struct device *, void *, void *);
> +void kvmclock_attach(struct device *, struct device *, void *);
> +
> +u_int kvmclock_get_timecount(struct timecounter *tc);
> +
> +struct timecounter kvmclock_timecounter = {
> + kvmclock_get_timecount, NULL, ~0u, 0, NULL, 999, NULL
> +};
> +
> +struct cfdriver kvmclock_cd = {
> + NULL, "kvmclock", DV_DULL
> +};
> +
> +const struct cfattach kvmclock_ca = {
> + sizeof(struct kvmclock_softc), kvmclock_match, kvmclock_attach, NULL /*
> activate? */
> +};
> +
> +int
> +kvmclock_match(struct device *parent, void *match, void *aux)
> +{
> + struct pv_attach_args *pva = aux;
> + struct pvbus_hv *hv = &pva->pva_hv[PVBUS_KVM];
> +
> + if (hv->hv_base == 0)
> + return (0);
> +
> + if ((hv->hv_features & KVM_FEATURE_CLOCKSOURCE2) == 0)
> + return (0);
> +
> + return (1);
> +}
> +
> +u_int
> +kvmclock_get_timecount(struct timecounter *tc)
> +{
> + struct kvmclock_softc *sc;
> + sc = kvmclock_timecounter.tc_priv;
> + if (sc->sc_pvclock == NULL)
> + return (0);
> +
> + return (pvclock_read_tsc(sc->sc_pvclock + cpu_number()));
> +}
> +
> +void
> +kvmclock_init_cpu(struct pvbus_hv *hv)
> +{
> + struct kvmclock_softc *sc;
> +
> + sc = kvmclock_timecounter.tc_priv;
> + if (sc == NULL)
> + return;
> +
> + wrmsr(KVM_MSR_SYS_TIME, (sc->sc_pvclock_pa + (cpu_number() *
> + sizeof(*sc->sc_pvclock))) | 1);
> +}
> +
> +void
> +kvmclock_delay(int usec)
> +{
> + struct kvmclock_softc *sc;
> + struct pvclock_vcpu_time_info *pvclock;
> + uint64_t tscticks;
> + uint64_t target;
> +
> + sc = kvmclock_timecounter.tc_priv;
> + pvclock = sc->sc_pvclock + cpu_number();
> +
> + tscticks = (usec * pvclock_get_freq(pvclock)) / 1000000;
> + target = pvclock_read_tsc(pvclock) + tscticks;
> + while (pvclock_read_tsc(pvclock) < target)
> + CPU_BUSY_CYCLE();
> +}
> +
> +void
> +kvmclock_attach(struct device *parent, struct device *self, void *aux)
> +{
> + struct pv_attach_args *pva = (struct pv_attach_args *)aux;
> + struct pvbus_hv *hv = &pva->pva_hv[PVBUS_KVM];
> + struct kvmclock_softc *sc = (struct kvmclock_softc *)self;
> +
> + printf(": ");
> + sc->sc_pvclock = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
> + if (sc->sc_pvclock == NULL) {
> + printf("could not allocate memory\n");
> + return;
> + }
> +
> + if (pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_pvclock,
> + &sc->sc_pvclock_pa) == FALSE) {
> + printf("could not map memory\n");
> + free(sc->sc_pvclock, M_DEVBUF, PAGE_SIZE);
> + sc->sc_pvclock = NULL;
> + }
> +
> + wrmsr(KVM_MSR_SYS_TIME, sc->sc_pvclock_pa | 1);
> +
> + if (sc->sc_pvclock[0].flags & PVCLOCK_FLAG_TSC_STABLE_BIT) {
> + kvmclock_timecounter.tc_frequency =
> + pvclock_get_freq(sc->sc_pvclock);
> + kvmclock_timecounter.tc_name = sc->sc_dev.dv_xname;
> + kvmclock_timecounter.tc_priv = sc;
> + tc_init(&kvmclock_timecounter);
> + printf("%lld Hz", kvmclock_timecounter.tc_frequency);
> +
> + delay_func = kvmclock_delay;
> + } else {
> + printf("TSC is not stable");
> + }
> +
> + hv->hv_init_cpu = kvmclock_init_cpu;
> + printf("\n");
> +}
> Index: dev/pv/pvclock.c
> ===================================================================
> RCS file: dev/pv/pvclock.c
> diff -N dev/pv/pvclock.c
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ dev/pv/pvclock.c 8 Feb 2018 02:32:49 -0000
> @@ -0,0 +1,122 @@
> +/* $OpenBSD$ */
> +/*
> + * Copyright (c) 2017 Jonathan Matthew <[email protected]>
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#include <sys/param.h>
> +#include <sys/types.h>
> +#include <sys/atomic.h>
> +#include <sys/systm.h>
> +
> +#include <machine/cpufunc.h>
> +
> +#include <dev/pv/pvreg.h>
> +#include <dev/pv/pvclockreg.h>
> +
> +#define NSEC_PER_SEC 1000000000LL
> +
> +uint64_t
> +pvclock_ns_to_tsc(uint64_t ns, uint32_t mul, int shift)
> +{
> + uint64_t hns;
> +
> + hns = ns & 0xffffffff00000000ull;
> + ns <<= 32;
> + hns = hns / mul;
> + ns = (ns / mul) + (hns << 32);
> + if (shift >= 0)
> + ns >>= shift;
> + else
> + ns <<= -shift;
> +
> + return (ns);
> +}
> +
> +uint64_t
> +pvclock_tsc_to_ns(uint64_t tsc, uint32_t mul, int shift)
> +{
> + uint64_t otsc, htsc, nsec;
> +
> + otsc = tsc;
> + if (shift >= 0)
> + tsc <<= shift;
> + else
> + tsc >>= -shift;
> +
> + htsc = tsc >> 32;
> + tsc &= 0xffffffffll;
> + nsec = ((tsc * mul) >> 32) + (htsc * mul);
> + return (nsec);
> +}
> +
> +void
> +pvclock_read(struct pvclock_vcpu_time_info *time_info, uint64_t *systime,
> + uint64_t *tsc, uint32_t *mul, int *shift)
> +{
> + uint32_t v;
> + int r = 0;
> +
> + do {
> + v = time_info->version;
> + virtio_membar_sync();
> + *tsc = time_info->tsc_timestamp;
> + *systime = time_info->system_time;
> + *mul = time_info->tsc_to_system_mul;
> + *shift = time_info->tsc_shift;
> + virtio_membar_sync();
> + r++;
> + } while ((v != time_info->version) && (r < 100));
> +
> + if (r == 100) {
> + printf("pvclock spun out\n");
> + *mul = 0;
> + }
> +}
> +
> +uint64_t
> +pvclock_read_tsc(struct pvclock_vcpu_time_info *time_info)
> +{
> + uint32_t mul;
> + uint64_t systime, tsc, delta;
> + int shift;
> +
> + pvclock_read(time_info, &systime, &tsc, &mul, &shift);
> + delta = rdtsc() - tsc;
> + return (pvclock_ns_to_tsc(systime, mul, shift) + delta);
> +}
> +
> +void
> +pvclock_read_nano(struct pvclock_vcpu_time_info *time_info, struct timespec
> *ts)
> +{
> + uint32_t mul;
> + uint64_t systime, tsc;
> + int shift;
> +
> + pvclock_read(time_info, &systime, &tsc, &mul, &shift);
> + if (mul == 0)
> + systime = 0;
> + else
> + systime += pvclock_tsc_to_ns(rdtsc() - tsc, mul, shift);
> +
> + ts->tv_sec = systime / NSEC_PER_SEC;
> + ts->tv_nsec = systime % NSEC_PER_SEC;
> +}
> +
> +uint64_t
> +pvclock_get_freq(struct pvclock_vcpu_time_info *time_info)
> +{
> + return (pvclock_ns_to_tsc(NSEC_PER_SEC, time_info->tsc_to_system_mul,
> + time_info->tsc_shift));
> +}
> Index: dev/pv/pvclockreg.h
> ===================================================================
> RCS file: dev/pv/pvclockreg.h
> diff -N dev/pv/pvclockreg.h
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ dev/pv/pvclockreg.h 8 Feb 2018 02:32:49 -0000
> @@ -0,0 +1,24 @@
> +#ifndef _PV_PVCLOCKREG_H_
> +#define _PV_PVCLOCKREG_H_
> +
> +#define PVCLOCK_FLAG_TSC_STABLE_BIT (1 << 0)
> +#define PVCLOCK_FLAG_GUEST_STOPPED (1 << 1)
> +
> +struct pvclock_vcpu_time_info {
> + volatile uint32_t version;
> + volatile uint32_t pad1;
> + volatile uint64_t tsc_timestamp;
> + volatile uint64_t system_time;
> + volatile uint32_t tsc_to_system_mul;
> + volatile int8_t tsc_shift;
> + volatile uint8_t flags;
> + volatile uint8_t pad2[2];
> +} __packed;
> +
> +struct pvclock_wall_clock {
> + volatile uint32_t version;
> + volatile uint32_t sec;
> + volatile uint32_t nsec;
> +} __packed;
> +
> +#endif /* _PV_PVCLOCKREG_H_ */
> Index: dev/pv/pvreg.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pv/pvreg.h,v
> retrieving revision 1.4
> diff -u -p -u -p -r1.4 pvreg.h
> --- dev/pv/pvreg.h 12 Dec 2015 12:33:49 -0000 1.4
> +++ dev/pv/pvreg.h 8 Feb 2018 02:32:49 -0000
> @@ -40,6 +40,9 @@
> #define KVM_FEATURE_PV_UNHALT 7
> #define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24
>
> +#define KVM_MSR_WALL_CLOCK 0x4b564d00
> +#define KVM_MSR_SYS_TIME 0x4b564d01
> +
> #define KVM_MSR_EOI_EN 0x4b564d04
> #define KVM_PV_EOI_BIT 0
>
> Index: dev/pv/pvvar.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pv/pvvar.h,v
> retrieving revision 1.10
> diff -u -p -u -p -r1.10 pvvar.h
> --- dev/pv/pvvar.h 22 Jun 2017 06:21:12 -0000 1.10
> +++ dev/pv/pvvar.h 8 Feb 2018 02:32:49 -0000
> @@ -82,5 +82,12 @@ void pvbus_init_cpu(void);
> void pvbus_reboot(struct device *);
> void pvbus_shutdown(struct device *);
>
> +/* pvclock.c */
> +struct pvclock_vcpu_time_info;
> +
> +uint64_t pvclock_read_tsc(struct pvclock_vcpu_time_info *);
> +void pvclock_read_nano(struct pvclock_vcpu_time_info *, struct timespec
> *);
> +uint64_t pvclock_get_freq(struct pvclock_vcpu_time_info *);
> +
> #endif /* _KERNEL */
> #endif /* _DEV_PV_PVBUS_H_ */