On Thu, Feb 08, 2018 at 04:32:29PM +1300, Jonathan Matthew wrote:
> This diff (most of which has been around for a while) changes delay_func
> when running in KVM to use pvclock (effectively the TSC) to determine when to
> stop spinning.  Since this is done in a KVM-specific driver, it won't have
> any effect anywhere other than in KVM guests.
> 
> Using pvclock rather than the lapic avoids the hangs caused by KVM's use of
> the VMX preemption timer to emulate a lapic.  To summarise that problem:
> occasionally KVM fails to restart the lapic timer until it gets an exit from
> the guest, and if we're busy polling the lapic, we'll never generate such an
> exit, and the lapic counter will never reach the value we're waiting for, at
> which point we're stuck.
> 
> It also adds a timecounter based on pvclock, with its priority set below
> acpihpet, so it won't be used for now.  Later on I'd like to make this
> the preferred timecounter as it's much faster to read than the emulated
> acpihpet.
> 
> A couple of testers have confirmed that this does *not* fix the time slowdowns
> seen on KVM guests.  I believe fixing that will require more invasive changes.
> 
> Since this fixes a concrete problem, rather than just making me feel better
> about doing lots of clock reads in some other work I'm doing, I'd like to get
> this in now.
> 
> ok?

This diff needs a bit more work.  It introduces a race in MP kernels that often
causes the machine to crash shortly after pvbus attaches.

While we're attaching mainbus etc., secondary cpus are waiting in a delay loop
for the primary cpu to tell them it's ok to run.  pvclock requires some per-cpu
intialization that only happens after this delay loop finishes, so it's not safe
for kvmclock to change the delay_func pointer until all cpus have finished
hatching.

Another exciting discovery that came out of this is that you can't enter ddb on
a cpu that isn't meant to be running yet.

> 
> 
> Index: arch/amd64/conf/GENERIC
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/conf/GENERIC,v
> retrieving revision 1.450
> diff -u -p -u -p -r1.450 GENERIC
> --- arch/amd64/conf/GENERIC   24 Dec 2017 19:50:56 -0000      1.450
> +++ arch/amd64/conf/GENERIC   8 Feb 2018 02:32:49 -0000
> @@ -81,6 +81,8 @@ hyperv0     at pvbus?               # Hyper-V guest
>  hvn* at hyperv?              # Hyper-V NetVSC
>  hvs* at hyperv?              # Hyper-V StorVSC
>  
> +kvmclock0 at pvbus?          # KVM clock
> +
>  option               PCIVERBOSE
>  option               USBVERBOSE
>  
> Index: arch/amd64/conf/RAMDISK_CD
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/conf/RAMDISK_CD,v
> retrieving revision 1.169
> diff -u -p -u -p -r1.169 RAMDISK_CD
> --- arch/amd64/conf/RAMDISK_CD        16 Nov 2017 18:12:27 -0000      1.169
> +++ arch/amd64/conf/RAMDISK_CD        8 Feb 2018 02:32:49 -0000
> @@ -64,6 +64,8 @@ hyperv0             at pvbus?               # Hyper-V guest
>  hvn*         at hyperv?              # Hyper-V NetVSC
>  hvs*         at hyperv?              # Hyper-V StorVSC
>  
> +kvmclock0    at pvbus?               # KVM clock
> +
>  pchb*                at pci?                 # PCI-Host bridges
>  aapic*               at pci?                 # AMD 8131 IO apic
>  ppb*         at pci?                 # PCI-PCI bridges
> Index: arch/i386/conf/GENERIC
> ===================================================================
> RCS file: /cvs/src/sys/arch/i386/conf/GENERIC,v
> retrieving revision 1.829
> diff -u -p -u -p -r1.829 GENERIC
> --- arch/i386/conf/GENERIC    28 Aug 2017 19:32:53 -0000      1.829
> +++ arch/i386/conf/GENERIC    8 Feb 2018 02:32:49 -0000
> @@ -40,6 +40,7 @@ amdmsr0     at mainbus?             # MSR access for AM
>  
>  pvbus0       at mainbus0             # Paravirtual device bus
>  vmt0 at pvbus?               # VMware Tools
> +kvmclock0 at pvbus?          # KVM clock
>  
>  acpitimer*   at acpi?
>  acpihpet*    at acpi?
> Index: dev/pv/files.pv
> ===================================================================
> RCS file: /cvs/src/sys/dev/pv/files.pv,v
> retrieving revision 1.13
> diff -u -p -u -p -r1.13 files.pv
> --- dev/pv/files.pv   14 Jun 2017 10:25:40 -0000      1.13
> +++ dev/pv/files.pv   8 Feb 2018 02:32:49 -0000
> @@ -75,3 +75,8 @@ file        dev/pv/vioscsi.c                vioscsi
>  device       vmmci
>  attach       vmmci at virtio
>  file dev/pv/vmmci.c                  vmmci
> +
> +device       kvmclock
> +attach       kvmclock at pvbus
> +file dev/pv/kvmclock.c               kvmclock
> +file dev/pv/pvclock.c                kvmclock
> Index: dev/pv/kvmclock.c
> ===================================================================
> RCS file: dev/pv/kvmclock.c
> diff -N dev/pv/kvmclock.c
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ dev/pv/kvmclock.c 8 Feb 2018 02:32:49 -0000
> @@ -0,0 +1,151 @@
> +/*   $OpenBSD$       */
> +
> +/*
> + * Copyright (c) 2017 Jonathan Matthew
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#include <sys/param.h>
> +#include <sys/systm.h>
> +#include <sys/device.h>
> +#include <sys/timetc.h>
> +#include <sys/malloc.h>
> +
> +#include <machine/cpu.h>
> +#include <machine/cpufunc.h>
> +
> +#include <uvm/uvm_extern.h>
> +
> +#include <dev/pv/pvreg.h>
> +#include <dev/pv/pvvar.h>
> +#include <dev/pv/pvclockreg.h>
> +
> +struct kvmclock_softc {
> +     struct device                   sc_dev;
> +     struct pvclock_vcpu_time_info   *sc_pvclock;
> +     paddr_t                         sc_pvclock_pa;
> +};
> +
> +int  kvmclock_match(struct device *, void *, void *);
> +void kvmclock_attach(struct device *, struct device *, void *);
> +
> +u_int        kvmclock_get_timecount(struct timecounter *tc);
> +
> +struct timecounter kvmclock_timecounter = {
> +     kvmclock_get_timecount, NULL, ~0u, 0, NULL, 999, NULL
> +};
> +
> +struct cfdriver kvmclock_cd = {
> +     NULL, "kvmclock", DV_DULL
> +};
> +
> +const struct cfattach kvmclock_ca = {
> +     sizeof(struct kvmclock_softc), kvmclock_match, kvmclock_attach, NULL /* 
> activate? */
> +};
> +
> +int
> +kvmclock_match(struct device *parent, void *match, void *aux)
> +{
> +     struct pv_attach_args *pva = aux;
> +     struct pvbus_hv *hv = &pva->pva_hv[PVBUS_KVM];
> +
> +     if (hv->hv_base == 0)
> +             return (0);
> +
> +     if ((hv->hv_features & KVM_FEATURE_CLOCKSOURCE2) == 0)
> +             return (0);
> +
> +     return (1);
> +}
> +
> +u_int
> +kvmclock_get_timecount(struct timecounter *tc)
> +{
> +     struct kvmclock_softc *sc;
> +     sc = kvmclock_timecounter.tc_priv;
> +     if (sc->sc_pvclock == NULL)
> +             return (0);
> +
> +     return (pvclock_read_tsc(sc->sc_pvclock + cpu_number()));
> +}
> +
> +void
> +kvmclock_init_cpu(struct pvbus_hv *hv)
> +{
> +     struct kvmclock_softc *sc;
> +
> +     sc = kvmclock_timecounter.tc_priv;
> +     if (sc == NULL)
> +             return;
> +
> +     wrmsr(KVM_MSR_SYS_TIME, (sc->sc_pvclock_pa + (cpu_number() *
> +         sizeof(*sc->sc_pvclock))) | 1);
> +}
> +
> +void
> +kvmclock_delay(int usec)
> +{
> +     struct kvmclock_softc *sc;
> +     struct pvclock_vcpu_time_info *pvclock;
> +     uint64_t tscticks;
> +     uint64_t target;
> +
> +     sc = kvmclock_timecounter.tc_priv;
> +     pvclock = sc->sc_pvclock + cpu_number();
> +
> +     tscticks = (usec * pvclock_get_freq(pvclock)) / 1000000;
> +     target = pvclock_read_tsc(pvclock) + tscticks;
> +     while (pvclock_read_tsc(pvclock) < target)
> +             CPU_BUSY_CYCLE();
> +}
> +
> +void
> +kvmclock_attach(struct device *parent, struct device *self, void *aux)
> +{
> +     struct pv_attach_args *pva = (struct pv_attach_args *)aux;
> +     struct pvbus_hv *hv = &pva->pva_hv[PVBUS_KVM];
> +     struct kvmclock_softc *sc = (struct kvmclock_softc *)self;
> +
> +     printf(": ");
> +     sc->sc_pvclock = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
> +     if (sc->sc_pvclock == NULL) {
> +             printf("could not allocate memory\n");
> +             return;
> +     }
> +
> +     if (pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_pvclock,
> +         &sc->sc_pvclock_pa) == FALSE) {
> +             printf("could not map memory\n");
> +             free(sc->sc_pvclock, M_DEVBUF, PAGE_SIZE);
> +             sc->sc_pvclock = NULL;
> +     }
> +
> +     wrmsr(KVM_MSR_SYS_TIME, sc->sc_pvclock_pa | 1);
> +
> +     if (sc->sc_pvclock[0].flags & PVCLOCK_FLAG_TSC_STABLE_BIT) {
> +             kvmclock_timecounter.tc_frequency =
> +                 pvclock_get_freq(sc->sc_pvclock);
> +             kvmclock_timecounter.tc_name = sc->sc_dev.dv_xname;
> +             kvmclock_timecounter.tc_priv = sc;
> +             tc_init(&kvmclock_timecounter);
> +             printf("%lld Hz", kvmclock_timecounter.tc_frequency);
> +
> +             delay_func = kvmclock_delay;
> +     } else {
> +             printf("TSC is not stable");
> +     }
> +
> +     hv->hv_init_cpu = kvmclock_init_cpu;
> +     printf("\n");
> +}
> Index: dev/pv/pvclock.c
> ===================================================================
> RCS file: dev/pv/pvclock.c
> diff -N dev/pv/pvclock.c
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ dev/pv/pvclock.c  8 Feb 2018 02:32:49 -0000
> @@ -0,0 +1,122 @@
> +/*   $OpenBSD$ */
> +/*
> + * Copyright (c) 2017 Jonathan Matthew <jmatt...@openbsd.org>
> + *
> + * Permission to use, copy, modify, and distribute this software for any
> + * purpose with or without fee is hereby granted, provided that the above
> + * copyright notice and this permission notice appear in all copies.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
> + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
> + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
> + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
> + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
> + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
> + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
> + */
> +
> +#include <sys/param.h>
> +#include <sys/types.h>
> +#include <sys/atomic.h>
> +#include <sys/systm.h>
> +
> +#include <machine/cpufunc.h>
> +
> +#include <dev/pv/pvreg.h>
> +#include <dev/pv/pvclockreg.h>
> +
> +#define NSEC_PER_SEC 1000000000LL
> +
> +uint64_t
> +pvclock_ns_to_tsc(uint64_t ns, uint32_t mul, int shift)
> +{
> +     uint64_t hns;
> +
> +     hns = ns & 0xffffffff00000000ull;
> +     ns <<= 32;
> +     hns = hns / mul;
> +     ns = (ns / mul) + (hns << 32);
> +     if (shift >= 0)
> +             ns >>= shift;
> +     else
> +             ns <<= -shift;
> +
> +     return (ns);
> +}
> +
> +uint64_t
> +pvclock_tsc_to_ns(uint64_t tsc, uint32_t mul, int shift)
> +{
> +     uint64_t otsc, htsc, nsec;
> +
> +     otsc = tsc;
> +     if (shift >= 0)
> +             tsc <<= shift;
> +     else
> +             tsc >>= -shift;
> +
> +     htsc = tsc >> 32;
> +     tsc &= 0xffffffffll;
> +     nsec = ((tsc * mul) >> 32) + (htsc * mul);
> +     return (nsec);
> +}
> +
> +void
> +pvclock_read(struct pvclock_vcpu_time_info *time_info, uint64_t *systime,
> +    uint64_t *tsc, uint32_t *mul, int *shift)
> +{
> +     uint32_t v;
> +     int r = 0;
> +
> +     do {
> +             v = time_info->version;
> +             virtio_membar_sync();
> +             *tsc = time_info->tsc_timestamp;
> +             *systime = time_info->system_time;
> +             *mul = time_info->tsc_to_system_mul;
> +             *shift = time_info->tsc_shift;
> +             virtio_membar_sync();
> +             r++;
> +     } while ((v != time_info->version) && (r < 100));
> +
> +     if (r == 100) {
> +             printf("pvclock spun out\n");
> +             *mul = 0;
> +     }
> +}
> +
> +uint64_t
> +pvclock_read_tsc(struct pvclock_vcpu_time_info *time_info)
> +{
> +     uint32_t mul;
> +     uint64_t systime, tsc, delta;
> +     int shift;
> +
> +     pvclock_read(time_info, &systime, &tsc, &mul, &shift);
> +     delta = rdtsc() - tsc;
> +     return (pvclock_ns_to_tsc(systime, mul, shift) + delta);
> +}
> +
> +void
> +pvclock_read_nano(struct pvclock_vcpu_time_info *time_info, struct timespec 
> *ts)
> +{
> +     uint32_t mul;
> +     uint64_t systime, tsc;
> +     int shift;
> +
> +     pvclock_read(time_info, &systime, &tsc, &mul, &shift);
> +     if (mul == 0)
> +             systime = 0;
> +     else
> +             systime += pvclock_tsc_to_ns(rdtsc() - tsc, mul, shift);
> +
> +     ts->tv_sec = systime / NSEC_PER_SEC;
> +     ts->tv_nsec = systime % NSEC_PER_SEC;
> +}
> +
> +uint64_t
> +pvclock_get_freq(struct pvclock_vcpu_time_info *time_info)
> +{
> +     return (pvclock_ns_to_tsc(NSEC_PER_SEC, time_info->tsc_to_system_mul,
> +         time_info->tsc_shift));
> +}
> Index: dev/pv/pvclockreg.h
> ===================================================================
> RCS file: dev/pv/pvclockreg.h
> diff -N dev/pv/pvclockreg.h
> --- /dev/null 1 Jan 1970 00:00:00 -0000
> +++ dev/pv/pvclockreg.h       8 Feb 2018 02:32:49 -0000
> @@ -0,0 +1,24 @@
> +#ifndef _PV_PVCLOCKREG_H_
> +#define _PV_PVCLOCKREG_H_
> +
> +#define PVCLOCK_FLAG_TSC_STABLE_BIT             (1 << 0)
> +#define PVCLOCK_FLAG_GUEST_STOPPED              (1 << 1)
> +
> +struct pvclock_vcpu_time_info {
> +     volatile uint32_t       version;
> +     volatile uint32_t       pad1;
> +     volatile uint64_t       tsc_timestamp;
> +     volatile uint64_t       system_time;
> +     volatile uint32_t       tsc_to_system_mul;
> +     volatile int8_t         tsc_shift;
> +     volatile uint8_t        flags;
> +     volatile uint8_t        pad2[2];
> +} __packed;
> +
> +struct pvclock_wall_clock {
> +     volatile uint32_t       version;
> +     volatile uint32_t       sec;
> +     volatile uint32_t       nsec;
> +} __packed;
> +
> +#endif  /* _PV_PVCLOCKREG_H_ */
> Index: dev/pv/pvreg.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pv/pvreg.h,v
> retrieving revision 1.4
> diff -u -p -u -p -r1.4 pvreg.h
> --- dev/pv/pvreg.h    12 Dec 2015 12:33:49 -0000      1.4
> +++ dev/pv/pvreg.h    8 Feb 2018 02:32:49 -0000
> @@ -40,6 +40,9 @@
>  #define      KVM_FEATURE_PV_UNHALT                   7
>  #define      KVM_FEATURE_CLOCKSOURCE_STABLE_BIT      24
>  
> +#define KVM_MSR_WALL_CLOCK                   0x4b564d00
> +#define KVM_MSR_SYS_TIME                     0x4b564d01
> +
>  #define      KVM_MSR_EOI_EN                          0x4b564d04
>  #define KVM_PV_EOI_BIT                               0
>  
> Index: dev/pv/pvvar.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pv/pvvar.h,v
> retrieving revision 1.10
> diff -u -p -u -p -r1.10 pvvar.h
> --- dev/pv/pvvar.h    22 Jun 2017 06:21:12 -0000      1.10
> +++ dev/pv/pvvar.h    8 Feb 2018 02:32:49 -0000
> @@ -82,5 +82,12 @@ void        pvbus_init_cpu(void);
>  void  pvbus_reboot(struct device *);
>  void  pvbus_shutdown(struct device *);
>  
> +/* pvclock.c */
> +struct pvclock_vcpu_time_info;
> +
> +uint64_t pvclock_read_tsc(struct pvclock_vcpu_time_info *);
> +void     pvclock_read_nano(struct pvclock_vcpu_time_info *, struct timespec 
> *);
> +uint64_t pvclock_get_freq(struct pvclock_vcpu_time_info *);
> +
>  #endif /* _KERNEL */
>  #endif /* _DEV_PV_PVBUS_H_ */

Reply via email to