On Mon, Nov 19, 2018 at 01:12:46PM +0100, Reyk Floeter wrote: > the attached diff is another attempt at implementing a pvclock(4) > guest driver. This improves the clock on KVM and replaces the need > for using the VM-expensive acpihpet(4). >
So far I only got positive reports. Where are the problems? ;) Otherwise: OK? Reyk > Index: share/man/man4/pvclock.4 > =================================================================== > RCS file: share/man/man4/pvclock.4 > diff -N share/man/man4/pvclock.4 > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ share/man/man4/pvclock.4 19 Nov 2018 11:48:33 -0000 > @@ -0,0 +1,45 @@ > +.\" $OpenBSD$ > +.\" > +.\" Copyright (c) 2018 Reyk Floeter <r...@openbsd.org> > +.\" > +.\" Permission to use, copy, modify, and distribute this software for any > +.\" purpose with or without fee is hereby granted, provided that the above > +.\" copyright notice and this permission notice appear in all copies. > +.\" > +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES > +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > +.\" > +.Dd $Mdocdate$ > +.Dt PVCLOCK 4 > +.Os > +.Sh NAME > +.Nm pvclock > +.Nd paravirtual clock driver > +.Sh SYNOPSIS > +.Cd "pvclock* at pvbus? > +.Sh DESCRIPTION > +The > +.Nm > +driver supports the paravirtual clock that is available in KVM and > +other hypervisors. > +.Nm > +uses a shared page between the host and the hypervisor to synchronize > +the TSC clock in an efficient way. > +.Sh SEE ALSO > +.Xr pvbus 4 > +.Sh HISTORY > +The > +.Nm > +driver first appeared in > +.Ox 6.5 . > +.Sh AUTHORS > +.An -nosplit > +The > +.Nm > +driver was written by > +.An Reyk Floeter Aq Mt r...@openbsd.org . > Index: sys/arch/amd64/conf/GENERIC > =================================================================== > RCS file: /cvs/src/sys/arch/amd64/conf/GENERIC,v > retrieving revision 1.464 > diff -u -p -u -p -r1.464 GENERIC > --- sys/arch/amd64/conf/GENERIC 26 Oct 2018 20:26:19 -0000 1.464 > +++ sys/arch/amd64/conf/GENERIC 19 Nov 2018 11:48:33 -0000 > @@ -79,6 +79,8 @@ ipmi0 at mainbus? disable # IPMI > > vmt0 at pvbus? # VMware Tools > > +pvclock0 at pvbus? # KVM pvclock > + > xen0 at pvbus? # Xen HVM domU > xnf* at xen? # Xen Netfront > xbf* at xen? # Xen Blkfront > Index: sys/dev/pv/files.pv > =================================================================== > RCS file: /cvs/src/sys/dev/pv/files.pv,v > retrieving revision 1.14 > diff -u -p -u -p -r1.14 files.pv > --- sys/dev/pv/files.pv 24 Aug 2018 16:07:01 -0000 1.14 > +++ sys/dev/pv/files.pv 19 Nov 2018 11:48:33 -0000 > @@ -8,6 +8,11 @@ device pvbus > attach pvbus at mainbus > file dev/pv/pvbus.c pvbus needs-flag > > +# KVM clock > +device pvclock > +attach pvclock at pvbus > +file dev/pv/pvclock.c pvclock needs-flag > + > # VMware Tools > device vmt > attach vmt at pvbus > Index: sys/dev/pv/pvclock.c > =================================================================== > RCS file: sys/dev/pv/pvclock.c > diff -N sys/dev/pv/pvclock.c > --- /dev/null 1 Jan 1970 00:00:00 -0000 > +++ sys/dev/pv/pvclock.c 19 Nov 2018 11:48:33 -0000 > @@ -0,0 +1,229 @@ > +/* $OpenBSD$ */ > + > +/* > + * Copyright (c) 2018 Reyk Floeter <r...@openbsd.org> > + * > + * Permission to use, copy, modify, and distribute this software for any > + * purpose with or without fee is hereby granted, provided that the above > + * copyright notice and this permission notice appear in all copies. > + * > + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES > + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > + */ > + > +#if !defined(__i386__) && !defined(__amd64__) > +#error pvclock(4) is only supported on i386 and amd64 > +#endif > + > +#include <sys/param.h> > +#include <sys/systm.h> > +#include <sys/kernel.h> > +#include <sys/timetc.h> > +#include <sys/timeout.h> > +#include <sys/malloc.h> > +#include <sys/atomic.h> > + > +#include <machine/cpu.h> > +#include <uvm/uvm_extern.h> > + > +#include <dev/pv/pvvar.h> > +#include <dev/pv/pvreg.h> > + > +struct pvclock_softc { > + struct device sc_dev; > + void *sc_time; > + paddr_t sc_paddr; > + struct timecounter *sc_tc; > +}; > + > +struct pvclock_wall_clock { > + uint32_t wc_version; > + uint32_t wc_sec; > + uint32_t wc_nsec; > +} __packed; > + > +struct pvclock_time_info { > + uint32_t ti_version; > + uint32_t ti_pad0; > + uint64_t ti_tsc_timestamp; > + uint64_t ti_system_time; > + uint32_t ti_tsc_to_system_mul; > + int8_t ti_tsc_shift; > + uint8_t ti_flags; > + uint8_t ti_pad[2]; > +} __packed; > + > +#define PVCLOCK_FLAG_TSC_STABLE 0x01 > +#define PVCLOCK_SYSTEM_TIME_ENABLE 0x01 > +#define DEVNAME(_s) ((_s)->sc_dev.dv_xname) > + > +int pvclock_match(struct device *, void *, void *); > +void pvclock_attach(struct device *, struct device *, void *); > +int pvclock_activate(struct device *, int); > + > +uint pvclock_get_timecount(struct timecounter *); > +void pvclock_read_time_info(struct pvclock_softc *, > + struct pvclock_time_info *); > + > +struct cfattach pvclock_ca = { > + sizeof(struct pvclock_softc), > + pvclock_match, > + pvclock_attach, > + NULL, > + pvclock_activate > +}; > + > +struct cfdriver pvclock_cd = { > + NULL, > + "pvclock", > + DV_DULL > +}; > + > +struct timecounter pvclock_timecounter = { > + pvclock_get_timecount, NULL, ~0u, 0, NULL, -2000, NULL > +}; > + > +int > +pvclock_match(struct device *parent, void *match, void *aux) > +{ > + struct pv_attach_args *pva = aux; > + struct pvbus_hv *hv; > + > + /* > + * pvclock is provided by different hypervisors, we currently > + * only support the "kvmclock". > + */ > + hv = &pva->pva_hv[PVBUS_KVM]; > + if (hv->hv_base != 0) { > + /* > + * We only implement support for the 2nd version of pvclock. > + * The first version is basically the same but with different > + * non-standard MSRs and it is deprecated. > + */ > + if ((hv->hv_features & (1 << KVM_FEATURE_CLOCKSOURCE2)) == 0) > + return (0); > + > + /* > + * Only the "stable" clock with a sync'ed TSC is supported. > + * In this case the host guarantees that the TSC is constant > + * and invariant, either by the underlying TSC or by passing > + * on a synchronized value. > + */ > + if ((hv->hv_features & > + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) == 0) > + return (0); > + } > + > + return (1); > +} > + > +void > +pvclock_attach(struct device *parent, struct device *self, void *aux) > +{ > + struct pvclock_softc *sc = (struct pvclock_softc *)self; > + paddr_t pa; > + > + if ((sc->sc_time = km_alloc(PAGE_SIZE, > + &kv_any, &kp_zero, &kd_nowait)) == NULL) { > + printf(": time page allocation failed\n"); > + return; > + } > + if (!pmap_extract(pmap_kernel(), (vaddr_t)sc->sc_time, &pa)) { > + printf(": time page PA extraction failed\n"); > + km_free(sc->sc_time, PAGE_SIZE, &kv_any, &kp_zero); > + return; > + } > + > + wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE); > + sc->sc_paddr = pa; > + > + sc->sc_tc = &pvclock_timecounter; > + sc->sc_tc->tc_name = DEVNAME(sc); > + sc->sc_tc->tc_frequency = 1000000000ULL; > + sc->sc_tc->tc_priv = sc; > + > + /* Better than HPET but below TSC */ > + sc->sc_tc->tc_quality = 1500; > + > + tc_init(sc->sc_tc); > + > + printf("\n"); > +} > + > +int > +pvclock_activate(struct device *self, int act) > +{ > + struct pvclock_softc *sc = (struct pvclock_softc *)self; > + int rv = 0; > + paddr_t pa = sc->sc_paddr; > + > + switch (act) { > + case DVACT_POWERDOWN: > + wrmsr(KVM_MSR_SYSTEM_TIME, pa & ~PVCLOCK_SYSTEM_TIME_ENABLE); > + break; > + case DVACT_RESUME: > + wrmsr(KVM_MSR_SYSTEM_TIME, pa | PVCLOCK_SYSTEM_TIME_ENABLE); > + break; > + } > + > + return (rv); > +} > + > +static inline uint32_t > +pvclock_read_begin(const struct pvclock_time_info *ti) > +{ > + uint32_t version = ti->ti_version & ~0x1; > + virtio_membar_sync(); > + return (version); > +} > + > +static inline int > +pvclock_read_done(const struct pvclock_time_info *ti, > + uint32_t version) > +{ > + virtio_membar_sync(); > + return (ti->ti_version == version); > +} > + > +uint > +pvclock_get_timecount(struct timecounter *tc) > +{ > + struct pvclock_softc *sc = tc->tc_priv; > + struct pvclock_time_info *ti; > + uint64_t tsc_timestamp, system_time, delta, ctr; > + uint32_t version, mul_frac; > + int8_t shift; > + uint8_t flags; > + > + ti = sc->sc_time; > + do { > + version = pvclock_read_begin(ti); > + system_time = ti->ti_system_time; > + tsc_timestamp = ti->ti_tsc_timestamp; > + mul_frac = ti->ti_tsc_to_system_mul; > + shift = ti->ti_tsc_shift; > + flags = ti->ti_flags; > + } while (!pvclock_read_done(ti, version)); > + > + /* This bit must be set as we attached based on the stable flag */ > + if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) > + panic("%s: unstable result on stable clock", DEVNAME(sc)); > + > + /* > + * The algorithm is described in > + * linux/Documentation/virtual/kvm/msr.txt > + */ > + delta = rdtsc() - tsc_timestamp; > + if (shift < 0) > + delta >>= -shift; > + else > + delta <<= shift; > + ctr = ((delta * mul_frac) >> 32) + system_time; > + > + return (ctr); > +} > Index: sys/dev/pv/pvreg.h > =================================================================== > RCS file: /cvs/src/sys/dev/pv/pvreg.h,v > retrieving revision 1.4 > diff -u -p -u -p -r1.4 pvreg.h > --- sys/dev/pv/pvreg.h 12 Dec 2015 12:33:49 -0000 1.4 > +++ sys/dev/pv/pvreg.h 19 Nov 2018 11:48:33 -0000 > @@ -43,6 +43,9 @@ > #define KVM_MSR_EOI_EN 0x4b564d04 > #define KVM_PV_EOI_BIT 0 > > +#define KVM_MSR_WALL_CLOCK 0x4b564d00 > +#define KVM_MSR_SYSTEM_TIME 0x4b564d01 > + > /* > * Hyper-V > */