Re: faster timecounters for kvm/xen

2017-06-18 Thread Mike Belopuhov
On Sun, Jun 18, 2017 at 21:20 +1000, Jonathan Matthew wrote:
> On Fri, Jun 16, 2017 at 10:25:29AM +0200, Mike Belopuhov wrote:
> > Now regarding the diff.  pvbus_init_vcpu.  Ah yes, please.
> > It was a chicken and the egg problem for me: I didn't have
> > Xen, but wanted a callback from cpu_hatch to setup shared
> > info pages and events (interrupt delivery) for all CPUs.
> > So please factor it out and let's get that committed.
> 
> Updated version of this is below.  The init_cpu function pointer is now in
> the pvbus_hv so it's easier to decide what it does at runtime.
>
[...]
> oks on this bit?
>

OK mikeb

> Index: arch/amd64/amd64/cpu.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
> retrieving revision 1.105
> diff -u -p -r1.105 cpu.c
> --- arch/amd64/amd64/cpu.c30 May 2017 15:11:32 -  1.105
> +++ arch/amd64/amd64/cpu.c18 Jun 2017 09:16:12 -
> @@ -67,6 +67,7 @@
>  #include "lapic.h"
>  #include "ioapic.h"
>  #include "vmm.h"
> +#include "pvbus.h"
>  
>  #include 
>  #include 
> @@ -103,6 +104,10 @@
>  #include 
>  #endif
>  
> +#if NPVBUS > 0
> +#include 
> +#endif
> +
>  #include 
>  #include 
>  #include 
> @@ -728,6 +733,9 @@ cpu_hatch(void *v)
>   lldt(0);
>  
>   cpu_init(ci);
> +#if NPVBUS > 0
> + pvbus_init_cpu();
> +#endif
>  
>   /* Re-initialise memory range handling on AP */
>   if (mem_range_softc.mr_op != NULL)
> Index: arch/i386/i386/cpu.c
> ===
> RCS file: /cvs/src/sys/arch/i386/i386/cpu.c,v
> retrieving revision 1.84
> diff -u -p -r1.84 cpu.c
> --- arch/i386/i386/cpu.c  30 May 2017 15:11:32 -  1.84
> +++ arch/i386/i386/cpu.c  18 Jun 2017 09:16:13 -
> @@ -67,6 +67,7 @@
>  #include "lapic.h"
>  #include "ioapic.h"
>  #include "vmm.h"
> +#include "pvbus.h"
>  
>  #include 
>  #include 
> @@ -104,6 +105,10 @@
>  #include 
>  #endif
>  
> +#if NPVBUS > 0
> +#include 
> +#endif
> +
>  #include 
>  #include 
>  #include 
> @@ -626,6 +631,9 @@ cpu_hatch(void *v)
>  
>   ci->ci_curpmap = pmap_kernel();
>   cpu_init(ci);
> +#if NPVBUS > 0
> + pvbus_init_cpu();
> +#endif
>  
>   /* Re-initialise memory range handling on AP */
>   if (mem_range_softc.mr_op != NULL)
> Index: dev/pv/pvbus.c
> ===
> RCS file: /cvs/src/sys/dev/pv/pvbus.c,v
> retrieving revision 1.16
> diff -u -p -r1.16 pvbus.c
> --- dev/pv/pvbus.c10 Jan 2017 17:16:39 -  1.16
> +++ dev/pv/pvbus.c18 Jun 2017 09:16:17 -
> @@ -210,6 +210,19 @@ pvbus_identify(void)
>   has_hv_cpuid = 1;
>  }
>  
> +void
> +pvbus_init_cpu(void)
> +{
> + int i;
> +
> + for (i = 0; i < PVBUS_MAX; i++) {
> + if (pvbus_hv[i].hv_base == 0)
> + continue;
> + if (pvbus_hv[i].hv_init_cpu != NULL)
> + (pvbus_hv[i].hv_init_cpu)(_hv[i]);
> + }
> +}
> +
>  int
>  pvbus_activate(struct device *self, int act)
>  {
> Index: dev/pv/pvvar.h
> ===
> RCS file: /cvs/src/sys/dev/pv/pvvar.h,v
> retrieving revision 1.9
> diff -u -p -r1.9 pvvar.h
> --- dev/pv/pvvar.h10 Jan 2017 17:16:39 -  1.9
> +++ dev/pv/pvvar.h18 Jun 2017 09:16:17 -
> @@ -56,6 +56,7 @@ struct pvbus_hv {
>  
>   void*hv_arg;
>   int (*hv_kvop)(void *, int, char *, char *, size_t);
> + void(*hv_init_cpu)(struct pvbus_hv *);
>  };
>  
>  struct pvbus_softc {
> @@ -77,6 +78,7 @@ struct pv_attach_args {
>  
>  void  pvbus_identify(void);
>  int   pvbus_probe(void);
> +void  pvbus_init_cpu(void);
>  void  pvbus_reboot(struct device *);
>  void  pvbus_shutdown(struct device *);
>  



Re: faster timecounters for kvm/xen

2017-06-18 Thread Jonathan Matthew
On Fri, Jun 16, 2017 at 10:25:29AM +0200, Mike Belopuhov wrote:
> On Fri, Jun 16, 2017 at 16:31 +1000, Jonathan Matthew wrote:
> > Recently I updated the kernel lock profiling stuff I've been working on, 
> > since
> > it  had been rotting a bit since witness was introduced.  Running my diff 
> > on a
> > KVM VM, I found there was a pretty huge performance impact (10 minutes to
> > build a kernel instead of 4), which turned out to be because reading the
> > emulated HPET in KVM is slow, and lock profiling involves a lot of extra
> > clock reads.  The diff below adds a new TSC-based timecounter implementation
> > for KVM and Xen to remedy this.
> > 
> > KVM and Xen provide frequently-updated views of system time from the host to
> > each vcpu in a way that lets the VM get accurate high resolution time 
> > without
> > much work.  Linux calls this mechanism 'pvclock' so I'm doing the same.
> > 
> > The pvclock structure gives you a system time (in nanoseconds), the TSC
> > reading from when the time was updated, and scaling factors for converting 
> > TSC
> > values to nanoseconds.  Usually you subtract the TSC reading in the pvclock
> > structure from a current reading, convert that to nanoseconds, and add it to
> > the system time.  I decided to go the other way in order to keep all the
> > available resolution.
> > 
> > Using pvclock as the timecounter reduces the overhead of lock profiling to
> > almost nothing.  Even without the extra clock reads for lock profiling,
> > it cuts a few seconds off kernel compile time on a 2 vcpu vm.  I've run it
> > for ~12 hours without ntpd and the clock keeps time accurately.
> > 
> > One wrinkle here is that the KVM pvclock mechanism requires setup on each 
> > vcpu,
> > so I added a new pvbus function that gets called from cpu_hatch, allowing 
> > any
> > hypervisor-specific setup to happen there.
> > 
> > I still need to try this on xen, but comments at this stage are welcome.
> >
> 
> Cool!  You've beaten both of us to it :)
> 
> Last time I've tried uebayashi's pvclock on Xen, it didn't
> work for me.  I didn't have time to investigate why but
> probably because we need per-cpu readings.  Which you do
> for KVM.  I'll test this on Xen as soon as I get to the
> office.
> 
> Now regarding the diff.  pvbus_init_vcpu.  Ah yes, please.
> It was a chicken and the egg problem for me: I didn't have
> Xen, but wanted a callback from cpu_hatch to setup shared
> info pages and events (interrupt delivery) for all CPUs.
> So please factor it out and let's get that committed.

Updated version of this is below.  The init_cpu function pointer is now in
the pvbus_hv so it's easier to decide what it does at runtime.

> 
> I don't know if it's a good idea to depend on Xen's
> definition of vcpu_time_info.  I think I have factored
> it out into the pvclock_time_info and put it into the
> pvclockvar.h or something like that.  And then made Xen
> use those definitions instead of its own.  Dunno what's
> the best course of action here.
> 
> But this brings another point: where and how to perform
> the pvclock initialization and attachment.  In your diff
> pvclock_xen_init comes a bit too early: none of the Xen
> things are initialized at that point, shared info page
> isn't allocated.

I'm dropping the xen bits for now, since they need more work in a few different
ways.

> 
> I told Stefan in Munich that perhaps having a kvm.c shim
> that would prepare and attach pvclock (and maybe provide
> some flags and other bells and whistles).
> 
> I think we need to call pvclock attachment from Xen code
> where it's appropriate, not from pvbus code.  Or do a
> config_attach on it.  Why didn't you want to put it in
> its own device driver?

I was hoping it'd remain as simple as the first diff, but since things aren't
going that way I'm happy to reconsider.

> 
> It's nice that this version avoids using assembly. Any idea
> what was the reason for Linux/FreeBSD code to use it?  Were
> they afraid to lose precision maybe?

I think so.  We probably want to try harder to keep the high bits of the system
time since the low 32 bits wrap every 4s or so.


oks on this bit?

Index: arch/amd64/amd64/cpu.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
retrieving revision 1.105
diff -u -p -r1.105 cpu.c
--- arch/amd64/amd64/cpu.c  30 May 2017 15:11:32 -  1.105
+++ arch/amd64/amd64/cpu.c  18 Jun 2017 09:16:12 -
@@ -67,6 +67,7 @@
 #include "lapic.h"
 #include "ioapic.h"
 #include "vmm.h"
+#include "pvbus.h"
 
 #include 
 #include 
@@ -103,6 +104,10 @@
 #include 
 #endif
 
+#if NPVBUS > 0
+#include 
+#endif
+
 #include 
 #include 
 #include 
@@ -728,6 +733,9 @@ cpu_hatch(void *v)
lldt(0);
 
cpu_init(ci);
+#if NPVBUS > 0
+   pvbus_init_cpu();
+#endif
 
/* Re-initialise memory range handling on AP */
if (mem_range_softc.mr_op != NULL)
Index: arch/i386/i386/cpu.c

Re: faster timecounters for kvm/xen

2017-06-17 Thread Mike Belopuhov
On 17 June 2017 at 14:17, Jonathan Matthew  wrote:
>
> On Fri, Jun 16, 2017 at 01:19:02PM +0200, Mike Belopuhov wrote:
> > On Fri, Jun 16, 2017 at 10:25 +0200, Mike Belopuhov wrote:
> > > I don't know if it's a good idea to depend on Xen's
> > > definition of vcpu_time_info.  I think I have factored
> > > it out into the pvclock_time_info and put it into the
> > > pvclockvar.h or something like that.  And then made Xen
> > > use those definitions instead of its own.  Dunno what's
> > > the best course of action here.
> > >
> >
> > This is what I would like to use.  I've stripped the API
> > part, but we can add it as well.  I don't believe this
> > file requires a specific license since there's a handful
> > of pvclock header files out there implementing a common
> > interface so a person committing such a file can add his
> > own copyright.  Opinions?
>
> Looks good to me.  Can we put the #defines for flag bits in there too?
>
> #define PVCLOCK_FLAG_TSC_STABLE_BIT (1 << 0)
> #define PVCLOCK_FLAG_GUEST_STOPPED  (1 << 1)
>
> As far as I can tell, xen doesn't use these, but Linux handles them in its
> common pvclock code anyway.
>

Sure, go for it.


Re: faster timecounters for kvm/xen

2017-06-17 Thread Jonathan Matthew
On Fri, Jun 16, 2017 at 01:19:02PM +0200, Mike Belopuhov wrote:
> On Fri, Jun 16, 2017 at 10:25 +0200, Mike Belopuhov wrote:
> > I don't know if it's a good idea to depend on Xen's
> > definition of vcpu_time_info.  I think I have factored
> > it out into the pvclock_time_info and put it into the
> > pvclockvar.h or something like that.  And then made Xen
> > use those definitions instead of its own.  Dunno what's
> > the best course of action here.
> > 
> 
> This is what I would like to use.  I've stripped the API
> part, but we can add it as well.  I don't believe this
> file requires a specific license since there's a handful
> of pvclock header files out there implementing a common
> interface so a person committing such a file can add his
> own copyright.  Opinions?

Looks good to me.  Can we put the #defines for flag bits in there too?

#define PVCLOCK_FLAG_TSC_STABLE_BIT (1 << 0)
#define PVCLOCK_FLAG_GUEST_STOPPED  (1 << 1)

As far as I can tell, xen doesn't use these, but Linux handles them in its
common pvclock code anyway.

> 
> 
> #ifndef _PV_PVCLOCK_H_
> #define _PV_PVCLOCK_H_
> 
> struct pvclock_vcpu_time_info {
>   volatile uint32_t   version;
>   volatile uint32_t   pad1;
>   volatile uint64_t   tsc_timestamp;
>   volatile uint64_t   system_time;
>   volatile uint32_t   tsc_to_system_mul;
>   volatile int8_t tsc_shift;
>   volatile uint8_tflags;
>   volatile uint8_tpad2[2];
> } __packed;
> 
> struct pvclock_wall_clock {
>   volatile uint32_t   version;
>   volatile uint32_t   sec;
>   volatile uint32_t   nsec;
> } __packed;
> 
> #endif/* _PV_PVCLOCK_H_ */



Re: faster timecounters for kvm/xen

2017-06-16 Thread Mike Larkin
On Fri, Jun 16, 2017 at 11:09:01PM +1000, Jonathan Gray wrote:
> On Fri, Jun 16, 2017 at 02:23:36PM +0200, Mike Belopuhov wrote:
> > On Fri, Jun 16, 2017 at 16:31 +1000, Jonathan Matthew wrote:
> > > Index: arch/i386/include/cpufunc.h
> > > ===
> > > RCS file: /cvs/src/sys/arch/i386/include/cpufunc.h,v
> > > retrieving revision 1.25
> > > diff -u -p -u -p -r1.25 cpufunc.h
> > > --- arch/i386/include/cpufunc.h   27 May 2017 12:21:50 -  1.25
> > > +++ arch/i386/include/cpufunc.h   16 Jun 2017 06:07:16 -
> > > @@ -217,6 +217,15 @@ mfence(void)
> > >   __asm volatile("mfence" : : : "memory");
> > >  }
> > >  
> > > +static __inline u_int64_t
> > > +rdtsc(void)
> > > +{
> > > + uint32_t hi, lo;
> > > +
> > > + __asm volatile("rdtsc" : "=d" (hi), "=a" (lo));
> > > + return (((uint64_t)hi << 32) | (uint64_t) lo);
> > > +}
> > > +
> > >  static __inline void
> > >  wrmsr(u_int msr, u_int64_t newval)
> > >  {
> > 
> > I think it's OK to get this chunk in.  amd64 has got this already.
> > 
> 
> Perhaps make it __asm volatile ("rdtsc" : "=A" (v)); like the pctr.h version?
> 
> That's also what the gcc example uses for rdtsc in
> https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html#Machine-Constraints
> 
> I guess the chance of something pretending to be a 486 without TSC attaching
> pvbus is slim.
> 

I was going to mention the same thing but I think the risk is low, so ok
mlarkin@ on this if it isn't in already.

-ml



Re: faster timecounters for kvm/xen

2017-06-16 Thread Mike Belopuhov
On Fri, Jun 16, 2017 at 10:25 +0200, Mike Belopuhov wrote:
> Last time I've tried uebayashi's pvclock on Xen, it didn't
> work for me.  I didn't have time to investigate why but
> probably because we need per-cpu readings.  Which you do
> for KVM.  I'll test this on Xen as soon as I get to the
> office.
>
[...]
> 
> But this brings another point: where and how to perform
> the pvclock initialization and attachment.  In your diff
> pvclock_xen_init comes a bit too early: none of the Xen
> things are initialized at that point, shared info page
> isn't allocated.
> 

As I've told jmatthew@ privately this doesn't work on Xen,
but I've changed a few things and made it work:
   https://github.com/mbelop/src/commits/pvclock

However, I'm observing a huge drift: 6 minutes in about an
hour so it's not usable as it is. I think this is the same
thing as I saw before.  I'll ponder the per-CPU vcpu_info
path and report back when I've got something.  But in the
meantime I hope that jmatthew@ will check in rdtsc and
pvbus_init_cpu bits (we're lobbying for a s/_vcpu/_cpu/
change. :-)



Re: faster timecounters for kvm/xen

2017-06-16 Thread Mike Belopuhov
On Fri, Jun 16, 2017 at 23:09 +1000, Jonathan Gray wrote:
> On Fri, Jun 16, 2017 at 02:23:36PM +0200, Mike Belopuhov wrote:
> > On Fri, Jun 16, 2017 at 16:31 +1000, Jonathan Matthew wrote:
> > > Index: arch/i386/include/cpufunc.h
> > > ===
> > > RCS file: /cvs/src/sys/arch/i386/include/cpufunc.h,v
> > > retrieving revision 1.25
> > > diff -u -p -u -p -r1.25 cpufunc.h
> > > --- arch/i386/include/cpufunc.h   27 May 2017 12:21:50 -  1.25
> > > +++ arch/i386/include/cpufunc.h   16 Jun 2017 06:07:16 -
> > > @@ -217,6 +217,15 @@ mfence(void)
> > >   __asm volatile("mfence" : : : "memory");
> > >  }
> > >  
> > > +static __inline u_int64_t
> > > +rdtsc(void)
> > > +{
> > > + uint32_t hi, lo;
> > > +
> > > + __asm volatile("rdtsc" : "=d" (hi), "=a" (lo));
> > > + return (((uint64_t)hi << 32) | (uint64_t) lo);
> > > +}
> > > +
> > >  static __inline void
> > >  wrmsr(u_int msr, u_int64_t newval)
> > >  {
> > 
> > I think it's OK to get this chunk in.  amd64 has got this already.
> > 
> 
> Perhaps make it __asm volatile ("rdtsc" : "=A" (v)); like the pctr.h version?
>
> That's also what the gcc example uses for rdtsc in
> https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html#Machine-Constraints
>

Sure.

> I guess the chance of something pretending to be a 486 without TSC attaching
> pvbus is slim.

This file is full of instructions not supported on older CPUs
so no harm done as far as I can tell.



Re: faster timecounters for kvm/xen

2017-06-16 Thread Jonathan Gray
On Fri, Jun 16, 2017 at 02:23:36PM +0200, Mike Belopuhov wrote:
> On Fri, Jun 16, 2017 at 16:31 +1000, Jonathan Matthew wrote:
> > Index: arch/i386/include/cpufunc.h
> > ===
> > RCS file: /cvs/src/sys/arch/i386/include/cpufunc.h,v
> > retrieving revision 1.25
> > diff -u -p -u -p -r1.25 cpufunc.h
> > --- arch/i386/include/cpufunc.h 27 May 2017 12:21:50 -  1.25
> > +++ arch/i386/include/cpufunc.h 16 Jun 2017 06:07:16 -
> > @@ -217,6 +217,15 @@ mfence(void)
> > __asm volatile("mfence" : : : "memory");
> >  }
> >  
> > +static __inline u_int64_t
> > +rdtsc(void)
> > +{
> > +   uint32_t hi, lo;
> > +
> > +   __asm volatile("rdtsc" : "=d" (hi), "=a" (lo));
> > +   return (((uint64_t)hi << 32) | (uint64_t) lo);
> > +}
> > +
> >  static __inline void
> >  wrmsr(u_int msr, u_int64_t newval)
> >  {
> 
> I think it's OK to get this chunk in.  amd64 has got this already.
> 

Perhaps make it __asm volatile ("rdtsc" : "=A" (v)); like the pctr.h version?

That's also what the gcc example uses for rdtsc in
https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html#Machine-Constraints

I guess the chance of something pretending to be a 486 without TSC attaching
pvbus is slim.



Re: faster timecounters for kvm/xen

2017-06-16 Thread Mike Belopuhov
On Fri, Jun 16, 2017 at 16:31 +1000, Jonathan Matthew wrote:
> Index: arch/i386/include/cpufunc.h
> ===
> RCS file: /cvs/src/sys/arch/i386/include/cpufunc.h,v
> retrieving revision 1.25
> diff -u -p -u -p -r1.25 cpufunc.h
> --- arch/i386/include/cpufunc.h   27 May 2017 12:21:50 -  1.25
> +++ arch/i386/include/cpufunc.h   16 Jun 2017 06:07:16 -
> @@ -217,6 +217,15 @@ mfence(void)
>   __asm volatile("mfence" : : : "memory");
>  }
>  
> +static __inline u_int64_t
> +rdtsc(void)
> +{
> + uint32_t hi, lo;
> +
> + __asm volatile("rdtsc" : "=d" (hi), "=a" (lo));
> + return (((uint64_t)hi << 32) | (uint64_t) lo);
> +}
> +
>  static __inline void
>  wrmsr(u_int msr, u_int64_t newval)
>  {

I think it's OK to get this chunk in.  amd64 has got this already.



Re: faster timecounters for kvm/xen

2017-06-16 Thread Mike Belopuhov
On Fri, Jun 16, 2017 at 10:25 +0200, Mike Belopuhov wrote:
> I don't know if it's a good idea to depend on Xen's
> definition of vcpu_time_info.  I think I have factored
> it out into the pvclock_time_info and put it into the
> pvclockvar.h or something like that.  And then made Xen
> use those definitions instead of its own.  Dunno what's
> the best course of action here.
> 

This is what I would like to use.  I've stripped the API
part, but we can add it as well.  I don't believe this
file requires a specific license since there's a handful
of pvclock header files out there implementing a common
interface so a person committing such a file can add his
own copyright.  Opinions?


#ifndef _PV_PVCLOCK_H_
#define _PV_PVCLOCK_H_

struct pvclock_vcpu_time_info {
volatile uint32_t   version;
volatile uint32_t   pad1;
volatile uint64_t   tsc_timestamp;
volatile uint64_t   system_time;
volatile uint32_t   tsc_to_system_mul;
volatile int8_t tsc_shift;
volatile uint8_tflags;
volatile uint8_tpad2[2];
} __packed;

struct pvclock_wall_clock {
volatile uint32_t   version;
volatile uint32_t   sec;
volatile uint32_t   nsec;
} __packed;

#endif  /* _PV_PVCLOCK_H_ */



Re: faster timecounters for kvm/xen

2017-06-16 Thread Mike Belopuhov
On Fri, Jun 16, 2017 at 16:31 +1000, Jonathan Matthew wrote:
> Recently I updated the kernel lock profiling stuff I've been working on, since
> it  had been rotting a bit since witness was introduced.  Running my diff on a
> KVM VM, I found there was a pretty huge performance impact (10 minutes to
> build a kernel instead of 4), which turned out to be because reading the
> emulated HPET in KVM is slow, and lock profiling involves a lot of extra
> clock reads.  The diff below adds a new TSC-based timecounter implementation
> for KVM and Xen to remedy this.
> 
> KVM and Xen provide frequently-updated views of system time from the host to
> each vcpu in a way that lets the VM get accurate high resolution time without
> much work.  Linux calls this mechanism 'pvclock' so I'm doing the same.
> 
> The pvclock structure gives you a system time (in nanoseconds), the TSC
> reading from when the time was updated, and scaling factors for converting TSC
> values to nanoseconds.  Usually you subtract the TSC reading in the pvclock
> structure from a current reading, convert that to nanoseconds, and add it to
> the system time.  I decided to go the other way in order to keep all the
> available resolution.
> 
> Using pvclock as the timecounter reduces the overhead of lock profiling to
> almost nothing.  Even without the extra clock reads for lock profiling,
> it cuts a few seconds off kernel compile time on a 2 vcpu vm.  I've run it
> for ~12 hours without ntpd and the clock keeps time accurately.
> 
> One wrinkle here is that the KVM pvclock mechanism requires setup on each 
> vcpu,
> so I added a new pvbus function that gets called from cpu_hatch, allowing any
> hypervisor-specific setup to happen there.
> 
> I still need to try this on xen, but comments at this stage are welcome.
>

Cool!  You've beaten both of us to it :)

Last time I've tried uebayashi's pvclock on Xen, it didn't
work for me.  I didn't have time to investigate why but
probably because we need per-cpu readings.  Which you do
for KVM.  I'll test this on Xen as soon as I get to the
office.

Now regarding the diff.  pvbus_init_vcpu.  Ah yes, please.
It was a chicken and the egg problem for me: I didn't have
Xen, but wanted a callback from cpu_hatch to setup shared
info pages and events (interrupt delivery) for all CPUs.
So please factor it out and let's get that committed.

I don't know if it's a good idea to depend on Xen's
definition of vcpu_time_info.  I think I have factored
it out into the pvclock_time_info and put it into the
pvclockvar.h or something like that.  And then made Xen
use those definitions instead of its own.  Dunno what's
the best course of action here.

But this brings another point: where and how to perform
the pvclock initialization and attachment.  In your diff
pvclock_xen_init comes a bit too early: none of the Xen
things are initialized at that point, shared info page
isn't allocated.

I told Stefan in Munich that perhaps having a kvm.c shim
that would prepare and attach pvclock (and maybe provide
some flags and other bells and whistles).

I think we need to call pvclock attachment from Xen code
where it's appropriate, not from pvbus code.  Or do a
config_attach on it.  Why didn't you want to put it in
its own device driver?

It's nice that this version avoids using assembly. Any idea
what was the reason for Linux/FreeBSD code to use it?  Were
they afraid to lose precision maybe?

In any case, good job, lets try to get this in.



faster timecounters for kvm/xen

2017-06-16 Thread Jonathan Matthew
Recently I updated the kernel lock profiling stuff I've been working on, since
it  had been rotting a bit since witness was introduced.  Running my diff on a
KVM VM, I found there was a pretty huge performance impact (10 minutes to
build a kernel instead of 4), which turned out to be because reading the
emulated HPET in KVM is slow, and lock profiling involves a lot of extra
clock reads.  The diff below adds a new TSC-based timecounter implementation
for KVM and Xen to remedy this.

KVM and Xen provide frequently-updated views of system time from the host to
each vcpu in a way that lets the VM get accurate high resolution time without
much work.  Linux calls this mechanism 'pvclock' so I'm doing the same.

The pvclock structure gives you a system time (in nanoseconds), the TSC
reading from when the time was updated, and scaling factors for converting TSC
values to nanoseconds.  Usually you subtract the TSC reading in the pvclock
structure from a current reading, convert that to nanoseconds, and add it to
the system time.  I decided to go the other way in order to keep all the
available resolution.

Using pvclock as the timecounter reduces the overhead of lock profiling to
almost nothing.  Even without the extra clock reads for lock profiling,
it cuts a few seconds off kernel compile time on a 2 vcpu vm.  I've run it
for ~12 hours without ntpd and the clock keeps time accurately.

One wrinkle here is that the KVM pvclock mechanism requires setup on each vcpu,
so I added a new pvbus function that gets called from cpu_hatch, allowing any
hypervisor-specific setup to happen there.

I still need to try this on xen, but comments at this stage are welcome.

Index: arch/i386/i386/cpu.c
===
RCS file: /cvs/src/sys/arch/i386/i386/cpu.c,v
retrieving revision 1.84
diff -u -p -u -p -r1.84 cpu.c
--- arch/i386/i386/cpu.c30 May 2017 15:11:32 -  1.84
+++ arch/i386/i386/cpu.c16 Jun 2017 06:07:16 -
@@ -67,6 +67,7 @@
 #include "lapic.h"
 #include "ioapic.h"
 #include "vmm.h"
+#include "pvbus.h"
 
 #include 
 #include 
@@ -104,6 +105,10 @@
 #include 
 #endif
 
+#if NPVBUS > 0
+#include 
+#endif
+
 #include 
 #include 
 #include 
@@ -626,6 +631,9 @@ cpu_hatch(void *v)
 
ci->ci_curpmap = pmap_kernel();
cpu_init(ci);
+#if NPVBUS > 0
+   pvbus_init_vcpu();
+#endif
 
/* Re-initialise memory range handling on AP */
if (mem_range_softc.mr_op != NULL)
Index: arch/i386/include/cpufunc.h
===
RCS file: /cvs/src/sys/arch/i386/include/cpufunc.h,v
retrieving revision 1.25
diff -u -p -u -p -r1.25 cpufunc.h
--- arch/i386/include/cpufunc.h 27 May 2017 12:21:50 -  1.25
+++ arch/i386/include/cpufunc.h 16 Jun 2017 06:07:16 -
@@ -217,6 +217,15 @@ mfence(void)
__asm volatile("mfence" : : : "memory");
 }
 
+static __inline u_int64_t
+rdtsc(void)
+{
+   uint32_t hi, lo;
+
+   __asm volatile("rdtsc" : "=d" (hi), "=a" (lo));
+   return (((uint64_t)hi << 32) | (uint64_t) lo);
+}
+
 static __inline void
 wrmsr(u_int msr, u_int64_t newval)
 {
Index: arch/amd64/amd64/cpu.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/cpu.c,v
retrieving revision 1.105
diff -u -p -u -p -r1.105 cpu.c
--- arch/amd64/amd64/cpu.c  30 May 2017 15:11:32 -  1.105
+++ arch/amd64/amd64/cpu.c  16 Jun 2017 06:07:16 -
@@ -67,6 +67,7 @@
 #include "lapic.h"
 #include "ioapic.h"
 #include "vmm.h"
+#include "pvbus.h"
 
 #include 
 #include 
@@ -103,6 +104,10 @@
 #include 
 #endif
 
+#if NPVBUS > 0
+#include 
+#endif
+
 #include 
 #include 
 #include 
@@ -728,6 +733,9 @@ cpu_hatch(void *v)
lldt(0);
 
cpu_init(ci);
+#if NPVBUS > 0
+   pvbus_init_vcpu();
+#endif
 
/* Re-initialise memory range handling on AP */
if (mem_range_softc.mr_op != NULL)
Index: dev/pv/files.pv
===
RCS file: /cvs/src/sys/dev/pv/files.pv,v
retrieving revision 1.13
diff -u -p -u -p -r1.13 files.pv
--- dev/pv/files.pv 14 Jun 2017 10:25:40 -  1.13
+++ dev/pv/files.pv 16 Jun 2017 06:07:16 -
@@ -75,3 +75,6 @@ file  dev/pv/vioscsi.cvioscsi
 device vmmci
 attach vmmci at virtio
 file   dev/pv/vmmci.c  vmmci
+
+# paravirtualized clock, used by kvm and xen
+filedev/pv/pvclock.c
Index: dev/pv/pvbus.c
===
RCS file: /cvs/src/sys/dev/pv/pvbus.c,v
retrieving revision 1.16
diff -u -p -u -p -r1.16 pvbus.c
--- dev/pv/pvbus.c  10 Jan 2017 17:16:39 -  1.16
+++ dev/pv/pvbus.c  16 Jun 2017 06:07:16 -
@@ -57,6 +57,7 @@ intpvbus_print(void *, const char *);
 int pvbus_search(struct device *, void *, void *);
 
 voidpvbus_kvm(struct pvbus_hv *);
+voidpvbus_kvm_init_vcpu(struct pvbus_hv *);