Re: [RFC PATCH] [vmm] Lightweight mechanism to kick virtio VQs

2018-11-23 Thread Sergio Lopez
On Fri, Oct 19, 2018 at 08:32:52PM +0200, Sergio Lopez wrote:
> Yes, I ran some random read and random write tests with fio. The Guest
> is Linux, mainly because its virtio implementation is more mature. Next
> week I'll try with OpenBSD.

Here are the (long due) numbers with an OpenBSD 6.4 Guest. While on
Linux all tests performed better with kickfd, this time randread gets a
~10% improvement, while randwrite numbers are ~5% worse.

Please keep in mind that those are just the initial numbers. There's
still a lot of optimizations that can be applied on both the vmd side
(especially on writes, with things like request merging and asynchronous
I/O) and the Guest drivers (OpenBSD performance is roughly half of
Linux's, which indicates there's plenty of room for improvement).

In any case, I'd like to know if you think this kickfd feature is
something worth pursuing, or should I move to something else.

Cheers,
Sergio.



| OpenBSD 6.4 + kickfd |


 - randread

# fio --clocksource=clock_gettime --name=randread --rw=randread --bs=4k 
--filename=/dev/sd1c --size=1g --numjobs=1 --ioengine=sync --iodepth=1 
--direct=0 --group_reporting
randread: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=sync, iodepth=1
fio-3.12-17-g0fcbc0
Starting 1 thread
Jobs: 1 (f=1): [r(1)][100.0%][r=24.5MiB/s][r=6278 IOPS][eta 00m:00s]
randread: (groupid=0, jobs=1): err= 0: pid=24580: Fri Nov 23 09:59:54 2018
  read: IOPS=6120, BW=23.9MiB/s (25.1MB/s)(1024MiB/42831msec)
clat (usec): min=10, max=10126, avg=127.91, stdev=36.28
 lat (usec): min=19, max=10134, avg=136.25, stdev=36.58
clat percentiles (usec):
 |  1.00th=[  110],  5.00th=[  112], 10.00th=[  113], 20.00th=[  119],
 | 30.00th=[  121], 40.00th=[  126], 50.00th=[  129], 60.00th=[  131],
 | 70.00th=[  131], 80.00th=[  133], 90.00th=[  135], 95.00th=[  145],
 | 99.00th=[  190], 99.50th=[  208], 99.90th=[  255], 99.95th=[  318],
 | 99.99th=[  766]
   bw (  KiB/s): min= 2669, max=24660, per=61.01%, avg=14936.35, stdev=7445.54, 
samples=75
   iops: min=  667, max= 6165, avg=3733.80, stdev=1861.45, samples=75
  lat (usec)   : 20=0.03%, 50=0.01%, 100=0.01%, 250=99.84%, 500=0.08%
  lat (usec)   : 750=0.02%, 1000=0.01%
  lat (msec)   : 2=0.01%, 4=0.01%, 10=0.01%, 20=0.01%
  cpu  : usr=0.69%, sys=41.15%, ctx=261996, majf=1, minf=3
  IO depths: 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
 submit: 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
 complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
 issued rwts: total=262144,0,0,0 short=0,0,0,0 dropped=0,0,0,0
 latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
   READ: bw=23.9MiB/s (25.1MB/s), 23.9MiB/s-23.9MiB/s (25.1MB/s-25.1MB/s), 
io=1024MiB (1074MB), run=42831-42831msec


# ./fio --clocksource=clock_gettime --name=randread --rw=randread --bs=4k 
--filename=/dev/sd1c --size=1g --numjobs=2 --ioengine=sync --iodepth=1 
--direct=0 --group_reporting
fio: this platform does not support process shared mutexes, forcing use of 
threads. Use the 'thread' option to get rid of this warning.
randread: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=sync, iodepth=1
...
fio-3.12-17-g0fcbc0
Starting 2 threads
Jobs: 1 (f=1): [r(1),_(1)][98.8%][r=23.2MiB/s][r=5947 IOPS][eta 00m:01s]
randread: (groupid=0, jobs=2): err= 0: pid=1989935168: Fri Nov 23 10:03:27 2018
  read: IOPS=6366, BW=24.9MiB/s (26.1MB/s)(2048MiB/82356msec)
clat (usec): min=10, max=1104.9k, avg=187.28, stdev=4505.64
 lat (usec): min=19, max=1104.9k, avg=221.12, stdev=5417.17
clat percentiles (usec):
 |  1.00th=[20],  5.00th=[   110], 10.00th=[   113], 20.00th=[   118],
 | 30.00th=[   119], 40.00th=[   120], 50.00th=[   121], 60.00th=[   122],
 | 70.00th=[   127], 80.00th=[   133], 90.00th=[   135], 95.00th=[   135],
 | 99.00th=[   184], 99.50th=[   221], 99.90th=[   537], 99.95th=[   906],
 | 99.99th=[225444]
   bw (  KiB/s): min=  110, max=24515, per=18.61%, avg=4738.70, stdev=5790.82, 
samples=242
   iops: min=   27, max= 6128, avg=1184.30, stdev=1447.69, samples=242
  lat (usec)   : 20=1.53%, 50=1.88%, 100=0.04%, 250=96.25%, 500=0.17%
  lat (usec)   : 750=0.07%, 1000=0.01%
  lat (msec)   : 2=0.01%, 4=0.01%, 20=0.01%, 50=0.02%, 100=0.01%
  lat (msec)   : 250=0.02%, 500=0.01%, 750=0.01%
  cpu  : usr=0.32%, sys=25.10%, ctx=638508, majf=0, minf=4
  IO depths: 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
 submit: 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
 complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
 issued rwts: total=524288,0,0,0 short=0,0,0,0 dropped=0,0,0,0
 latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
   READ: bw=2

Re: [RFC PATCH] [vmm] Lightweight mechanism to kick virtio VQs

2018-10-19 Thread Sergio Lopez
On Wed, Oct 17, 2018 at 02:22:23PM -0700, Mike Larkin wrote:
> On Fri, Oct 05, 2018 at 12:07:13PM +0200, Sergio Lopez wrote:
> > Hi,
> > 
> > This patch implements a mechanism to allow users to register an I/O port
> > with a special file descriptor (kickfd) which can be monitored for
> > events using kevent. The kernel will note an event each time the Guest
> > writes to an I/O port registered with a kickfd.
> 
> > 
> > This is mainly intended for kicking virtio virtqueues, and allows the
> > VCPU to return immediately to the Guest, instead of having to exit all
> > the way to vmd to process the request. With kickfd, the actual work
> > (i.e. an I/O operation) requested by the Guest can be done in a
> > parallel thread that eventually asserts the corresponding IRQ.
> 
> The reason our virtio is so slow today is, as you point out, we process
> the I/O sequentially with the queue notify back in vmd. To this end, there
> are three "performance overhead components" involved:
> 
> 1. the amount of time it takes to return from the vmm run ioctl back to
>vmd
> 2. the time it takes to process the events in the queue
> 3. the time it takes to re-enter the vm back to the point after the OUT
>instruction that did the queue notify.
> 
> I've benchmarked this before; the biggest performance penalty here is in
> #2. The exit and re-entry overheads are minuscule compared to the overhead
> of the I/O itself.

I think that, in addition to the absolute number of cycles spent on each
approach, we should also consider where they are being spent.

The kickfd approach is probably more expensive (ignoring TLB flushes,
which may end making it cheaper) than having the vCPU do the actual I/O,
but it allows the latter to return much earlier to the Guest, resulting
in a significantly lower submission latency and more CPU time available
for the VM.

> Optimizing #2 is therefore the most bang for the buck. We (dlg@ and I)
> had a taskq-based diff to do this previously (I'm not sure what state it
> was left in). In that diff, the I/O was processed in a set of worker threads,
> effectively eliminating the overhead for #2.

I had to partially rewrite the vioblk_notifyq function, as it didn't
play nice with concurrent access (Guest + iothread), but I haven't
implemented any optimization for #2.

There's the option of merging requests and using worker threads to
avoid having the iothread actively waiting for the I/O, but this
would significantly increase the complexity of vmd's virtio code, so
must be done with care (it's the problem behind *many* qemu issues).

> Your diff (if I read it correctly), optimizes all 3 cases, and that's good.
> I'll admit it's not a way I thought of solving the performance issue, and
> I'm not sure if this is the right way or if the taskq way is better. The
> taskq way is a bit cleaner and easier to understand (there is no separate
> backchannel from vmm->vmd, but it doesn't look too crazy from what I
> read below).
> 
> Do you have some performance measurements with this diff applied vs without?

Yes, I ran some random read and random write tests with fio. The Guest
is Linux, mainly because its virtio implementation is more mature. Next
week I'll try with OpenBSD.

In a general sense, the kickfd approach is similar or slightly better
than the current one when generating I/O from just one process, and
significantly better (up to 2x the throughput) with 2 and 4 processes.

The test machine is an AMD Phenom II X4 (yeah, not the best thing for
this kind of tests, but the only non-virtual environment I have here)
with a Samsung SSD Evo 850. OpenBSD's kernel is running with HZ=1000 and
Linux's with HZ=100 (otherwise I couldn't get reliable numbers due to
missing ticks). The target device is backed by a raw image.

I'm pasting the raw output for fio below.

Sergio (slp).


==
| Linux + kickfd |
==

 - randread

[root@localhost ~]# fio --clocksource=clock_gettime --name=randread 
--rw=randread --bs=4k --filename=/dev/vdb --size=1g --numjobs=1 --ioengine=sync 
--iodepth=1 --direct=1 --group_reporting 
randread: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=sync, iodepth=1
fio-3.3
Starting 1 process
Jobs: 1 (f=1): [r(1)][95.7%][r=49.3MiB/s,w=0KiB/s][r=12.6k,w=0 IOPS][eta 
00m:01s]
randread: (groupid=0, jobs=1): err= 0: pid=318: Fri Oct 19 18:36:02 2018
   read: IOPS=11.5k, BW=44.8MiB/s (46.0MB/s)(1024MiB/22850msec)
clat (nsec): min=0, max=2k, avg=85372.92, stdev=924160.90
 lat (nsec): min=0, max=2k, avg=85525.51, stdev=924971.98
clat percentiles (nsec):
 |  1.00th=[   0],  5.00th=[   0], 10.00th=[   0],
 | 20.00th=[   0], 30.00th=[   0], 40.00th=[   0],
 | 50.00th=[   0], 60.00th=[   0], 70.00th=[   0],
 | 80.00th=[   0], 90.00th=[   0], 95.00th=[   0],
 | 99.00th=[   0], 99.50th=[10027008], 99.90th=[10027008],
 | 99.95th=[10027008], 99.99th=[10027008]
  

Re: [RFC PATCH] [vmm] Lightweight mechanism to kick virtio VQs

2018-10-17 Thread Mike Larkin
On Fri, Oct 05, 2018 at 12:07:13PM +0200, Sergio Lopez wrote:
> Hi,
> 
> This patch implements a mechanism to allow users to register an I/O port
> with a special file descriptor (kickfd) which can be monitored for
> events using kevent. The kernel will note an event each time the Guest
> writes to an I/O port registered with a kickfd.

> 
> This is mainly intended for kicking virtio virtqueues, and allows the
> VCPU to return immediately to the Guest, instead of having to exit all
> the way to vmd to process the request. With kickfd, the actual work
> (i.e. an I/O operation) requested by the Guest can be done in a
> parallel thread that eventually asserts the corresponding IRQ.

The reason our virtio is so slow today is, as you point out, we process
the I/O sequentially with the queue notify back in vmd. To this end, there
are three "performance overhead components" involved:

1. the amount of time it takes to return from the vmm run ioctl back to
   vmd
2. the time it takes to process the events in the queue
3. the time it takes to re-enter the vm back to the point after the OUT
   instruction that did the queue notify.

I've benchmarked this before; the biggest performance penalty here is in
#2. The exit and re-entry overheads are minuscule compared to the overhead
of the I/O itself.

Optimizing #2 is therefore the most bang for the buck. We (dlg@ and I)
had a taskq-based diff to do this previously (I'm not sure what state it
was left in). In that diff, the I/O was processed in a set of worker threads,
effectively eliminating the overhead for #2.

Your diff (if I read it correctly), optimizes all 3 cases, and that's good.
I'll admit it's not a way I thought of solving the performance issue, and
I'm not sure if this is the right way or if the taskq way is better. The
taskq way is a bit cleaner and easier to understand (there is no separate
backchannel from vmm->vmd, but it doesn't look too crazy from what I
read below).

Do you have some performance measurements with this diff applied vs without?

PS don't worry about i386.

-ml

> 
> In a general sense, this improves the I/O performance when the Guest is
> issuing asynchronous operations (or its virtio implementation is able
> to access the virtqueue asynchronously) and makes it more responsive
> (as the VCPU spends less time outside the Guest), but it may hurt a bit
> when the Guest is actively waiting for the operation to finish. This is
> also a first step towards the possibility of having virtio devices running
> on separate processes, but I'll write more about this on another email
> thread.
> 
> I deliberately omitted the i386 implementation to make this easier to
> review.
> 
> Sergio (slp).
> 
> 
> Index: arch/amd64/amd64/vmm.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.220
> diff -u -p -r1.220 vmm.c
> --- arch/amd64/amd64/vmm.c20 Sep 2018 14:32:59 -  1.220
> +++ arch/amd64/amd64/vmm.c5 Oct 2018 09:54:34 -
> @@ -20,6 +20,9 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -63,6 +66,16 @@ void *l1tf_flush_region;
>  #define VMX_EXIT_INFO_COMPLETE   \
>  (VMX_EXIT_INFO_HAVE_RIP | VMX_EXIT_INFO_HAVE_REASON)
>  
> +struct kickfd {
> + uint16_t ioport;
> + struct klist note;
> + struct vm*vm;
> +
> + SLIST_ENTRY(kickfd)  kickfd_link;
> +};
> +
> +SLIST_HEAD(kickfd_head, kickfd);
> +
>  struct vm {
>   vm_map_t vm_map;
>   uint32_t vm_id;
> @@ -77,6 +90,9 @@ struct vm {
>   u_intvm_vcpus_running;
>   struct rwlockvm_vcpu_lock;
>  
> + struct kickfd_head   vm_kickfd_list;
> + struct rwlockvm_kickfd_lock;
> +
>   SLIST_ENTRY(vm)  vm_link;
>  };
>  
> @@ -122,6 +138,7 @@ int vm_get_info(struct vm_info_params *)
>  int vm_resetcpu(struct vm_resetcpu_params *);
>  int vm_intr_pending(struct vm_intr_params *);
>  int vm_rwregs(struct vm_rwregs_params *, int);
> +int vm_register_kickfd(struct vm_regkickfd_params *, struct proc *);
>  int vm_find(uint32_t, struct vm **);
>  int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
>  int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
> @@ -281,6 +298,7 @@ const struct {
>  /* Pools for VMs and VCPUs */
>  struct pool vm_pool;
>  struct pool vcpu_pool;
> +struct pool kickfd_pool;
>  
>  struct vmm_softc *vmm_softc;
>  
> @@ -419,6 +437,8 @@ vmm_attach(struct device *parent, struct
>   "vmpool", NULL);
>   pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
>   "vcpupl", NULL);
> + pool_init(&kickfd_pool, sizeof(struct kickfd), 64, IPL_NONE, PR_WAITOK,
> + "kickfdpl", NULL);
>  
>   vmm_softc = sc;
>  }
> @@ -482,6 +502,9 @@ vmmioc

[RFC PATCH] [vmm] Lightweight mechanism to kick virtio VQs

2018-10-05 Thread Sergio Lopez
Hi,

This patch implements a mechanism to allow users to register an I/O port
with a special file descriptor (kickfd) which can be monitored for
events using kevent. The kernel will note an event each time the Guest
writes to an I/O port registered with a kickfd.

This is mainly intended for kicking virtio virtqueues, and allows the
VCPU to return immediately to the Guest, instead of having to exit all
the way to vmd to process the request. With kickfd, the actual work
(i.e. an I/O operation) requested by the Guest can be done in a
parallel thread that eventually asserts the corresponding IRQ.

In a general sense, this improves the I/O performance when the Guest is
issuing asynchronous operations (or its virtio implementation is able
to access the virtqueue asynchronously) and makes it more responsive
(as the VCPU spends less time outside the Guest), but it may hurt a bit
when the Guest is actively waiting for the operation to finish. This is
also a first step towards the possibility of having virtio devices running
on separate processes, but I'll write more about this on another email
thread.

I deliberately omitted the i386 implementation to make this easier to
review.

Sergio (slp).


Index: arch/amd64/amd64/vmm.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
retrieving revision 1.220
diff -u -p -r1.220 vmm.c
--- arch/amd64/amd64/vmm.c  20 Sep 2018 14:32:59 -  1.220
+++ arch/amd64/amd64/vmm.c  5 Oct 2018 09:54:34 -
@@ -20,6 +20,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -63,6 +66,16 @@ void *l1tf_flush_region;
 #define VMX_EXIT_INFO_COMPLETE \
 (VMX_EXIT_INFO_HAVE_RIP | VMX_EXIT_INFO_HAVE_REASON)
 
+struct kickfd {
+   uint16_t ioport;
+   struct klist note;
+   struct vm*vm;
+
+   SLIST_ENTRY(kickfd)  kickfd_link;
+};
+
+SLIST_HEAD(kickfd_head, kickfd);
+
 struct vm {
vm_map_t vm_map;
uint32_t vm_id;
@@ -77,6 +90,9 @@ struct vm {
u_intvm_vcpus_running;
struct rwlockvm_vcpu_lock;
 
+   struct kickfd_head   vm_kickfd_list;
+   struct rwlockvm_kickfd_lock;
+
SLIST_ENTRY(vm)  vm_link;
 };
 
@@ -122,6 +138,7 @@ int vm_get_info(struct vm_info_params *)
 int vm_resetcpu(struct vm_resetcpu_params *);
 int vm_intr_pending(struct vm_intr_params *);
 int vm_rwregs(struct vm_rwregs_params *, int);
+int vm_register_kickfd(struct vm_regkickfd_params *, struct proc *);
 int vm_find(uint32_t, struct vm **);
 int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
 int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
@@ -281,6 +298,7 @@ const struct {
 /* Pools for VMs and VCPUs */
 struct pool vm_pool;
 struct pool vcpu_pool;
+struct pool kickfd_pool;
 
 struct vmm_softc *vmm_softc;
 
@@ -419,6 +437,8 @@ vmm_attach(struct device *parent, struct
"vmpool", NULL);
pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
"vcpupl", NULL);
+   pool_init(&kickfd_pool, sizeof(struct kickfd), 64, IPL_NONE, PR_WAITOK,
+   "kickfdpl", NULL);
 
vmm_softc = sc;
 }
@@ -482,6 +502,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t 
case VMM_IOC_WRITEREGS:
ret = vm_rwregs((struct vm_rwregs_params *)data, 1);
break;
+   case VMM_IOC_REGKICKFD:
+   ret = vm_register_kickfd((struct vm_regkickfd_params *)data, p);
+   break;
default:
DPRINTF("%s: unknown ioctl code 0x%lx\n", __func__, cmd);
ret = ENOTTY;
@@ -513,6 +536,7 @@ pledge_ioctl_vmm(struct proc *p, long co
case VMM_IOC_INTR:
case VMM_IOC_READREGS:
case VMM_IOC_WRITEREGS:
+   case VMM_IOC_REGKICKFD:
return (0);
}
 
@@ -725,6 +749,176 @@ vm_rwregs(struct vm_rwregs_params *vrwp,
}
 }
 
+void
+filt_vmmkickfd_detach(struct knote *kn)
+{
+   struct kickfd *kickfd = kn->kn_fp->f_data;
+
+   rw_enter_write(&kickfd->vm->vm_kickfd_lock);
+   SLIST_REMOVE(&kickfd->note, kn, knote, kn_selnext);
+   rw_exit_write(&kickfd->vm->vm_kickfd_lock);
+}
+
+int
+filt_vmmkickfd_read(struct knote *kn, long hint)
+{
+   return(1);
+}
+
+struct filterops vmmkickfd_filtops =
+   { 1, NULL, filt_vmmkickfd_detach, filt_vmmkickfd_read };
+
+int
+vmmkickfd_read(struct file *fp, struct uio *uio, int fflags)
+{
+   return (EINVAL);
+}
+
+int
+vmmkickfd_write(struct file *fp, struct uio *uio, int fflags)
+{
+   return (EINVAL);
+}
+
+int
+vmmkickfd_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
+{
+   return (ENOTTY);
+}
+
+int
+vmmkickfd_poll(struct file *fp, int events, struct proc *p)
+{
+   return (EINVAL);
+}
+
+int
+vmmkickfd_