Re: [RFC PATCH] [vmm] Lightweight mechanism to kick virtio VQs

2018-11-23 Thread Sergio Lopez
On Fri, Oct 19, 2018 at 08:32:52PM +0200, Sergio Lopez wrote:
> Yes, I ran some random read and random write tests with fio. The Guest
> is Linux, mainly because its virtio implementation is more mature. Next
> week I'll try with OpenBSD.

Here are the (long overdue) numbers with an OpenBSD 6.4 Guest. While on
Linux all tests performed better with kickfd, this time randread gets a
~10% improvement, while randwrite numbers are ~5% worse.

Please keep in mind that these are just the initial numbers. There are
still a lot of optimizations that can be applied on both the vmd side
(especially on writes, with things like request merging and asynchronous
I/O) and the Guest drivers (OpenBSD performance is roughly half of
Linux's, which indicates there's plenty of room for improvement).

In any case, I'd like to know whether you think this kickfd feature is
something worth pursuing, or whether I should move on to something else.

Cheers,
Sergio.



========================
| OpenBSD 6.4 + kickfd |
========================


 - randread

# fio --clocksource=clock_gettime --name=randread --rw=randread --bs=4k 
--filename=/dev/sd1c --size=1g --numjobs=1 --ioengine=sync --iodepth=1 
--direct=0 --group_reporting
randread: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=sync, iodepth=1
fio-3.12-17-g0fcbc0
Starting 1 thread
Jobs: 1 (f=1): [r(1)][100.0%][r=24.5MiB/s][r=6278 IOPS][eta 00m:00s]
randread: (groupid=0, jobs=1): err= 0: pid=24580: Fri Nov 23 09:59:54 2018
  read: IOPS=6120, BW=23.9MiB/s (25.1MB/s)(1024MiB/42831msec)
clat (usec): min=10, max=10126, avg=127.91, stdev=36.28
 lat (usec): min=19, max=10134, avg=136.25, stdev=36.58
clat percentiles (usec):
 |  1.00th=[  110],  5.00th=[  112], 10.00th=[  113], 20.00th=[  119],
 | 30.00th=[  121], 40.00th=[  126], 50.00th=[  129], 60.00th=[  131],
 | 70.00th=[  131], 80.00th=[  133], 90.00th=[  135], 95.00th=[  145],
 | 99.00th=[  190], 99.50th=[  208], 99.90th=[  255], 99.95th=[  318],
 | 99.99th=[  766]
   bw (  KiB/s): min= 2669, max=24660, per=61.01%, avg=14936.35, stdev=7445.54, 
samples=75
   iops: min=  667, max= 6165, avg=3733.80, stdev=1861.45, samples=75
  lat (usec)   : 20=0.03%, 50=0.01%, 100=0.01%, 250=99.84%, 500=0.08%
  lat (usec)   : 750=0.02%, 1000=0.01%
  lat (msec)   : 2=0.01%, 4=0.01%, 10=0.01%, 20=0.01%
  cpu  : usr=0.69%, sys=41.15%, ctx=261996, majf=1, minf=3
  IO depths: 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
 submit: 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
 complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
 issued rwts: total=262144,0,0,0 short=0,0,0,0 dropped=0,0,0,0
 latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
   READ: bw=23.9MiB/s (25.1MB/s), 23.9MiB/s-23.9MiB/s (25.1MB/s-25.1MB/s), 
io=1024MiB (1074MB), run=42831-42831msec


# ./fio --clocksource=clock_gettime --name=randread --rw=randread --bs=4k 
--filename=/dev/sd1c --size=1g --numjobs=2 --ioengine=sync --iodepth=1 
--direct=0 --group_reporting
fio: this platform does not support process shared mutexes, forcing use of 
threads. Use the 'thread' option to get rid of this warning.
randread: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=sync, iodepth=1
...
fio-3.12-17-g0fcbc0
Starting 2 threads
Jobs: 1 (f=1): [r(1),_(1)][98.8%][r=23.2MiB/s][r=5947 IOPS][eta 00m:01s]
randread: (groupid=0, jobs=2): err= 0: pid=1989935168: Fri Nov 23 10:03:27 2018
  read: IOPS=6366, BW=24.9MiB/s (26.1MB/s)(2048MiB/82356msec)
clat (usec): min=10, max=1104.9k, avg=187.28, stdev=4505.64
 lat (usec): min=19, max=1104.9k, avg=221.12, stdev=5417.17
clat percentiles (usec):
 |  1.00th=[20],  5.00th=[   110], 10.00th=[   113], 20.00th=[   118],
 | 30.00th=[   119], 40.00th=[   120], 50.00th=[   121], 60.00th=[   122],
 | 70.00th=[   127], 80.00th=[   133], 90.00th=[   135], 95.00th=[   135],
 | 99.00th=[   184], 99.50th=[   221], 99.90th=[   537], 99.95th=[   906],
 | 99.99th=[225444]
   bw (  KiB/s): min=  110, max=24515, per=18.61%, avg=4738.70, stdev=5790.82, 
samples=242
   iops: min=   27, max= 6128, avg=1184.30, stdev=1447.69, samples=242
  lat (usec)   : 20=1.53%, 50=1.88%, 100=0.04%, 250=96.25%, 500=0.17%
  lat (usec)   : 750=0.07%, 1000=0.01%
  lat (msec)   : 2=0.01%, 4=0.01%, 20=0.01%, 50=0.02%, 100=0.01%
  lat (msec)   : 250=0.02%, 500=0.01%, 750=0.01%
  cpu  : usr=0.32%, sys=25.10%, ctx=638508, majf=0, minf=4
  IO depths: 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
 submit: 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
 complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
 issued rwts: total=524288,0,0,0 short=0,0,0,0 dropped=0,0,0,0
 latency   : target=0, window=0, percentile=100.00%, depth=1

Run status grou

Re: vmd: servicing virtio devices from separate processes

2018-10-22 Thread Sergio Lopez
On Sat, Oct 20, 2018 at 10:09:39AM -0700, Pratik Vyas wrote:
> * David Gwynne  [2018-10-20 12:19:56 +1000]:
> 
> > 
> > Would sending and receiving a VM still work if I/O is run in different 
> > processes?
> > 
> > dlg
> > 
> 
> Hi dlg,
> 
> It will have to be reworked completely but can be done, I think.

Actually, the high-level picture doesn't change. You still need to stop
the vCPUs and dump the RAM and the state of the devices.

The difference is that right now vmd does all the work itself, while with
separate processes it would need to request each of them to flush its
buffers, finalize or freeze pending requests, and dump its state to either
the same file or a different one.
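Purely as an illustration of that extra coordination step (none of this
exists in vmd today; the message names and the handler below are made up),
the device-process side could boil down to handling a small set of control
requests from the parent:

/*
 * Hypothetical sketch only: illustrates what "flush, freeze and dump"
 * could look like from a device process's point of view. The message
 * names, the handler and the dump_fd argument are invented for this
 * example and are not part of any existing vmd/vmm interface.
 */
enum devproc_ctl {
	DEVPROC_PAUSE,		/* stop taking new virtqueue kicks */
	DEVPROC_FLUSH,		/* finish or freeze in-flight requests */
	DEVPROC_DUMP_STATE,	/* serialize device state to dump_fd */
	DEVPROC_RESUME		/* go back to normal operation */
};

static int
devproc_handle_ctl(enum devproc_ctl msg, int dump_fd)
{
	switch (msg) {
	case DEVPROC_PAUSE:
		/* stop servicing the kickfd, let the current request finish */
		return (0);
	case DEVPROC_FLUSH:
		/* flush buffers, freeze anything still pending */
		return (0);
	case DEVPROC_DUMP_STATE:
		/* write virtqueue indices + device config to dump_fd */
		return (0);
	case DEVPROC_RESUME:
		return (0);
	}
	return (-1);
}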

Sergio (slp).



Re: [RFC PATCH] [vmm] Lightweight mechanism to kick virtio VQs

2018-10-19 Thread Sergio Lopez
On Wed, Oct 17, 2018 at 02:22:23PM -0700, Mike Larkin wrote:
> On Fri, Oct 05, 2018 at 12:07:13PM +0200, Sergio Lopez wrote:
> > Hi,
> > 
> > This patch implements a mechanism to allow users to register an I/O port
> > with a special file descriptor (kickfd) which can be monitored for
> > events using kevent. The kernel will note an event each time the Guest
> > writes to an I/O port registered with a kickfd.
> 
> > 
> > This is mainly intended for kicking virtio virtqueues, and allows the
> > VCPU to return immediately to the Guest, instead of having to exit all
> > the way to vmd to process the request. With kickfd, the actual work
> > (i.e. an I/O operation) requested by the Guest can be done in a
> > parallel thread that eventually asserts the corresponding IRQ.
> 
> The reason our virtio is so slow today is, as you point out, we process
> the I/O sequentially with the queue notify back in vmd. To this end, there
> are three "performance overhead components" involved:
> 
> 1. the amount of time it takes to return from the vmm run ioctl back to
>vmd
> 2. the time it takes to process the events in the queue
> 3. the time it takes to re-enter the vm back to the point after the OUT
>instruction that did the queue notify.
> 
> I've benchmarked this before; the biggest performance penalty here is in
> #2. The exit and re-entry overheads are minuscule compared to the overhead
> of the I/O itself.

I think that, in addition to the absolute number of cycles spent on each
approach, we should also consider where they are being spent.

The kickfd approach is probably more expensive (ignoring TLB flushes,
which may end up making it cheaper) than having the vCPU do the actual
I/O, but it allows the latter to return much earlier to the Guest,
resulting in a significantly lower submission latency and more CPU time
available for the VM.

> Optimizing #2 is therefore the most bang for the buck. We (dlg@ and I)
> had a taskq-based diff to do this previously (I'm not sure what state it
> was left in). In that diff, the I/O was processed in a set of worker threads,
> effectively eliminating the overhead for #2.

I had to partially rewrite the vioblk_notifyq function, as it didn't
play nice with concurrent access (Guest + iothread), but I haven't
implemented any optimization for #2.

There's the option of merging requests and using worker threads to
avoid having the iothread actively waiting for the I/O, but this
would significantly increase the complexity of vmd's virtio code, so it
must be done with care (it's the problem behind *many* qemu issues).
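To make the concurrency point concrete, the shape of the problem is
roughly the following (a sketch only; vioblk_dev, vioblk_process_queue
and the mutex are illustrative names, not the actual vmd code):

/*
 * Both the legacy queue-notify exit path (vCPU thread) and the kickfd
 * iothread may walk the same virtqueue, so updates to the avail/used
 * indices have to be serialized. Names here are hypothetical.
 */
#include <pthread.h>

struct vioblk_dev {
	pthread_mutex_t	 vq_mtx;	/* serializes virtqueue access */
	/* ... virtqueue state, backing file descriptor, etc. ... */
};

/* Called from the vCPU exit path and from the kickfd iothread. */
static void
vioblk_process_queue(struct vioblk_dev *dev)
{
	pthread_mutex_lock(&dev->vq_mtx);
	/*
	 * Walk the avail ring, issue the I/O (synchronously here; request
	 * merging or worker threads would hook in at this point), then
	 * fill the used ring.
	 */
	pthread_mutex_unlock(&dev->vq_mtx);
	/* Assert the IRQ outside the critical section if work was done. */
}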

> Your diff (if I read it correctly), optimizes all 3 cases, and that's good.
> I'll admit it's not a way I thought of solving the performance issue, and
> I'm not sure if this is the right way or if the taskq way is better. The
> taskq way is a bit cleaner and easier to understand (there is no separate
> backchannel from vmm->vmd, but it doesn't look too crazy from what I
> read below).
> 
> Do you have some performance measurements with this diff applied vs without?

Yes, I ran some random read and random write tests with fio. The Guest
is Linux, mainly because its virtio implementation is more mature. Next
week I'll try with OpenBSD.

In a general sense, the kickfd approach is similar or slightly better
than the current one when generating I/O from just one process, and
significantly better (up to 2x the throughput) with 2 and 4 processes.

The test machine is an AMD Phenom II X4 (yeah, not the best thing for
this kind of test, but the only non-virtual environment I have here)
with a Samsung SSD Evo 850. OpenBSD's kernel is running with HZ=1000 and
Linux's with HZ=100 (otherwise I couldn't get reliable numbers due to
missing ticks). The target device is backed by a raw image.

I'm pasting the raw output for fio below.

Sergio (slp).


==================
| Linux + kickfd |
==================

 - randread

[root@localhost ~]# fio --clocksource=clock_gettime --name=randread 
--rw=randread --bs=4k --filename=/dev/vdb --size=1g --numjobs=1 --ioengine=sync 
--iodepth=1 --direct=1 --group_reporting 
randread: (g=0): rw=randread, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 
4096B-4096B, ioengine=sync, iodepth=1
fio-3.3
Starting 1 process
Jobs: 1 (f=1): [r(1)][95.7%][r=49.3MiB/s,w=0KiB/s][r=12.6k,w=0 IOPS][eta 
00m:01s]
randread: (groupid=0, jobs=1): err= 0: pid=318: Fri Oct 19 18:36:02 2018
   read: IOPS=11.5k, BW=44.8MiB/s (46.0MB/s)(1024MiB/22850msec)
clat (nsec): min=0, max=2k, avg=85372.92, stdev=924160.90
 lat (nsec): min=0, max=2k, avg=85525.51, stdev=924971.98
clat percentiles (nsec):
 |  1.00th=[   0],  5.00th=[   0], 10.00th=[   0],
 | 20.00th=[   0], 30.00th=[   0], 40.00th=[   0],
 | 50.00th=[   0], 60.00th=[   0], 70.00th=[   0],
 | 80

Re: vmd: servicing virtio devices from separate processes

2018-10-19 Thread Sergio Lopez
On Wed, Oct 17, 2018 at 02:04:46PM -0700, Mike Larkin wrote:
> On Fri, Oct 05, 2018 at 01:50:10PM +0200, Sergio Lopez wrote:
> > Right now, vmd already features an excellent privsep model to ensure
> > the process servicing the VM requests to the outside world is running
> > with the lowest possible privileges.
> > 
> > I was wondering if we could take a step further, servicing each virtio
> > device from a different process. This design would simplify the
> > implementation and maintenance of those devices, improve the privsep
> > model and increase the resilience of VMs (the crash of a process
> > servicing a device won't bring down the whole VM, and a mechanism to
> > recover from this scenario could be explored).
> 
> Our model is generally to not try to recover from crashes like that. Indeed,
> you *want* to crash so the underlying bug can be found and fixed.

With separate processes you'll still have a crash with a core dump from
the process servicing the device, with the additional advantage of being
able to, optionally, try to recover the device or debug it independently.

> >  - An in-kernel IRQ chip. This one is the most controversial, as it
> > means emulating a device from a privileged domain, but I'm pretty sure
> > a lapic implementation with enough functionality to serve *BSD/Linux
> > Guests can be small and simple enough to be easily auditable.
> 
> This needs to be done, for a number of reasons (device emulation being just
> one). pd@ and I are working on how to implement this using features in
> recent CPUs, since much of the LAPIC emulation can now be handled by the
> CPU itself. We're thinking skylake and later will be the line in the sand
> for this. Otherwise the software emulation is more complex and more prone
> to bugs. I've resisted the urge to put this stuff in the kernel for exactly
> that reason, but with later model CPUs we may be in better shape. We may
> also decide to focus solely on x2APIC. If you're interested in helping in
> this area, I'll keep you in the loop.

Sure, I'd like to help. I'm quite familiar with KVM's in-kernel irqchip
implementation.

> > Do you think it's worth exploring this model? What are feelings
> > regarding the in-kernel IRQ chip?
> > 
> > Sergio (slp).
> > 
> 
> All things considered, I'm less sold on the idea of splitting out devices
> into their own processes. I don't see any compelling reason. But we do
> need an IOAPIC and LAPIC implementation at some point, as you point out.

Well, vmd is still small enough (thankfully) to be able to debug it
easily. But, eventually, it'll grow in both size and complexity
(especially with SMP support, I/O optimizations, additional storage
backends...) and having a high degree of modularity really helps here.

In fact, the QEMU/KVM community is starting to consider going this route
(but QEMU is *huge*). [1]

Anyways, this is not a "now-or-never" thing. As you suggest, we can work
now on kickfd and IOAPIC/LAPIC, which are useful by themselves. And when
those are in place, I'll be able to write a PoC so we can evaluate its
benefits and drawbacks.

Sergio (slp).


[1] https://www.linux-kvm.org/images/f/fc/KVM_FORUM_multi-process.pdf



Re: vmd: servicing virtio devices from separate processes

2018-10-11 Thread Sergio Lopez
On Thu, 2018-10-11 at 09:02 -0700, Mike Larkin wrote:
> On Wed, Oct 10, 2018 at 05:04:34PM +0200, Sergio Lopez wrote:
> > On Mon, 2018-10-08 at 09:58 -0700, Mike Larkin wrote:
> > > On Fri, Oct 05, 2018 at 01:50:10PM +0200, Sergio Lopez wrote:
> > > > Hi,
> > > > 
> > > > I have an idea in mind that I'd like to share, to ask you whether
> > > > you think it's worth giving it a try.
> > > > 
> > > > Right now, vmd already features an excellent privsep model to ensure
> > > > the process servicing the VM requests to the outside world is running
> > > > with the lowest possible privileges.
> > > > 
> > > > I was wondering if we could take a step further, servicing each virtio
> > > > device from a different process. This design would simplify the
> > > > implementation and maintenance of those devices, improve the privsep
> > > > model and increase the resilience of VMs (the crash of a process
> > > > servicing a device won't bring down the whole VM, and a mechanism to
> > > > recover from this scenario could be explored).
> > > > 
> > > > Doing this in an efficient way requires:
> > > > 
> > > >  - The ability to receive virtqueue notifications directly on the
> > > > process. I've already sent an RFC patch for this (see "Lightweight
> > > > mechanism to kick virtio VQs"), as it'd be useful for the non-separated
> > > > model too.
> > > > 
> > > >  - An in-kernel IRQ chip. This one is the most controversial, as it
> > > > means emulating a device from a privileged domain, but I'm pretty sure
> > > > a lapic implementation with enough functionality to serve *BSD/Linux
> > > > Guests can be small and simple enough to be easily auditable.
> > > > 
> > > >  - A way to map the VM memory into a third process context. Could
> > > > uvm_share be used for this? Here's also the opportunity to explore
> > > > options to avoid mapping the whole VM memory, though that'll possibly
> > > > require functionality not covered by the virtio specs.
> > > > 
> > > > Do you think it's worth exploring this model? What are feelings
> > > > regarding the in-kernel IRQ chip?
> > > > 
> > > > Sergio (slp).
> > > > 
> > > 
> > > Lots of things to read through in this and the attached diff. I'll try to
> > > catch up and reply as soon as I can.
> > 
> > Ack. Let me know if you need me to split it into different patches, or if
> > you want to take a look at its use on vmd (the actual patch for vmd
> > still needs polishing, but works).
> > 
> > Sergio (slp).
> > 
> 
> I'm not sure if splitting this into a separate process or using a taskq is
> the right way to go; dlg@ had done the latter before, but we dropped that
> for some reason.

Well, it's not a question of one thing or the other. It'd be possible
to have some devices serviced by code embedded in vmd, as it is today,
and others serviced by separate processes.

A nice use case for having a separate process servicing a device could
be implementing a virtio_blk disk with a complex backend, like ceph or
glusterfs. Not only would you avoid linking vmd against
librbd/libgfapi, you would also be able to attach gdb to it and inspect
it without halting the whole VM. Perhaps you could even restart the
process (hot-plug/unplug?).

On the other hand, the underlying features (kickfd/in-kernel lapic) are
useful on their own, even with all devices being served by vmd,
providing lower latency, fewer vmexits, and parallel execution of I/O
requests. The only drawback is enlarging the kernel a bit, but I guess
this could be left out of the GENERIC* configs by default.

IMHO this is a win-win scenario.

Sergio (slp).



Re: vmd: servicing virtio devices from separate processes

2018-10-10 Thread Sergio Lopez
On Mon, 2018-10-08 at 09:58 -0700, Mike Larkin wrote:
> On Fri, Oct 05, 2018 at 01:50:10PM +0200, Sergio Lopez wrote:
> > Hi,
> > 
> > I have an idea in mind that I'd like to share, to ask you whether you
> > think it's worth giving it a try.
> > 
> > Right now, vmd already features an excellent privsep model to ensure
> > the process servicing the VM requests to the outside world is running
> > with the lowest possible privileges.
> > 
> > I was wondering if we could take a step further, servicing each virtio
> > device from a different process. This design would simplify the
> > implementation and maintenance of those devices, improve the privsep
> > model and increase the resilience of VMs (the crash of a process
> > servicing a device won't bring down the whole VM, and a mechanism to
> > recover from this scenario could be explored).
> > 
> > Doing this in an efficient way requires:
> > 
> >  - The ability to receive virtqueue notifications directly on the
> > process. I've already sent an RFC patch for this (see "Lightweight
> > mechanism to kick virtio VQs"), as it'd be useful for the non-separated
> > model too.
> > 
> >  - An in-kernel IRQ chip. This one is the most controversial, as it
> > means emulating a device from a privileged domain, but I'm pretty sure
> > a lapic implementation with enough functionality to serve *BSD/Linux
> > Guests can be small and simple enough to be easily auditable.
> > 
> >  - A way to map the VM memory into a third process context. Could
> > uvm_share be used for this? Here's also the opportunity to explore options
> > to avoid mapping the whole VM memory, though that'll possibly require
> > functionality not covered by the virtio specs.
> > 
> > Do you think it's worth exploring this model? What are feelings
> > regarding the in-kernel IRQ chip?
> > 
> > Sergio (slp).
> > 
> 
> Lots of things to read through in this and the attached diff. I'll try to
> catch up and reply as soon as I can.

Ack. Let me know if you need me to split it into different patches, or if
you want to take a look at its use on vmd (the actual patch for vmd
still needs polishing, but works).

Sergio (slp).



vmd: servicing virtio devices from separate processes

2018-10-05 Thread Sergio Lopez
Hi,

I have an idea in mind that I'd like to share, to ask you whether you
think it's worth giving it a try.

Right now, vmd already features an excellent privsep model to ensure
the process servicing the VM requests to the outside world is running
with the lowest possible privileges.

I was wondering if we could take a step further, servicing each virtio
device from a different process. This design would simplify the
implementation and maintenance of those devices, improve the privsep
model and increase the resilience of VMs (the crash of a process
servicing a device won't bring down the whole VM, and a mechanism to
recover from this scenario could be explored).

Doing this in an efficient way requires:

 - The ability to receive virtqueue notifications directly on the
process. I've already sent an RFC patch for this (see "Lightweight
mechanism to kick virtio VQs"), as it'd be useful for the non-separated
model too.

 - An in-kernel IRQ chip. This one is the most controversial, as it
means emulating a device from a privileged domain, but I'm pretty sure
a lapic implementation with enough functionality to serve *BSD/Linux
Guests can be small and simple enough to be easily auditable.

 - A way to map the VM memory into a third process context. Could
uvm_share be used for this? Here's also the opportunity to explore options
to avoid mapping the whole VM memory, though that'll possibly require
functionality not covered by the virtio specs.
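To make that last point a bit more concrete, here is a kernel-side sketch
of what I have in mind, modeled on how vmm(4) already shares the creating
process's ranges into the vm_map. Whether uvm_share() works for an
unrelated third process (and how the destination addresses get negotiated)
is exactly the open question; the helper name, the dstva array and the
struct field names are illustrative only:

/*
 * Sketch only: share every guest memory range from the VM's vm_map into
 * a device process's address space. Error handling and address choice
 * are hand-waved; field names follow vmm.c but are not guaranteed.
 */
int
vm_share_memory(struct vm *vm, struct proc *dst_p, vaddr_t *dstva)
{
	struct vm_mem_range *vmr;
	size_t i;
	int ret;

	for (i = 0; i < vm->vm_nmemranges; i++) {
		vmr = &vm->vm_memranges[i];
		ret = uvm_share(&dst_p->p_vmspace->vm_map, dstva[i],
		    PROT_READ | PROT_WRITE, vm->vm_map,
		    vmr->vmr_gpa, vmr->vmr_size);
		if (ret)
			return (ret);
	}
	return (0);
}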

Do you think it's worth exploring this model? What are feelings
regarding the in-kernel IRQ chip?

Sergio (slp).



[RFC PATCH] [vmm] Lightweight mechanism to kick virtio VQs

2018-10-05 Thread Sergio Lopez
Hi,

This patch implements a mechanism to allow users to register an I/O port
with a special file descriptor (kickfd) which can be monitored for
events using kevent. The kernel will note an event each time the Guest
writes to an I/O port registered with a kickfd.

This is mainly intended for kicking virtio virtqueues, and allows the
VCPU to return immediately to the Guest, instead of having to exit all
the way to vmd to process the request. With kickfd, the actual work
(i.e. an I/O operation) requested by the Guest can be done in a
parallel thread that eventually asserts the corresponding IRQ.
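For clarity, this is roughly how I expect vmd to consume it (a userland
sketch, not part of the patch; the field names of struct
vm_regkickfd_params are assumptions, since its definition isn't included
in this excerpt, and the port number is just an example):

/*
 * Register a kickfd for a queue-notify I/O port and block on it with
 * kevent(2). VMM_IOC_REGKICKFD and struct vm_regkickfd_params come from
 * the patched <machine/vmmvar.h>; the vrkp_* field names are assumed.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/ioctl.h>
#include <machine/vmmvar.h>
#include <err.h>
#include <stdint.h>
#include <string.h>

static int
register_kickfd(int vmm_fd, uint32_t vm_id, uint16_t ioport)
{
	struct vm_regkickfd_params vrkp;

	memset(&vrkp, 0, sizeof(vrkp));
	vrkp.vrkp_vm_id = vm_id;	/* assumed field names */
	vrkp.vrkp_ioport = ioport;
	if (ioctl(vmm_fd, VMM_IOC_REGKICKFD, &vrkp) == -1)
		err(1, "VMM_IOC_REGKICKFD");
	return (vrkp.vrkp_fd);		/* kickfd filled in by the kernel */
}

static void
iothread_wait_for_kicks(int kickfd)
{
	struct kevent ev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&ev, kickfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");

	for (;;) {
		/* Blocks until the Guest writes to the registered port. */
		if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
			err(1, "kevent: wait");
		/* Process the virtqueue and assert the IRQ from here. */
	}
}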

In a general sense, this improves I/O performance when the Guest is
issuing asynchronous operations (or its virtio implementation is able
to access the virtqueue asynchronously) and makes it more responsive
(as the VCPU spends less time outside the Guest), but it may hurt a bit
when the Guest is actively waiting for the operation to finish. This is
also a first step towards the possibility of having virtio devices
running in separate processes, but I'll write more about this in another
email thread.

I deliberately omitted the i386 implementation to make this easier to
review.

Sergio (slp).


Index: arch/amd64/amd64/vmm.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
retrieving revision 1.220
diff -u -p -r1.220 vmm.c
--- arch/amd64/amd64/vmm.c  20 Sep 2018 14:32:59 -  1.220
+++ arch/amd64/amd64/vmm.c  5 Oct 2018 09:54:34 -
@@ -20,6 +20,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -63,6 +66,16 @@ void *l1tf_flush_region;
 #define VMX_EXIT_INFO_COMPLETE \
 (VMX_EXIT_INFO_HAVE_RIP | VMX_EXIT_INFO_HAVE_REASON)
 
+struct kickfd {
+   uint16_t ioport;
+   struct klist note;
+   struct vm*vm;
+
+   SLIST_ENTRY(kickfd)  kickfd_link;
+};
+
+SLIST_HEAD(kickfd_head, kickfd);
+
 struct vm {
vm_map_t vm_map;
uint32_t vm_id;
@@ -77,6 +90,9 @@ struct vm {
u_intvm_vcpus_running;
struct rwlockvm_vcpu_lock;
 
+   struct kickfd_head   vm_kickfd_list;
+   struct rwlockvm_kickfd_lock;
+
SLIST_ENTRY(vm)  vm_link;
 };
 
@@ -122,6 +138,7 @@ int vm_get_info(struct vm_info_params *)
 int vm_resetcpu(struct vm_resetcpu_params *);
 int vm_intr_pending(struct vm_intr_params *);
 int vm_rwregs(struct vm_rwregs_params *, int);
+int vm_register_kickfd(struct vm_regkickfd_params *, struct proc *);
 int vm_find(uint32_t, struct vm **);
 int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
 int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
@@ -281,6 +298,7 @@ const struct {
 /* Pools for VMs and VCPUs */
 struct pool vm_pool;
 struct pool vcpu_pool;
+struct pool kickfd_pool;
 
 struct vmm_softc *vmm_softc;
 
@@ -419,6 +437,8 @@ vmm_attach(struct device *parent, struct
"vmpool", NULL);
pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
"vcpupl", NULL);
+   pool_init(&kickfd_pool, sizeof(struct kickfd), 64, IPL_NONE, PR_WAITOK,
+   "kickfdpl", NULL);
 
vmm_softc = sc;
 }
@@ -482,6 +502,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t 
case VMM_IOC_WRITEREGS:
ret = vm_rwregs((struct vm_rwregs_params *)data, 1);
break;
+   case VMM_IOC_REGKICKFD:
+   ret = vm_register_kickfd((struct vm_regkickfd_params *)data, p);
+   break;
default:
DPRINTF("%s: unknown ioctl code 0x%lx\n", __func__, cmd);
ret = ENOTTY;
@@ -513,6 +536,7 @@ pledge_ioctl_vmm(struct proc *p, long co
case VMM_IOC_INTR:
case VMM_IOC_READREGS:
case VMM_IOC_WRITEREGS:
+   case VMM_IOC_REGKICKFD:
return (0);
}
 
@@ -725,6 +749,176 @@ vm_rwregs(struct vm_rwregs_params *vrwp,
}
 }
 
+void
+filt_vmmkickfd_detach(struct knote *kn)
+{
+   struct kickfd *kickfd = kn->kn_fp->f_data;
+
+   rw_enter_write(&kickfd->vm->vm_kickfd_lock);
+   SLIST_REMOVE(&kickfd->note, kn, knote, kn_selnext);
+   rw_exit_write(&kickfd->vm->vm_kickfd_lock);
+}
+
+int
+filt_vmmkickfd_read(struct knote *kn, long hint)
+{
+   return(1);
+}
+
+struct filterops vmmkickfd_filtops =
+   { 1, NULL, filt_vmmkickfd_detach, filt_vmmkickfd_read };
+
+int
+vmmkickfd_read(struct file *fp, struct uio *uio, int fflags)
+{
+   return (EINVAL);
+}
+
+int
+vmmkickfd_write(struct file *fp, struct uio *uio, int fflags)
+{
+   return (EINVAL);
+}
+
+int
+vmmkickfd_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
+{
+   return (ENOTTY);
+}
+
+int
+vmmkickfd_poll(struct file *fp, int events, struct proc *p)
+{
+   return (EINVAL);
+}
+
+int
+vmmkickfd_kqfilter(struct file *fp, struct