On Fri, Oct 05, 2018 at 12:07:13PM +0200, Sergio Lopez wrote:
> Hi,
> 
> This patch implements a mechanism to allow users register an I/O port
> with a special file descriptor (kickfd) which can be monitored for
> events using kevent. The kernel will note an event each time the Guest
> writes to an I/O port registered with a kickfd.

> 
> This is mainly intended for kicking virtio virtqueues, and allows the
> VCPU to return immediately to the Guest, instead of having to exit all
> the way to vmd to process the request. With kickfd, the actual work
> (i.e. an I/O operation) requested by the Guest can be done in a
> parallel thread that eventually asserts the corresponding IRQ.

The reason our virtio is so slow today is, as you point out, that we
process the I/O sequentially with the queue notify back in vmd. To this
end, there are three "performance overhead components" involved:

1. the amount of time it takes to return from the vmm run ioctl back to
   vmd
2. the time it takes to process the events in the queue
3. the time it takes to re-enter the vm back to the point after the OUT
   instruction that did the queue notify.

I've benchmarked this before; the biggest performance penalty here is in
#2. The exit and re-entry overheads are minuscule compared to the overhead
of the I/O itself.

Optimizing #2 is therefore the most bang for the buck. We (dlg@ and I)
had a taskq-based diff to do this previously (I'm not sure what state it
was left in). In that diff, the I/O was processed in a set of worker threads,
effectively eliminating the overhead for #2.

Your diff (if I read it correctly), optimizes all 3 cases, and that's good.
I'll admit it's not a way I thought of solving the performance issue, and
I'm not sure if this is the right way or if the taskq way is better. The
taskq way is a bit cleaner and easier to understand (there is no separate
backchannel from vmm->vmd, but it doesn't look too crazy from what I
read below).

Do you have some performance measurements with this diff applied vs without?

PS don't worry about i386.

-ml

> 
> In a general sense, this improves the I/O performance when the Guest is
> issuing asynchronous operations (or its virtio implementation is able
> to access the virtqueue asynchronously) and makes it more responsive
> (as the VCPU spends less time outside the Guest), but it may hurt a bit
> when the Guest is actively waiting for the operation to finish. This is
> also a first step towards the possibility of having virtio devices
> running on separate processes, but I'll write more about this in another
> email thread.
> 
> I deliberately omitted the i386 implementation to make this easier to
> review.
> 
> Sergio (slp).
> 
> 
> Index: arch/amd64/amd64/vmm.c
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
> retrieving revision 1.220
> diff -u -p -r1.220 vmm.c
> --- arch/amd64/amd64/vmm.c    20 Sep 2018 14:32:59 -0000      1.220
> +++ arch/amd64/amd64/vmm.c    5 Oct 2018 09:54:34 -0000
> @@ -20,6 +20,9 @@
>  #include <sys/signalvar.h>
>  #include <sys/malloc.h>
>  #include <sys/device.h>
> +#include <sys/fcntl.h>
> +#include <sys/file.h>
> +#include <sys/filedesc.h>
>  #include <sys/pool.h>
>  #include <sys/proc.h>
>  #include <sys/user.h>
> @@ -63,6 +66,16 @@ void *l1tf_flush_region;
>  #define VMX_EXIT_INFO_COMPLETE                               \
>      (VMX_EXIT_INFO_HAVE_RIP | VMX_EXIT_INFO_HAVE_REASON)
>  
> +struct kickfd {
> +     uint16_t                 ioport;
> +     struct klist             note;
> +     struct vm                *vm;
> +
> +     SLIST_ENTRY(kickfd)      kickfd_link;
> +};
> +
> +SLIST_HEAD(kickfd_head, kickfd);
> +
>  struct vm {
>       vm_map_t                 vm_map;
>       uint32_t                 vm_id;
> @@ -77,6 +90,9 @@ struct vm {
>       u_int                    vm_vcpus_running;
>       struct rwlock            vm_vcpu_lock;
>  
> +     struct kickfd_head       vm_kickfd_list;
> +     struct rwlock            vm_kickfd_lock;
> +
>       SLIST_ENTRY(vm)          vm_link;
>  };
>  
> @@ -122,6 +138,7 @@ int vm_get_info(struct vm_info_params *)
>  int vm_resetcpu(struct vm_resetcpu_params *);
>  int vm_intr_pending(struct vm_intr_params *);
>  int vm_rwregs(struct vm_rwregs_params *, int);
> +int vm_register_kickfd(struct vm_regkickfd_params *, struct proc *);
>  int vm_find(uint32_t, struct vm **);
>  int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
>  int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
> @@ -281,6 +298,7 @@ const struct {
>  /* Pools for VMs and VCPUs */
>  struct pool vm_pool;
>  struct pool vcpu_pool;
> +struct pool kickfd_pool;
>  
>  struct vmm_softc *vmm_softc;
>  
> @@ -419,6 +437,8 @@ vmm_attach(struct device *parent, struct
>           "vmpool", NULL);
>       pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
>           "vcpupl", NULL);
> +     pool_init(&kickfd_pool, sizeof(struct kickfd), 64, IPL_NONE, PR_WAITOK,
> +         "kickfdpl", NULL);
>  
>       vmm_softc = sc;
>  }
> @@ -482,6 +502,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t 
>       case VMM_IOC_WRITEREGS:
>               ret = vm_rwregs((struct vm_rwregs_params *)data, 1);
>               break;
> +     case VMM_IOC_REGKICKFD:
> +             ret = vm_register_kickfd((struct vm_regkickfd_params *)data, p);
> +             break;
>       default:
>               DPRINTF("%s: unknown ioctl code 0x%lx\n", __func__, cmd);
>               ret = ENOTTY;
> @@ -513,6 +536,7 @@ pledge_ioctl_vmm(struct proc *p, long co
>       case VMM_IOC_INTR:
>       case VMM_IOC_READREGS:
>       case VMM_IOC_WRITEREGS:
> +     case VMM_IOC_REGKICKFD:
>               return (0);
>       }
>  
> @@ -725,6 +749,176 @@ vm_rwregs(struct vm_rwregs_params *vrwp,
>       }
>  }
>  
> +void
> +filt_vmmkickfd_detach(struct knote *kn)
> +{
> +     struct kickfd *kickfd = kn->kn_fp->f_data;
> +
> +     rw_enter_write(&kickfd->vm->vm_kickfd_lock);
> +     SLIST_REMOVE(&kickfd->note, kn, knote, kn_selnext);
> +     rw_exit_write(&kickfd->vm->vm_kickfd_lock);
> +}
> +
> +int
> +filt_vmmkickfd_read(struct knote *kn, long hint)
> +{
> +     return(1);
> +}
> +
> +struct filterops vmmkickfd_filtops =
> +     { 1, NULL, filt_vmmkickfd_detach, filt_vmmkickfd_read };
> +
> +int
> +vmmkickfd_read(struct file *fp, struct uio *uio, int fflags)
> +{
> +     return (EINVAL);
> +}
> +
> +int
> +vmmkickfd_write(struct file *fp, struct uio *uio, int fflags)
> +{
> +     return (EINVAL);
> +}
> +
> +int
> +vmmkickfd_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
> +{
> +     return (ENOTTY);
> +}
> +
> +int
> +vmmkickfd_poll(struct file *fp, int events, struct proc *p)
> +{
> +     return (EINVAL);
> +}
> +
> +int
> +vmmkickfd_kqfilter(struct file *fp, struct knote *kn)
> +{
> +     struct kickfd *kickfd = kn->kn_fp->f_data;
> +
> +     switch (kn->kn_filter) {
> +     case EVFILT_READ:
> +             kn->kn_fop = &vmmkickfd_filtops;
> +             rw_enter_write(&kickfd->vm->vm_kickfd_lock);
> +             SLIST_INSERT_HEAD(&kickfd->note, kn, kn_selnext);
> +             rw_exit_write(&kickfd->vm->vm_kickfd_lock);
> +             break;
> +     default:
> +             return (EINVAL);
> +     }
> +
> +     return (0);
> +}
> +
> +int
> +vmmkickfd_stat(struct file *fp, struct stat *ub, struct proc *p)
> +{
> +     return (EINVAL);
> +}
> +
> +int
> +vmmkickfd_close(struct file *fp, struct proc *p)
> +{
> +     struct kickfd *kickfd = fp->f_data;
> +
> +     rw_enter_write(&kickfd->vm->vm_kickfd_lock);
> +     SLIST_REMOVE(&kickfd->vm->vm_kickfd_list, kickfd, kickfd, kickfd_link);
> +     rw_exit_write(&kickfd->vm->vm_kickfd_lock);
> +     fp->f_ops = NULL;
> +     fp->f_data = NULL;
> +     pool_put(&kickfd_pool, kickfd);
> +
> +     return (0);
> +}
> +
> +int
> +vmmkickfd_seek(struct file *fp, off_t *offset, int whence, struct proc *p)
> +{
> +     return (EINVAL);
> +}
> +
> +static struct fileops vmmkickfdops = {
> +     .fo_read        = vmmkickfd_read,
> +     .fo_write       = vmmkickfd_write,
> +     .fo_ioctl       = vmmkickfd_ioctl,
> +     .fo_poll        = vmmkickfd_poll,
> +     .fo_kqfilter    = vmmkickfd_kqfilter,
> +     .fo_stat        = vmmkickfd_stat,
> +     .fo_close       = vmmkickfd_close,
> +     .fo_seek        = vmmkickfd_seek,
> +};
> +
> +/*
> + * vm_register_kickfd
> + *
> + * IOCTL handler to register an I/O port for which the user wants to
> + * receive a notification via kevent on a file descriptor, instead of
> + * exiting to vmd.
> + *
> + * Parameters:
> + *   vrkp: Describes the VM and the I/O port to register with a kickfd.
> + *    The file descriptor assigned to the kickfd is also returned here.
> + *   p: process requesting the I/O port to kickfd registration
> + *
> + * Return values:
> + *  0: if successful
> + *  ENOENT: if the VM defined by 'vrkp' cannot be found
> + *  EPERM: if the vm cannot be accessed from the calling process
> + *  ENFILE: if the process file table is full
> + *  EMFILE: if the process has reached its file descriptor limit
> + */
> +int
> +vm_register_kickfd(struct vm_regkickfd_params *vrkp, struct proc *p)
> +{
> +     struct filedesc *fdp = p->p_fd;
> +     struct file *fl;
> +     struct kickfd *kickfd = NULL;
> +     struct vm *vm = NULL;
> +     int fd, error;
> +
> +     /* Find the desired VM */
> +     rw_enter_read(&vmm_softc->vm_lock);
> +     error = vm_find(vrkp->vrkp_vm_id, &vm);
> +     rw_exit_read(&vmm_softc->vm_lock);
> +
> +     /* Not found? exit. */
> +     if (error != 0) {
> +             DPRINTF("%s: vm id %u not found\n", __func__,
> +                 vrkp->vrkp_vm_id);
> +             return (error);
> +     }
> +
> +     /* Allocate a new kickfd and attach it to the file descriptor */
> +     kickfd = pool_get(&kickfd_pool, PR_WAITOK | PR_ZERO);
> +     kickfd->ioport = vrkp->vrkp_ioport;
> +     kickfd->vm = vm;
> +
> +     fdplock(fdp);
> +     error = falloc(p, &fl, &fd);
> +     if (error != 0)
> +             goto free;
> +     fl->f_flag = FREAD | FWRITE | FNONBLOCK;
> +     fl->f_type = DTYPE_KICKFD;
> +     fl->f_data = kickfd;
> +     fl->f_ops = &vmmkickfdops;
> +
> +     fdinsert(fdp, fd, 0, fl);
> +     vrkp->vrkp_fd = fd;
> +     fdpunlock(fdp);
> +
> +     /* Add this kickfd to the list evaluated on each in/out exit */
> +     SLIST_INSERT_HEAD(&vm->vm_kickfd_list, kickfd, kickfd_link);
> +
> +     FRELE(fl, p);
> +     return (0);
> +
> +free:
> +     fdpunlock(fdp);
> +     pool_put(&kickfd_pool, kickfd);
> +     return (error);
> +}
> +
>  /*
>   * vm_find
>   *
> @@ -1080,6 +1274,8 @@ vm_create(struct vm_create_params *vcp, 
>       vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO);
>       SLIST_INIT(&vm->vm_vcpu_list);
>       rw_init(&vm->vm_vcpu_lock, "vcpulock");
> +     SLIST_INIT(&vm->vm_kickfd_list);
> +     rw_init(&vm->vm_kickfd_lock, "kickfdlock");
>  
>       vm->vm_creator_pid = p->p_p->ps_pid;
>       vm->vm_nmemranges = vcp->vcp_nmemranges;
> @@ -3953,6 +4149,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct v
>                       break;
>               case VMX_EXIT_XSETBV:
>                       break;
> +             case VM_EXIT_NONE:
> +                     break;
>  #ifdef VMM_DEBUG
>               case VMX_EXIT_TRIPLE_FAULT:
>                       DPRINTF("%s: vm %d vcpu %d triple fault\n",
> @@ -5083,6 +5281,8 @@ svm_handle_inout(struct vcpu *vcpu)
>       uint64_t insn_length, exit_qual;
>       int ret;
>       struct vmcb *vmcb = (struct vmcb *)vcpu->vc_control_va;
> +     struct vm *vm = vcpu->vc_parent;
> +     struct kickfd *kickfd;
>  
>       insn_length = vmcb->v_exitinfo2 - vmcb->v_rip;
>       if (insn_length != 1 && insn_length != 2) {
> @@ -5134,8 +5334,25 @@ svm_handle_inout(struct vcpu *vcpu)
>       case 0x500 ... 0x50f:
>       case 0xcf8:
>       case 0xcfc ... 0xcff:
> +             ret = EAGAIN;
> +             break;
>       case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
>               ret = EAGAIN;
> +             if (vcpu->vc_exit.vei.vei_dir != 0)
> +                     break;
> +
> +             /* Check if we have a kickfd registered for this ioport */
> +             rw_enter_read(&vm->vm_kickfd_lock);
> +             SLIST_FOREACH(kickfd, &vm->vm_kickfd_list, kickfd_link) {
> +                     if (vcpu->vc_exit.vei.vei_port == kickfd->ioport) {
> +                             KNOTE(&kickfd->note, 0);
> +                             /* Reset reason to avoid exiting to vmd */
> +                             vcpu->vc_gueststate.vg_exit_reason =
> +                                 VM_EXIT_NONE;
> +                             ret = 0;
> +                     }
> +             }
> +             rw_exit_read(&vm->vm_kickfd_lock);
>               break;
>       default:
>               /* Read from unsupported ports returns FFs */
> @@ -5174,6 +5391,8 @@ vmx_handle_inout(struct vcpu *vcpu)
>  {
>       uint64_t insn_length, exit_qual;
>       int ret;
> +     struct vm *vm = vcpu->vc_parent;
> +     struct kickfd *kickfd = NULL;
>  
>       if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
>               printf("%s: can't obtain instruction length\n", __func__);
> @@ -5227,8 +5446,25 @@ vmx_handle_inout(struct vcpu *vcpu)
>       case 0xcf8:
>       case 0xcfc ... 0xcff:
>       case 0x500 ... 0x50f:
> +             ret = EAGAIN;
> +             break;
>       case VMM_PCI_IO_BAR_BASE ... VMM_PCI_IO_BAR_END:
>               ret = EAGAIN;
> +             if (vcpu->vc_exit.vei.vei_dir != 0)
> +                     break;
> +
> +             /* Check if we have a kickfd registered for this ioport */
> +             rw_enter_read(&vm->vm_kickfd_lock);
> +             SLIST_FOREACH(kickfd, &vm->vm_kickfd_list, kickfd_link) {
> +                     if (vcpu->vc_exit.vei.vei_port == kickfd->ioport) {
> +                             KNOTE(&kickfd->note, 0);
> +                             /* Reset reason to avoid exiting to vmd */
> +                             vcpu->vc_gueststate.vg_exit_reason =
> +                                 VM_EXIT_NONE;
> +                             ret = 0;
> +                     }
> +             }
> +             rw_exit_read(&vm->vm_kickfd_lock);
>               break;
>       default:
>               /* Read from unsupported ports returns FFs */
> Index: arch/amd64/include/vmmvar.h
> ===================================================================
> RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v
> retrieving revision 1.59
> diff -u -p -r1.59 vmmvar.h
> --- arch/amd64/include/vmmvar.h       20 Sep 2018 14:32:59 -0000      1.59
> +++ arch/amd64/include/vmmvar.h       5 Oct 2018 09:54:35 -0000
> @@ -533,6 +533,13 @@ struct vm_rwregs_params {
>       struct vcpu_reg_state   vrwp_regs;
>  };
>  
> +struct vm_regkickfd_params {
> +     /* Input/output parameters to VM_IOC_REGKICKFD */
> +     uint32_t                vrkp_vm_id;
> +     uint16_t                vrkp_ioport;
> +     int                     vrkp_fd;
> +};
> +
>  /* IOCTL definitions */
>  #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */
>  #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */
> @@ -542,6 +549,7 @@ struct vm_rwregs_params {
>  #define VMM_IOC_INTR _IOW('V', 6, struct vm_intr_params) /* Intr pending */
>  #define VMM_IOC_READREGS _IOWR('V', 7, struct vm_rwregs_params) /* Get regs 
> */
>  #define VMM_IOC_WRITEREGS _IOW('V', 8, struct vm_rwregs_params) /* Set regs 
> */
> +#define VMM_IOC_REGKICKFD _IOWR('V', 9, struct vm_regkickfd_params) /* 
> Register kick FD */
>  
>  
>  /* CPUID masks */
> Index: sys/file.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/file.h,v
> retrieving revision 1.53
> diff -u -p -r1.53 file.h
> --- sys/file.h        20 Aug 2018 16:00:22 -0000      1.53
> +++ sys/file.h        5 Oct 2018 09:54:35 -0000
> @@ -79,6 +79,7 @@ struct file {
>  #define      DTYPE_PIPE      3       /* pipe */
>  #define      DTYPE_KQUEUE    4       /* event queue */
>  #define      DTYPE_DMABUF    5       /* DMA buffer (for DRM) */
> +#define      DTYPE_KICKFD    6       /* kickfd (for vmm) */
>       short   f_type;         /* [I] descriptor type */
>       u_int   f_count;        /* [a] reference count */
>       struct  ucred *f_cred;  /* [I] credentials associated with descriptor */
> 

Reply via email to