Re: vmm guests die during host's suspend/resume

Dave Voutila Mon, 21 Mar 2022 08:59:15 -0700


Dave Voutila <[email protected]> writes:


> Martin Pieuchot <[email protected]> writes:
>
>> I see the following in the dmesg:
>>
>> vcpu_run_vmx: failed vmresume for unknown reason
>> vcpu_run_vmx: error code = 5, VMRESUME: non-launched VMCS
>
> This is due to intel's vmx design. We need some handling to flush vmcs
> state on suspend and reload it on resume. This is on my backlog of
> things to clean up.
>

Here's a diff that fixes the issue for me on my x270. I've performed
limited testing on my AMD-based X13. Will probably send to tech@ later
today for more eyeballs and tests.

In short, it adds both a barrier and refcounting to the ioctl
handler. On device quiesce, we drain device users from the critical path
requiring access to guest state. For every vcpu, we vmclear (if Intel
host) where needed.

To handle hibernate, we also do a vmm_stop()/vmm_start(), which on Intel
means issuing vmxoff/vmxon instructions on each cpu. Normally we only do
this during vm creation/termination.

On wakeup, we reverse the process and notify any waiting device users
blocked by the barrier.


diff refs/heads/master refs/heads/vmm-suspend
blob - a195b5d247b957c1f5c12cb153ba81ed2810e89a
blob + 6a96d7b1297b231b0cffdd1dca933ebdf9ff2a5d
--- sys/arch/amd64/amd64/vmm.c
+++ sys/arch/amd64/amd64/vmm.c
@@ -25,6 +25,7 @@
 #include <sys/user.h>
 #include <sys/ioctl.h>
 #include <sys/queue.h>
+#include <sys/refcnt.h>
 #include <sys/rwlock.h>
 #include <sys/pledge.h>
 #include <sys/memrange.h>
@@ -88,6 +89,12 @@ SLIST_HEAD(vmlist_head, vm);
 struct vmm_softc {
        struct device           sc_dev;

+       /* Suspend/Resume Synchronization */
+       struct refcnt           sc_refcnt;
+       volatile unsigned int   sc_status;
+#define VMM_SUSPENDED          (unsigned int) 0
+#define VMM_ACTIVE             (unsigned int) 1
+
        /* Capabilities */
        uint32_t                nr_vmx_cpus;
        uint32_t                nr_svm_cpus;
@@ -115,9 +122,11 @@ void vmx_dump_vmcs_field(uint16_t, const char *);
 int vmm_enabled(void);
 int vmm_probe(struct device *, void *, void *);
 void vmm_attach(struct device *, struct device *, void *);
+int vmm_activate(struct device *, int);
 int vmmopen(dev_t, int, int, struct proc *);
 int vmmioctl(dev_t, u_long, caddr_t, int, struct proc *);
 int vmmclose(dev_t, int, int, struct proc *);
+int vmm_quiesce_vmx(void);
 int vmm_start(void);
 int vmm_stop(void);
 size_t vm_create_check_mem_ranges(struct vm_create_params *);
@@ -264,7 +273,7 @@ struct cfdriver vmm_cd = {
 };

 const struct cfattach vmm_ca = {
-       sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL, NULL
+       sizeof(struct vmm_softc), vmm_probe, vmm_attach, NULL, vmm_activate
 };

 /*
@@ -367,6 +376,12 @@ vmm_attach(struct device *parent, struct device *self,
        struct cpu_info *ci;
        CPU_INFO_ITERATOR cii;

+       sc->sc_status = VMM_ACTIVE;
+
+       /* We're in autoconf, so immediately release our refcnt. */
+       refcnt_init(&sc->sc_refcnt);
+       refcnt_rele(&sc->sc_refcnt);
+
        sc->nr_vmx_cpus = 0;
        sc->nr_svm_cpus = 0;
        sc->nr_rvi_cpus = 0;
@@ -441,6 +456,163 @@ vmm_attach(struct device *parent, struct device *self,
 }

 /*
+ * vmm_quiesce_vmx
+ *
+ * Vmclear every launched VMCS before host suspend. Returns 0 or first error.
+ */
+int
+vmm_quiesce_vmx(void)
+{
+       struct vm               *vm;
+       struct vcpu             *vcpu;
+       int                      err;
+
+       /*
+        * We should only be called from a quiescing device state, so every
+        * lock below is requested with RW_NOSLEEP. If any lock cannot be
+        * taken, something is wrong: stop walking and return that error.
+        */
+       if ((err = rw_enter(&vmm_softc->vm_lock, RW_WRITE | RW_NOSLEEP)))
+               return (err);
+
+       /* Iterate over each vm... */
+       SLIST_FOREACH(vm, &vmm_softc->vm_list, vm_link) {
+               if ((err = rw_enter(&vm->vm_vcpu_lock, RW_READ | RW_NOSLEEP)))
+                       break;
+
+               /* Iterate over each vcpu... */
+               SLIST_FOREACH(vcpu, &vm->vm_vcpu_list, vc_vcpu_link) {
+                       err = rw_enter(&vcpu->vc_lock, RW_WRITE | RW_NOSLEEP);
+                       if (err)
+                               break;
+
+                       /* We can skip unlaunched VMCS. Nothing to flush. */
+                       if (atomic_load_int(&vcpu->vc_vmx_vmcs_state)
+                           != VMCS_LAUNCHED) {
+                               DPRINTF("%s: skipping vcpu %d for vm %d\n",
+                                   __func__, vcpu->vc_id, vm->vm_id);
+                               rw_exit_write(&vcpu->vc_lock);
+                               continue;
+                       }
+
+                       if (vcpu->vc_last_pcpu != curcpu()) {
+                               /* Remote vmclear via ipi; state not set here. */
+                               err = vmx_remote_vmclear(vcpu->vc_last_pcpu,
+                                   vcpu);
+                               if (err)
+                                       printf("%s: failed to remote vmclear "
+                                           "vcpu %d of vm %d\n", __func__,
+                                           vcpu->vc_id, vm->vm_id);
+                       } else {
+                               /* Local vmclear; marked cleared even on error. */
+                               if ((err = vmclear(&vcpu->vc_control_pa)))
+                                       printf("%s: failed to locally vmclear "
+                                           "vcpu %d of vm %d\n", __func__,
+                                           vcpu->vc_id, vm->vm_id);
+                               atomic_swap_uint(&vcpu->vc_vmx_vmcs_state,
+                                   VMCS_CLEARED);
+                       }
+
+                       rw_exit_write(&vcpu->vc_lock);
+                       if (err)
+                               break;
+                       DPRINTF("%s: cleared vcpu %d for vm %d\n", __func__,
+                           vcpu->vc_id, vm->vm_id);
+               }
+               rw_exit_read(&vm->vm_vcpu_lock);
+               if (err)
+                       break;
+       }
+       rw_exit_write(&vmm_softc->vm_lock);
+
+       if (err)
+               return (err);
+       return (0);
+}
+
+/*
+ * vmm_activate: device activate hook; handles DVACT_QUIESCE and DVACT_WAKEUP.
+ */
+int
+vmm_activate(struct device *self, int act)
+{
+       struct cpu_info         *ci = curcpu();
+       unsigned int             old_state;
+
+       switch (act) {
+       case DVACT_QUIESCE:
+               /* Flip ACTIVE -> SUSPENDED so new vmmioctl() callers block. */
+               old_state = atomic_cas_uint(&vmm_softc->sc_status, VMM_ACTIVE,
+                   VMM_SUSPENDED);
+               if (old_state != VMM_ACTIVE) {
+                       printf("%s: invalid device state on quiesce (%d)\n",
+                           __func__, old_state);
+                       return (1);
+               }
+
+               /* Poll in 1ms sleeps until in-flight users drop their refs. */
+               while (refcnt_read(&vmm_softc->sc_refcnt) > 0) {
+                       if (tsleep_nsec(&vmm_softc, PPAUSE, "vmm",
+                           MSEC_TO_NSEC(1)) != EWOULDBLOCK) {
+                               printf("%s: interrupted while draining users\n",
+                                   __func__);
+                               goto quiesce_fail;
+                       }
+               }
+
+               /* If this cpu is not in vmm mode, nothing more to do. */
+               if ((ci->ci_flags & CPUF_VMM) == 0)
+                       break;
+
+               /* Intel systems need extra steps to sync vcpu state. */
+               if (vmm_softc->mode == VMM_MODE_EPT ||
+                   vmm_softc->mode == VMM_MODE_VMX)
+                       if (vmm_quiesce_vmx()) {
+                               printf("%s: failed to prepare vcpus for "
+                                   "host suspend\n", __func__);
+                               goto quiesce_fail;
+                       }
+
+               /* Stop virtualization mode on all cpus. */
+               if (vmm_stop()) {
+                       printf("%s: failed to stop vmm mode on quiesce\n",
+                           __func__);
+                       goto quiesce_fail;
+               }
+               break;
+
+       case DVACT_WAKEUP:
+               /* Restart virtualization mode on all cpus when vm_ct > 0. */
+               if (vmm_softc->vm_ct > 0 && vmm_start()) {
+                       printf("%s: failed to restart vmm mode on wakeup\n",
+                           __func__);
+                       return (1); /* NOTE(review): status stays SUSPENDED */
+               }
+
+               /* Set the device back to active. */
+               old_state = atomic_cas_uint(&vmm_softc->sc_status,
+                   VMM_SUSPENDED, VMM_ACTIVE);
+               if (old_state != VMM_SUSPENDED) {
+                       printf("%s: invalid device state on wakeup (%d)\n",
+                           __func__, old_state);
+                       return (1);
+               }
+
+               /* Notify any waiting device users. */
+               wakeup(&vmm_softc);
+               break;
+       }
+
+       return (0); /* success, or an action this hook does not handle */
+
+quiesce_fail:
+       /* Undo the suspend: mark the device active and wake any waiters. */
+       atomic_store_int(&vmm_softc->sc_status, VMM_ACTIVE);
+       wakeup(&vmm_softc);
+       return (1);
+}
+
+/*
  * vmmopen
  *
  * Called during open of /dev/vmm.
@@ -476,10 +648,24 @@ vmmopen(dev_t dev, int flag, int mode, struct proc *p)
 int
 vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p)
 {
-       int ret;
+       int                     ret;

        KERNEL_UNLOCK();

+       while (atomic_load_int(&vmm_softc->sc_status) != VMM_ACTIVE) {
+               /* Wait for the signal that we're running again. */
+               ret = tsleep_nsec(&vmm_softc, PWAIT | PCATCH, "vmm",
+                   MSEC_TO_NSEC(1));
+               if (ret != ERESTART && ret != EINTR && ret != EWOULDBLOCK
+                   && ret != 0) {
+                       printf("%s: unhandled wakeup (%d) for device\n",
+                           __func__, ret);
+                       ret = EBUSY;
+                       goto out;
+               }
+       }
+       refcnt_take(&vmm_softc->sc_refcnt);
+
        switch (cmd) {
        case VMM_IOC_CREATE:
                if ((ret = vmm_start()) != 0) {
@@ -524,6 +710,8 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag
                ret = ENOTTY;
        }

+       refcnt_rele(&vmm_softc->sc_refcnt);
+out:
        KERNEL_LOCK();

        return (ret);

Re: vmm guests die during host's suspend/resume

Reply via email to