This diff removes instability from VMX-based hosts by either removing
the possibility of the process sleeping while the VMCS is active or by
reloading the VMCS afterwards when we had no choice but to sleep.
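
The pattern used throughout is roughly the following (a condensed
sketch of what the diff does, not a drop-in snippet; the sleeping call
and the error handling vary per call site):

	/*
	 * Record where we were and drop the per-vcpu VMX mutex before
	 * any operation that may sleep; afterwards, re-take the mutex
	 * and reload the VMCS, since the thread may have migrated to
	 * another cpu while asleep.
	 */
	vcpu->vc_last_pcpu = curcpu();
	mtx_leave(&vcpu->vc_vmx_mtx);

	/* ... a call that may sleep, e.g. km_alloc(9) or uvm_fault(9) ... */

	mtx_enter(&vcpu->vc_vmx_mtx);
	if (vcpu_reload_vmcs_vmx(vcpu)) {
		printf("%s: failed to reload vmcs\n", __func__);
		return (EINVAL);
	}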

A mutex is added to guard the VMCS state, so testing with witness(4)
enabled has helped verify the diff.

The per-cpu rwlock originally used in the remote vmclear routine is
changed to a mutex accordingly, since rw_enter(9) can sleep on
contention while mtx_enter(9) spins.

This diff does not remove possible calls to printf(9) via the DPRINTF
macro, as that's part of the next diff.

One area of note: in vmx_load_pdptes() there's an XXX comment calling
out that, because of the printf(9) call on km_alloc(9) failure, the
VMCS is potentially no longer valid. The upcoming diff swapping
printf(9) for log(9) will remove that.
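
For reference, the error path in question looks roughly like this
(condensed from the vmx_load_pdptes() hunk below):

	/* We need to drop the mutex as km_alloc may sleep. */
	vcpu->vc_last_pcpu = curcpu();
	mtx_leave(&vcpu->vc_vmx_mtx);

	cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none,
	    &kd_waitok);
	if (!cr3_host_virt) {
		printf("%s: can't allocate address for guest CR3 mapping\n",
		    __func__);
		mtx_enter(&vcpu->vc_vmx_mtx);
		/* XXX VMCS may be bogus: we slept and never reloaded it. */
		return (ENOMEM);
	}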

ok?

-dv


diff refs/heads/master refs/heads/vmm-vmx-02
blob - cc189771bc76745cd391e9b2a58f46dd92aa32ce
blob + d7c6c592b12b236d14ade863fd1e75d8effc179c
--- sys/arch/amd64/amd64/cpu.c
+++ sys/arch/amd64/amd64/cpu.c
@@ -790,7 +790,7 @@ cpu_init_vmm(struct cpu_info *ci)
                    &ci->ci_vmxon_region_pa))
                        panic("Can't locate VMXON region in phys mem");
                ci->ci_vmcs_pa = VMX_VMCS_PA_CLEAR;
-               rw_init(&ci->ci_vmcs_lock, "vmcslock");
+               mtx_init(&ci->ci_vmcs_mtx, IPL_MPFLOOR);
        }
 }
 #endif /* NVMM > 0 */
blob - 2535558cce5f07f4d6a150ce53b100524801755a
blob + 76c08badb82e3a90c5b7ab75c1b349fdb3229737
--- sys/arch/amd64/amd64/vmm.c
+++ sys/arch/amd64/amd64/vmm.c
@@ -810,11 +810,13 @@ vm_rwregs(struct vm_rwregs_params *vrwp, int dir)

        rw_enter_write(&vcpu->vc_lock);
        if (vmm_softc->mode == VMM_MODE_VMX ||
-           vmm_softc->mode == VMM_MODE_EPT)
+           vmm_softc->mode == VMM_MODE_EPT) {
+               mtx_enter(&vcpu->vc_vmx_mtx);
                ret = (dir == 0) ?
                    vcpu_readregs_vmx(vcpu, vrwp->vrwp_mask, vrs) :
                    vcpu_writeregs_vmx(vcpu, vrwp->vrwp_mask, 1, vrs);
-       else if (vmm_softc->mode == VMM_MODE_SVM ||
+               mtx_leave(&vcpu->vc_vmx_mtx);
+       } else if (vmm_softc->mode == VMM_MODE_SVM ||
            vmm_softc->mode == VMM_MODE_RVI)
                ret = (dir == 0) ?
                    vcpu_readregs_svm(vcpu, vrwp->vrwp_mask, vrs) :
@@ -1370,7 +1372,7 @@ vmx_remote_vmclear(struct cpu_info *ci, struct vcpu *v
 {
        int ret = 0, nticks = 100000;

-       rw_enter_write(&ci->ci_vmcs_lock);
+       mtx_enter(&ci->ci_vmcs_mtx);
        atomic_swap_ulong(&ci->ci_vmcs_pa, vcpu->vc_control_pa);
        x86_send_ipi(ci, X86_IPI_VMCLEAR_VMM);

@@ -1383,7 +1385,7 @@ vmx_remote_vmclear(struct cpu_info *ci, struct vcpu *v
                }
        }
        atomic_swap_uint(&vcpu->vc_vmx_vmcs_state, VMCS_CLEARED);
-       rw_exit_write(&ci->ci_vmcs_lock);
+       mtx_leave(&ci->ci_vmcs_mtx);

        return (ret);
 }
@@ -1785,6 +1787,7 @@ vcpu_reload_vmcs_vmx(struct vcpu *vcpu)
        struct cpu_info *ci, *last_ci;

        rw_assert_wrlock(&vcpu->vc_lock);
+       MUTEX_ASSERT_LOCKED(&vcpu->vc_vmx_mtx);

        ci = curcpu();
        last_ci = vcpu->vc_last_pcpu;
@@ -1838,6 +1841,8 @@ vcpu_readregs_vmx(struct vcpu *vcpu, uint64_t regmask,
        struct vcpu_segment_info *sregs = vrs->vrs_sregs;
        struct vmx_msr_store *msr_store;

+       MUTEX_ASSERT_LOCKED(&vcpu->vc_vmx_mtx);
+
 #ifdef VMM_DEBUG
        /* VMCS should be loaded... */
        paddr_t pa = 0ULL;
@@ -2113,6 +2118,8 @@ vcpu_writeregs_vmx(struct vcpu *vcpu, uint64_t regmask
        struct vcpu_segment_info *sregs = vrs->vrs_sregs;
        struct vmx_msr_store *msr_store;

+       MUTEX_ASSERT_LOCKED(&vcpu->vc_vmx_mtx);
+
        if (loadvmcs) {
                if (vcpu_reload_vmcs_vmx(vcpu))
                        return (EINVAL);
@@ -2744,6 +2751,7 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg
        struct vmx_msr_store *msr_store;

        rw_assert_wrlock(&vcpu->vc_lock);
+       mtx_enter(&vcpu->vc_vmx_mtx);

        cr0 = vrs->vrs_crs[VCPU_REGS_CR0];

@@ -3028,12 +3036,25 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg
            IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
                if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
                    IA32_VMX_ENABLE_VPID, 1)) {
+
+                       /* We need to drop the mutex before acquiring a vpid */
+                       vcpu->vc_last_pcpu = curcpu();
+                       mtx_leave(&vcpu->vc_vmx_mtx);
+
                        if (vmm_alloc_vpid(&vpid)) {
                                DPRINTF("%s: could not allocate VPID\n",
                                    __func__);
                                ret = EINVAL;
+                               goto exit_no_mtx;
+                       }
+
+                       mtx_enter(&vcpu->vc_vmx_mtx);
+                       if (vcpu_reload_vmcs_vmx(vcpu)) {
+                               printf("%s: failed to reload vmcs\n", __func__);
+                               ret = EINVAL;
                                goto exit;
                        }
+
                        if (vmwrite(VMCS_GUEST_VPID, vpid)) {
                                DPRINTF("%s: error setting guest VPID\n",
                                    __func__);
@@ -3297,6 +3318,8 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg
        atomic_swap_uint(&vcpu->vc_vmx_vmcs_state, VMCS_CLEARED);

 exit:
+       mtx_leave(&vcpu->vc_vmx_mtx);
+exit_no_mtx:
        return (ret);
 }

@@ -3323,6 +3346,8 @@ vcpu_init_vmx(struct vcpu *vcpu)
        uint32_t cr0, cr4;
        int ret = 0;

+       mtx_init(&vcpu->vc_vmx_mtx, IPL_MPFLOOR);
+
        /* Allocate VMCS VA */
        vcpu->vc_control_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero,
            &kd_waitok);
@@ -4598,10 +4623,13 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
        struct region_descriptor gdtr, idtr;

        rw_assert_wrlock(&vcpu->vc_lock);
+       MUTEX_ASSERT_UNLOCKED(&vcpu->vc_vmx_mtx);
+       mtx_enter(&vcpu->vc_vmx_mtx);

        if (vcpu_reload_vmcs_vmx(vcpu)) {
                printf("%s: failed (re)loading vmcs\n", __func__);
-               return (EINVAL);
+               ret = EINVAL;
+               goto out;
        }

        /*
@@ -4643,7 +4671,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
                            vcpu->vc_id);
                        vmx_vcpu_dump_regs(vcpu);
                        dump_vcpu(vcpu);
-                       return (EINVAL);
+                       ret = EINVAL;
+                       goto out;
                default:
                        DPRINTF("%s: unimplemented exit type %d (%s)\n",
                            __func__,
@@ -4660,21 +4689,24 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
        setregion(&gdt, ci->ci_gdt, GDT_SIZE - 1);
        if (gdt.rd_base == 0) {
                printf("%s: setregion\n", __func__);
-               return (EINVAL);
+               ret = EINVAL;
+               goto out;
        }

        /* Host GDTR base */
        if (vmwrite(VMCS_HOST_IA32_GDTR_BASE, gdt.rd_base)) {
                printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__,
                    VMCS_HOST_IA32_GDTR_BASE, gdt.rd_base);
-               return (EINVAL);
+               ret = EINVAL;
+               goto out;
        }

        /* Host TR base */
        if (vmwrite(VMCS_HOST_IA32_TR_BASE, (uint64_t)ci->ci_tss)) {
                printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__,
                    VMCS_HOST_IA32_TR_BASE, (uint64_t)ci->ci_tss);
-               return (EINVAL);
+               ret = EINVAL;
+               goto out;
        }

        /* Host CR3 */
@@ -4682,7 +4714,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
        if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) {
                printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__,
                    VMCS_HOST_IA32_CR3, cr3);
-               return (EINVAL);
+               ret = EINVAL;
+               goto out;
        }

        /* Handle vmd(8) injected interrupts */
@@ -4691,7 +4724,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
                if (vmread(VMCS_GUEST_INTERRUPTIBILITY_ST, &int_st)) {
                        printf("%s: can't get interruptibility state\n",
                            __func__);
-                       return (EINVAL);
+                       ret = EINVAL;
+                       goto out;
                }

                /* Interruptbility state 0x3 covers NMIs and STI */
@@ -4702,7 +4736,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
                        if (vmwrite(VMCS_ENTRY_INTERRUPTION_INFO, eii)) {
                                printf("vcpu_run_vmx: can't vector "
                                    "interrupt to guest\n");
-                               return (EINVAL);
+                               ret = EINVAL;
+                               goto out;
                        }

                        irq = 0xFFFF;
@@ -4714,13 +4749,15 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
                if (vmread(VMCS_PROCBASED_CTLS, &procbased)) {
                        printf("%s: can't read procbased ctls on exit\n",
                            __func__);
-                       return (EINVAL);
+                       ret = EINVAL;
+                       goto out;
                } else {
                        procbased &= ~IA32_VMX_INTERRUPT_WINDOW_EXITING;
                        if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) {
                                printf("%s: can't write procbased ctls "
                                    "on exit\n", __func__);
-                               return (EINVAL);
+                               ret = EINVAL;
+                               goto out;
                        }
                }
        }
@@ -4935,13 +4972,14 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
                }
        }

-       vcpu->vc_last_pcpu = curcpu();
-
        /* Copy the VCPU register state to the exit structure */
        if (vcpu_readregs_vmx(vcpu, VM_RWREGS_ALL, &vcpu->vc_exit.vrs))
                ret = EINVAL;
        vcpu->vc_exit.cpl = vmm_get_guest_cpu_cpl(vcpu);

+out:
+       vcpu->vc_last_pcpu = curcpu();
+       mtx_leave(&vcpu->vc_vmx_mtx);
        return (ret);
 }

@@ -5569,11 +5607,23 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
                return (EAGAIN);
        }

-       KERNEL_LOCK();
+       /*
+        * We're holding a VMX specific mutex and must release it before
+        * attempting to fault a page. Since we may sleep, we can't assume
+        * we're on the same CPU as before.
+        */
+       vcpu->vc_last_pcpu = curcpu();
+       mtx_leave(&vcpu->vc_vmx_mtx);
+
        ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, VM_FAULT_WIRE,
            PROT_READ | PROT_WRITE | PROT_EXEC);
-       KERNEL_UNLOCK();

+       mtx_enter(&vcpu->vc_vmx_mtx);
+       if (vcpu_reload_vmcs_vmx(vcpu)) {
+               printf("%s: failed to reload vmcs\n", __func__);
+               return (EINVAL);
+       }
+
        if (ret)
                printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx\n",
                    __func__, ret, (uint64_t)gpa, vcpu->vc_gueststate.vg_rip);
@@ -5962,15 +6012,29 @@ vmx_load_pdptes(struct vcpu *vcpu)

        ret = 0;

-       cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_waitok);
+       /* We need to drop the mutex as km_alloc may sleep. */
+       vcpu->vc_last_pcpu = curcpu();
+       mtx_leave(&vcpu->vc_vmx_mtx);
+
+       cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none,
+           &kd_waitok);
        if (!cr3_host_virt) {
                printf("%s: can't allocate address for guest CR3 mapping\n",
                    __func__);
+               mtx_enter(&vcpu->vc_vmx_mtx);
+               /* XXX VMCS may be bogus */
                return (ENOMEM);
        }
-
        pmap_kenter_pa(cr3_host_virt, cr3_host_phys, PROT_READ);

+       mtx_enter(&vcpu->vc_vmx_mtx);
+       if (vcpu_reload_vmcs_vmx(vcpu)) {
+               printf("%s: failed to reload vmcs\n", __func__);
+               mtx_leave(&vcpu->vc_vmx_mtx);
+               ret = EINVAL;
+               goto exit;
+       }
+
        pdptes = (pd_entry_t *)cr3_host_virt;
        if (vmwrite(VMCS_GUEST_PDPTE0, pdptes[0])) {
                printf("%s: can't write guest PDPTE0\n", __func__);
@@ -5998,7 +6062,18 @@ vmx_load_pdptes(struct vcpu *vcpu)

 exit:
        pmap_kremove(cr3_host_virt, PAGE_SIZE);
+
+       /* km_free can sleep, so we need to release our mutex */
+       vcpu->vc_last_pcpu = curcpu();
+       mtx_leave(&vcpu->vc_vmx_mtx);
+
        km_free((void *)cr3_host_virt, PAGE_SIZE, &kv_any, &kp_none);
+
+       mtx_enter(&vcpu->vc_vmx_mtx);
+       if (vcpu_reload_vmcs_vmx(vcpu)) {
+               printf("%s: failed to reload vmcs after km_free\n", __func__);
+               ret = EINVAL;
+       }
        return (ret);
 }

blob - 4bc8627b554e1233ed55f23df30c8c2052fa84b2
blob + 357fc099c307aaa7ee075243847e791188ad615a
--- sys/arch/amd64/include/cpu.h
+++ sys/arch/amd64/include/cpu.h
@@ -49,7 +49,7 @@
 #endif /* _KERNEL */

 #include <sys/device.h>
-#include <sys/rwlock.h>
+#include <sys/mutex.h>
 #include <sys/sched.h>
 #include <sys/sensors.h>
 #include <sys/srp.h>
@@ -214,7 +214,7 @@ struct cpu_info {
        char            ci_panicbuf[512];

        paddr_t         ci_vmcs_pa;
-       struct rwlock   ci_vmcs_lock;
+       struct mutex    ci_vmcs_mtx;
 };

 #define CPUF_BSP       0x0001          /* CPU is the original BSP */
blob - 94bb172832d4c2847b1e83ebb9cc05538db6ac80
blob + dc8eb4ca62dbe3a0c01a3d37f298b250bd8daac2
--- sys/arch/amd64/include/vmmvar.h
+++ sys/arch/amd64/include/vmmvar.h
@@ -945,6 +945,7 @@ struct vcpu {
        uint64_t vc_shadow_pat;

        /* VMX only */
+       struct mutex vc_vmx_mtx;
        uint64_t vc_vmx_basic;
        uint64_t vc_vmx_entry_ctls;
        uint64_t vc_vmx_true_entry_ctls;
