This diff removes instability from VMX-based hosts by either removing the possibility of the process sleeping while the VMCS is active or reloading it if we had no choice.
A mutex is added to help guard the VMCS state so testing with witness has helped verify the diff. The rwlock on the cpu originally used in the remote vmclear routine is changed to a mutex accordingly. This diff does not remove possible calls to printf(9) via the DPRINTF macro as that's part of the next diff. One area of note: in vmx_load_pdptes() there's an XXX to call out that because of the printf(9) call on failure to km_alloc that the VMCS is potentially no longer valid. The upcoming diff to swap out printf(9) for log(9) will remove that. ok? -dv diff refs/heads/master refs/heads/vmm-vmx-02 blob - cc189771bc76745cd391e9b2a58f46dd92aa32ce blob + d7c6c592b12b236d14ade863fd1e75d8effc179c --- sys/arch/amd64/amd64/cpu.c +++ sys/arch/amd64/amd64/cpu.c @@ -790,7 +790,7 @@ cpu_init_vmm(struct cpu_info *ci) &ci->ci_vmxon_region_pa)) panic("Can't locate VMXON region in phys mem"); ci->ci_vmcs_pa = VMX_VMCS_PA_CLEAR; - rw_init(&ci->ci_vmcs_lock, "vmcslock"); + mtx_init(&ci->ci_vmcs_mtx, IPL_MPFLOOR); } } #endif /* NVMM > 0 */ blob - 2535558cce5f07f4d6a150ce53b100524801755a blob + 76c08badb82e3a90c5b7ab75c1b349fdb3229737 --- sys/arch/amd64/amd64/vmm.c +++ sys/arch/amd64/amd64/vmm.c @@ -810,11 +810,13 @@ vm_rwregs(struct vm_rwregs_params *vrwp, int dir) rw_enter_write(&vcpu->vc_lock); if (vmm_softc->mode == VMM_MODE_VMX || - vmm_softc->mode == VMM_MODE_EPT) + vmm_softc->mode == VMM_MODE_EPT) { + mtx_enter(&vcpu->vc_vmx_mtx); ret = (dir == 0) ? vcpu_readregs_vmx(vcpu, vrwp->vrwp_mask, vrs) : vcpu_writeregs_vmx(vcpu, vrwp->vrwp_mask, 1, vrs); - else if (vmm_softc->mode == VMM_MODE_SVM || + mtx_leave(&vcpu->vc_vmx_mtx); + } else if (vmm_softc->mode == VMM_MODE_SVM || vmm_softc->mode == VMM_MODE_RVI) ret = (dir == 0) ? 
vcpu_readregs_svm(vcpu, vrwp->vrwp_mask, vrs) : @@ -1370,7 +1372,7 @@ vmx_remote_vmclear(struct cpu_info *ci, struct vcpu *v { int ret = 0, nticks = 100000; - rw_enter_write(&ci->ci_vmcs_lock); + mtx_enter(&ci->ci_vmcs_mtx); atomic_swap_ulong(&ci->ci_vmcs_pa, vcpu->vc_control_pa); x86_send_ipi(ci, X86_IPI_VMCLEAR_VMM); @@ -1383,7 +1385,7 @@ vmx_remote_vmclear(struct cpu_info *ci, struct vcpu *v } } atomic_swap_uint(&vcpu->vc_vmx_vmcs_state, VMCS_CLEARED); - rw_exit_write(&ci->ci_vmcs_lock); + mtx_leave(&ci->ci_vmcs_mtx); return (ret); } @@ -1785,6 +1787,7 @@ vcpu_reload_vmcs_vmx(struct vcpu *vcpu) struct cpu_info *ci, *last_ci; rw_assert_wrlock(&vcpu->vc_lock); + MUTEX_ASSERT_LOCKED(&vcpu->vc_vmx_mtx); ci = curcpu(); last_ci = vcpu->vc_last_pcpu; @@ -1838,6 +1841,8 @@ vcpu_readregs_vmx(struct vcpu *vcpu, uint64_t regmask, struct vcpu_segment_info *sregs = vrs->vrs_sregs; struct vmx_msr_store *msr_store; + MUTEX_ASSERT_LOCKED(&vcpu->vc_vmx_mtx); + #ifdef VMM_DEBUG /* VMCS should be loaded... 
*/ paddr_t pa = 0ULL; @@ -2113,6 +2118,8 @@ vcpu_writeregs_vmx(struct vcpu *vcpu, uint64_t regmask struct vcpu_segment_info *sregs = vrs->vrs_sregs; struct vmx_msr_store *msr_store; + MUTEX_ASSERT_LOCKED(&vcpu->vc_vmx_mtx); + if (loadvmcs) { if (vcpu_reload_vmcs_vmx(vcpu)) return (EINVAL); @@ -2744,6 +2751,7 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg struct vmx_msr_store *msr_store; rw_assert_wrlock(&vcpu->vc_lock); + mtx_enter(&vcpu->vc_vmx_mtx); cr0 = vrs->vrs_crs[VCPU_REGS_CR0]; @@ -3028,12 +3036,25 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) { if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS, IA32_VMX_ENABLE_VPID, 1)) { + + /* We need to drop the mutex before acquiring a vpid */ + vcpu->vc_last_pcpu = curcpu(); + mtx_leave(&vcpu->vc_vmx_mtx); + if (vmm_alloc_vpid(&vpid)) { DPRINTF("%s: could not allocate VPID\n", __func__); ret = EINVAL; + goto exit_no_mtx; + } + + mtx_enter(&vcpu->vc_vmx_mtx); + if (vcpu_reload_vmcs_vmx(vcpu)) { + printf("%s: failed to reload vmcs\n", __func__); + ret = EINVAL; goto exit; } + if (vmwrite(VMCS_GUEST_VPID, vpid)) { DPRINTF("%s: error setting guest VPID\n", __func__); @@ -3297,6 +3318,8 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg atomic_swap_uint(&vcpu->vc_vmx_vmcs_state, VMCS_CLEARED); exit: + mtx_leave(&vcpu->vc_vmx_mtx); +exit_no_mtx: return (ret); } @@ -3323,6 +3346,8 @@ vcpu_init_vmx(struct vcpu *vcpu) uint32_t cr0, cr4; int ret = 0; + mtx_init(&vcpu->vc_vmx_mtx, IPL_MPFLOOR); + /* Allocate VMCS VA */ vcpu->vc_control_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page, &kp_zero, &kd_waitok); @@ -4598,10 +4623,13 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * struct region_descriptor gdtr, idtr; rw_assert_wrlock(&vcpu->vc_lock); + MUTEX_ASSERT_UNLOCKED(&vcpu->vc_vmx_mtx); + mtx_enter(&vcpu->vc_vmx_mtx); if (vcpu_reload_vmcs_vmx(vcpu)) { printf("%s: failed (re)loading vmcs\n", __func__); - return (EINVAL); + ret = EINVAL; + goto out; } 
/* @@ -4643,7 +4671,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * vcpu->vc_id); vmx_vcpu_dump_regs(vcpu); dump_vcpu(vcpu); - return (EINVAL); + ret = EINVAL; + goto out; default: DPRINTF("%s: unimplemented exit type %d (%s)\n", __func__, @@ -4660,21 +4689,24 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * setregion(&gdt, ci->ci_gdt, GDT_SIZE - 1); if (gdt.rd_base == 0) { printf("%s: setregion\n", __func__); - return (EINVAL); + ret = EINVAL; + goto out; } /* Host GDTR base */ if (vmwrite(VMCS_HOST_IA32_GDTR_BASE, gdt.rd_base)) { printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__, VMCS_HOST_IA32_GDTR_BASE, gdt.rd_base); - return (EINVAL); + ret = EINVAL; + goto out; } /* Host TR base */ if (vmwrite(VMCS_HOST_IA32_TR_BASE, (uint64_t)ci->ci_tss)) { printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__, VMCS_HOST_IA32_TR_BASE, (uint64_t)ci->ci_tss); - return (EINVAL); + ret = EINVAL; + goto out; } /* Host CR3 */ @@ -4682,7 +4714,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * if (vmwrite(VMCS_HOST_IA32_CR3, cr3)) { printf("%s: vmwrite(0x%04X, 0x%llx)\n", __func__, VMCS_HOST_IA32_CR3, cr3); - return (EINVAL); + ret = EINVAL; + goto out; } /* Handle vmd(8) injected interrupts */ @@ -4691,7 +4724,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * if (vmread(VMCS_GUEST_INTERRUPTIBILITY_ST, &int_st)) { printf("%s: can't get interruptibility state\n", __func__); - return (EINVAL); + ret = EINVAL; + goto out; } /* Interruptbility state 0x3 covers NMIs and STI */ @@ -4702,7 +4736,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * if (vmwrite(VMCS_ENTRY_INTERRUPTION_INFO, eii)) { printf("vcpu_run_vmx: can't vector " "interrupt to guest\n"); - return (EINVAL); + ret = EINVAL; + goto out; } irq = 0xFFFF; @@ -4714,13 +4749,15 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * if (vmread(VMCS_PROCBASED_CTLS, &procbased)) { printf("%s: can't read procbased ctls on exit\n", __func__); - return (EINVAL); + ret = EINVAL; + goto out; 
} else { procbased &= ~IA32_VMX_INTERRUPT_WINDOW_EXITING; if (vmwrite(VMCS_PROCBASED_CTLS, procbased)) { printf("%s: can't write procbased ctls " "on exit\n", __func__); - return (EINVAL); + ret = EINVAL; + goto out; } } } @@ -4935,13 +4972,14 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params * } } - vcpu->vc_last_pcpu = curcpu(); - /* Copy the VCPU register state to the exit structure */ if (vcpu_readregs_vmx(vcpu, VM_RWREGS_ALL, &vcpu->vc_exit.vrs)) ret = EINVAL; vcpu->vc_exit.cpl = vmm_get_guest_cpu_cpl(vcpu); +out: + vcpu->vc_last_pcpu = curcpu(); + mtx_leave(&vcpu->vc_vmx_mtx); return (ret); } @@ -5569,11 +5607,23 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa) return (EAGAIN); } - KERNEL_LOCK(); + /* + * We're holding a VMX specific mutex and must release it before + * attempting to fault a page. Since we may sleep, we can't assume + * we're on the same CPU as before. + */ + vcpu->vc_last_pcpu = curcpu(); + mtx_leave(&vcpu->vc_vmx_mtx); + ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, VM_FAULT_WIRE, PROT_READ | PROT_WRITE | PROT_EXEC); - KERNEL_UNLOCK(); + mtx_enter(&vcpu->vc_vmx_mtx); + if (vcpu_reload_vmcs_vmx(vcpu)) { + printf("%s: failed to reload vmcs\n", __func__); + return (EINVAL); + } + if (ret) printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx\n", __func__, ret, (uint64_t)gpa, vcpu->vc_gueststate.vg_rip); @@ -5962,15 +6012,29 @@ vmx_load_pdptes(struct vcpu *vcpu) ret = 0; - cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_waitok); + /* We need to drop the mutex as km_alloc may sleep. 
*/ + vcpu->vc_last_pcpu = curcpu(); + mtx_leave(&vcpu->vc_vmx_mtx); + + cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, + &kd_waitok); if (!cr3_host_virt) { printf("%s: can't allocate address for guest CR3 mapping\n", __func__); + mtx_enter(&vcpu->vc_vmx_mtx); + /* XXX VMCS may be bogus */ return (ENOMEM); } - pmap_kenter_pa(cr3_host_virt, cr3_host_phys, PROT_READ); + mtx_enter(&vcpu->vc_vmx_mtx); + if (vcpu_reload_vmcs_vmx(vcpu)) { + printf("%s: failed to reload vmcs\n", __func__); + mtx_leave(&vcpu->vc_vmx_mtx); + ret = EINVAL; + goto exit; + } + pdptes = (pd_entry_t *)cr3_host_virt; if (vmwrite(VMCS_GUEST_PDPTE0, pdptes[0])) { printf("%s: can't write guest PDPTE0\n", __func__); @@ -5998,7 +6062,18 @@ vmx_load_pdptes(struct vcpu *vcpu) exit: pmap_kremove(cr3_host_virt, PAGE_SIZE); + + /* km_free can sleep, so we need to release our mutex */ + vcpu->vc_last_pcpu = curcpu(); + mtx_leave(&vcpu->vc_vmx_mtx); + km_free((void *)cr3_host_virt, PAGE_SIZE, &kv_any, &kp_none); + + mtx_enter(&vcpu->vc_vmx_mtx); + if (vcpu_reload_vmcs_vmx(vcpu)) { + printf("%s: failed to reload vmcs after km_free\n", __func__); + ret = EINVAL; + } return (ret); } blob - 4bc8627b554e1233ed55f23df30c8c2052fa84b2 blob + 357fc099c307aaa7ee075243847e791188ad615a --- sys/arch/amd64/include/cpu.h +++ sys/arch/amd64/include/cpu.h @@ -49,7 +49,7 @@ #endif /* _KERNEL */ #include <sys/device.h> -#include <sys/rwlock.h> +#include <sys/mutex.h> #include <sys/sched.h> #include <sys/sensors.h> #include <sys/srp.h> @@ -214,7 +214,7 @@ struct cpu_info { char ci_panicbuf[512]; paddr_t ci_vmcs_pa; - struct rwlock ci_vmcs_lock; + struct mutex ci_vmcs_mtx; }; #define CPUF_BSP 0x0001 /* CPU is the original BSP */ blob - 94bb172832d4c2847b1e83ebb9cc05538db6ac80 blob + dc8eb4ca62dbe3a0c01a3d37f298b250bd8daac2 --- sys/arch/amd64/include/vmmvar.h +++ sys/arch/amd64/include/vmmvar.h @@ -945,6 +945,7 @@ struct vcpu { uint64_t vc_shadow_pat; /* VMX only */ + struct mutex vc_vmx_mtx; uint64_t 
vc_vmx_basic; uint64_t vc_vmx_entry_ctls; uint64_t vc_vmx_true_entry_ctls;