On Mon, Nov 29, 2021 at 08:41:22PM -0500, Dave Voutila wrote:
>
> Dave Voutila <d...@sisu.io> writes:
>
> > This diff removes instability on VMX-based hosts by either removing
> > the possibility of the process sleeping while the VMCS is active, or
> > reloading the VMCS afterwards when sleeping is unavoidable.
> >
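> > The pattern used throughout the diff looks roughly like the sketch
> > below; some_sleeping_call() is a placeholder for whatever may sleep,
> > the other names are taken from the diff itself:
> >
> > 	/* We may sleep, so record our CPU and reload the VMCS after. */
> > 	vcpu->vc_last_pcpu = curcpu();
> > 	ret = some_sleeping_call();
> > 	if (vcpu_reload_vmcs_vmx(vcpu)) {
> > 		printf("%s: failed to reload vmcs\n", __func__);
> > 		return (EINVAL);
> > 	}
> >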
> > A mutex is added to help guard the VMCS state; testing with
> > witness(4) helped verify the diff.
> >
>
> Removed the mutex as it has served its purpose in ferreting out some
> sleep points.
>
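> For example, holding a mutex across a sleeping call is exactly what
> witness(4) flags, roughly like this hypothetical sketch (the mutex
> name and the flagged call are illustrative, not from the diff):
>
> 	mtx_enter(&vcpu->vc_vmcs_mtx);	/* hypothetical VMCS mutex */
> 	/* witness(4) flags this: sleeping allocation with mutex held */
> 	p = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_waitok);
> 	mtx_leave(&vcpu->vc_vmcs_mtx);
>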
> > The rwlock on the cpu originally used in the remote vmclear routine is
> > changed to a mutex accordingly.
> >
>
> Reverted this. This update doesn't change the rwlock to a mutex: it's
> fine if we sleep while waiting for a remote clear, as it doesn't
> matter which CPU we wake up on since we're about to reload the VMCS
> anyway (see the sketch below).
>
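> Conceptually, the reload helper does something like the following
> sketch (my summary, not the exact vmm.c code; vmx_remote_vmclear()
> and vmptrld() are the existing helpers it builds on):
>
> 	int
> 	vcpu_reload_vmcs_vmx(struct vcpu *vcpu)
> 	{
> 		/* If we migrated, VMCLEAR the VMCS on its last cpu. */
> 		if (vcpu->vc_last_pcpu != curcpu() &&
> 		    vmx_remote_vmclear(vcpu->vc_last_pcpu, vcpu))
> 			return (EINVAL);
>
> 		/* Make the VMCS active and current on this cpu. */
> 		if (vmptrld(&vcpu->vc_control_pa))
> 			return (EINVAL);
>
> 		return (0);
> 	}
>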
> > This diff does not remove possible calls to printf(9) via the DPRINTF
> > macro, as that's part of the next diff.
> >
>
> Moot at this point.
>
> > One area of note: in vmx_load_pdptes() there's an XXX comment calling
> > out that, because of the printf(9) call on km_alloc(9) failure, the
> > VMCS is potentially no longer valid. The upcoming diff swapping
> > printf(9) for log(9) will remove that.
> >
>
> Revisited the above now that we're holding off on this printf -> log
> changeover.
>
> It was in the previous diff as well, but just to point out: this
> removes the KERNEL_LOCK dance around uvm_fault(9). We were only doing
> this on Intel hosts, as it wasn't understood (at the time) what was
> causing the VMCS corruption. AMD hosts haven't done this during
> nested page fault exit handling since my work to unlock vmm(4) at
> k2k21.
>
> ok?
>

ok mlarkin, and thanks for tracking these down.

-ml

>
> blob - 8e588f7dcbd1cec2e61e7b7292ee32ff4eb9a2e1
> blob + ac91b74fd4d5da774808ad1c78d75469ff89b458
> --- sys/arch/amd64/amd64/vmm.c
> +++ sys/arch/amd64/amd64/vmm.c
> @@ -3028,12 +3028,22 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, struct vcpu_reg
>           IA32_VMX_ACTIVATE_SECONDARY_CONTROLS, 1)) {
>               if (vcpu_vmx_check_cap(vcpu, IA32_VMX_PROCBASED2_CTLS,
>                   IA32_VMX_ENABLE_VPID, 1)) {
> -                     if (vmm_alloc_vpid(&vpid)) {
> +
> +                     /* We may sleep during allocation, so reload VMCS. */
> +                     vcpu->vc_last_pcpu = curcpu();
> +                     ret = vmm_alloc_vpid(&vpid);
> +                     if (vcpu_reload_vmcs_vmx(vcpu)) {
> +                             printf("%s: failed to reload vmcs\n", __func__);
> +                             ret = EINVAL;
> +                             goto exit;
> +                     }
> +                     if (ret) {
>                               DPRINTF("%s: could not allocate VPID\n",
>                                   __func__);
>                               ret = EINVAL;
>                               goto exit;
>                       }
> +
>                       if (vmwrite(VMCS_GUEST_VPID, vpid)) {
>                               DPRINTF("%s: error setting guest VPID\n",
>                                   __func__);
> @@ -5549,7 +5559,7 @@ svm_handle_np_fault(struct vcpu *vcpu)
>   *
>   * Return Values:
>   *  0: if successful
> - *  EINVAL: if fault type could not be determined
> + *  EINVAL: if fault type could not be determined or VMCS reload fails
>   *  EAGAIN: if a protection fault occurred, ie writing to a read-only page
>   *  errno: if uvm_fault(9) fails to wire in the page
>   */
> @@ -5569,10 +5579,14 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
>               return (EAGAIN);
>       }
>
> -     KERNEL_LOCK();
> +     /* We may sleep during uvm_fault(9), so reload VMCS. */
> +     vcpu->vc_last_pcpu = curcpu();
>       ret = uvm_fault(vcpu->vc_parent->vm_map, gpa, VM_FAULT_WIRE,
>           PROT_READ | PROT_WRITE | PROT_EXEC);
> -     KERNEL_UNLOCK();
> +     if (vcpu_reload_vmcs_vmx(vcpu)) {
> +             printf("%s: failed to reload vmcs\n", __func__);
> +             return (EINVAL);
> +     }
>
>       if (ret)
>               printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx\n",
> @@ -5962,7 +5976,16 @@ vmx_load_pdptes(struct vcpu *vcpu)
>
>       ret = 0;
>
> -     cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_waitok);
> +     /* We may sleep during km_alloc(9), so reload VMCS. */
> +     vcpu->vc_last_pcpu = curcpu();
> +     cr3_host_virt = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none,
> +         &kd_waitok);
> +     if (vcpu_reload_vmcs_vmx(vcpu)) {
> +             printf("%s: failed to reload vmcs\n", __func__);
> +             ret = EINVAL;
> +             goto exit;
> +     }
> +
>       if (!cr3_host_virt) {
>               printf("%s: can't allocate address for guest CR3 mapping\n",
>                   __func__);
> @@ -5998,7 +6021,15 @@ vmx_load_pdptes(struct vcpu *vcpu)
>
>  exit:
>       pmap_kremove(cr3_host_virt, PAGE_SIZE);
> +
> +     /* km_free(9) might sleep, so we need to reload VMCS. */
> +     vcpu->vc_last_pcpu = curcpu();
>       km_free((void *)cr3_host_virt, PAGE_SIZE, &kv_any, &kp_none);
> +     if (vcpu_reload_vmcs_vmx(vcpu)) {
> +             printf("%s: failed to reload vmcs after km_free\n", __func__);
> +             ret = EINVAL;
> +     }
> +
>       return (ret);
>  }
