Module Name:	src
Committed By:	maxv
Date:		Wed Feb 13 16:03:16 UTC 2019
Modified Files:
	src/sys/dev/nvmm: nvmm.c nvmm_internal.h
	src/sys/modules/nvmm: Makefile
Added Files:
	src/sys/dev/nvmm/x86: nvmm_x86_vmx.c nvmm_x86_vmxfunc.S

Log Message:
Add Intel-VMX support in NVMM. This allows us to run hardware-accelerated
VMs on Intel CPUs. Overall this implementation is fast and reliable; I am
able to run NetBSD VMs with many VCPUs on a quad-core Intel i5.

NVMM-Intel applies several optimizations already present in NVMM-AMD, and
has a similar code structure. No change was needed in the NVMM MI
frontend, or in libnvmm.

Some differences exist against AMD:

 - On Intel the ASID space is large, so we don't fall back to a shared
   ASID when more VCPUs are executing than there are ASIDs available in
   the host, contrary to AMD. There are enough ASIDs for the maximum
   number of VCPUs supported by NVMM.

 - On Intel there are two TLBs we need to take care of: one for the host
   (EPT) and one for the guest (VPID). Changes in the EPT paging
   structures flush the host TLB; changes to the guest mode flush the
   guest TLB (see the INVEPT/INVVPID sketch below).

 - On Intel there is no easy way to set/fetch the VTPR, so we intercept
   reads/writes to CR8 and maintain a software TPR, which we give to the
   virtualizer as if it were the effective TPR in the guest (see the CR8
   sketch below).

 - On Intel, because of SVS, the host CR4 and LSTAR are not static, so
   we're forced to save them on each VMENTRY.

 - There is extra Intel weirdness we need to take care of, for example
   the reserved bits in CR0 and CR4 when accesses trap.

While this implementation is functional and can already run many OSes, we
likely have a problem with 32-bit PAE guests, because they require special
care on Intel CPUs, and currently we don't handle that correctly; such
guests may misbehave for now (without affecting host stability). I expect
to fix that soon.

To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/sys/dev/nvmm/nvmm.c
cvs rdiff -u -r1.4 -r1.5 src/sys/dev/nvmm/nvmm_internal.h
cvs rdiff -u -r0 -r1.1 src/sys/dev/nvmm/x86/nvmm_x86_vmx.c \
    src/sys/dev/nvmm/x86/nvmm_x86_vmxfunc.S
cvs rdiff -u -r1.1 -r1.2 src/sys/modules/nvmm/Makefile

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
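As an illustration of the dual-TLB point in the log message above, here is
a minimal sketch of the two invalidation paths, distilled from the new
nvmm_x86_vmx.c below. The descriptor layouts and the _vmx_invept() /
_vmx_invvpid() assembly stubs are the ones added in this commit
(nvmm_x86_vmxfunc.S); the two wrapper functions and their names are
hypothetical, simplified here for exposition only.

/* Sketch only: assumes NetBSD kernel headers (sys/types.h, sys/cdefs.h). */

struct ept_desc {
	uint64_t eptp;	/* EPTP of the machine's guest-physical mappings */
	uint64_t mbz;	/* must be zero */
} __packed;

struct vpid_desc {
	uint64_t vpid;	/* per-VCPU ASID (VPID) */
	uint64_t addr;	/* linear address, for per-address flushes */
} __packed;

/* Assembly stubs added in nvmm_x86_vmxfunc.S. */
int _vmx_invept(uint64_t op, void *desc);
int _vmx_invvpid(uint64_t op, void *desc);

/* Host TLB (EPT): flushed when the EPT paging structures change. */
static void
flush_host_tlb(uint64_t eptp, uint64_t flush_op)	/* hypothetical */
{
	struct ept_desc desc = { .eptp = eptp, .mbz = 0 };
	_vmx_invept(flush_op, &desc);
}

/* Guest TLB (VPID): flushed when the guest mode changes. */
static void
flush_guest_tlb(uint64_t vpid, uint64_t flush_op)	/* hypothetical */
{
	struct vpid_desc desc = { .vpid = vpid, .addr = 0 };
	_vmx_invvpid(flush_op, &desc);
}

In the driver itself, vmx_vcpu_run() performs exactly these calls: INVEPT
with vmx_ept_flush_op when the machine's ept_want_flush bit is set for the
host CPU, and INVVPID with vmx_tlb_flush_op when the VCPU migrated to
another host CPU or its mode changed.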
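The software-TPR scheme can be summarized the same way. CR8 accesses trap
(PROC_CTLS_RCR8_EXITING and PROC_CTLS_LCR8_EXITING are set), and the exit
handler mirrors them into a software copy, gcr8, later reported to the
virtualizer as the TPR. This is a simplified, hypothetical version of
vmx_inkernel_handle_cr8() from the diff below; the real handler also
special-cases %rsp, which lives in the VMCS rather than in the GPR array.

/* Sketch only: types and macros are those added in this commit. */
static int
handle_cr8_sketch(struct vmx_cpudata *cpudata, uint64_t qual)
{
	uint64_t type = __SHIFTOUT(qual, VMX_QUAL_CR_TYPE);
	uint64_t gpr = __SHIFTOUT(qual, VMX_QUAL_CR_GPR);

	if (type == CR_TYPE_WRITE) {
		/* mov %reg, %cr8: update the software TPR. */
		cpudata->gcr8 = cpudata->gprs[gpr];
	} else if (type == CR_TYPE_READ) {
		/* mov %cr8, %reg: return the software TPR. */
		cpudata->gprs[gpr] = cpudata->gcr8;
	} else {
		return -1;	/* CLTS/LMSW don't apply to CR8 */
	}
	return 0;
}

/*
 * On VMEXIT the software TPR is then handed back:
 *	exit->exitstate[NVMM_X64_EXITSTATE_CR8] = cpudata->gcr8;
 */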
Modified files: Index: src/sys/dev/nvmm/nvmm.c diff -u src/sys/dev/nvmm/nvmm.c:1.6 src/sys/dev/nvmm/nvmm.c:1.7 --- src/sys/dev/nvmm/nvmm.c:1.6 Sat Jan 26 15:25:51 2019 +++ src/sys/dev/nvmm/nvmm.c Wed Feb 13 16:03:16 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: nvmm.c,v 1.6 2019/01/26 15:25:51 maxv Exp $ */ +/* $NetBSD: nvmm.c,v 1.7 2019/02/13 16:03:16 maxv Exp $ */ /* * Copyright (c) 2018 The NetBSD Foundation, Inc. @@ -30,7 +30,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.6 2019/01/26 15:25:51 maxv Exp $"); +__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.7 2019/02/13 16:03:16 maxv Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -54,7 +54,8 @@ __KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.6 static struct nvmm_machine machines[NVMM_MAX_MACHINES]; static const struct nvmm_impl *nvmm_impl_list[] = { - &nvmm_x86_svm /* x86 AMD SVM */ + &nvmm_x86_svm, /* x86 AMD SVM */ + &nvmm_x86_vmx /* x86 Intel VMX */ }; static const struct nvmm_impl *nvmm_impl = NULL; Index: src/sys/dev/nvmm/nvmm_internal.h diff -u src/sys/dev/nvmm/nvmm_internal.h:1.4 src/sys/dev/nvmm/nvmm_internal.h:1.5 --- src/sys/dev/nvmm/nvmm_internal.h:1.4 Mon Feb 11 07:07:37 2019 +++ src/sys/dev/nvmm/nvmm_internal.h Wed Feb 13 16:03:16 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: nvmm_internal.h,v 1.4 2019/02/11 07:07:37 maxv Exp $ */ +/* $NetBSD: nvmm_internal.h,v 1.5 2019/02/13 16:03:16 maxv Exp $ */ /* * Copyright (c) 2018 The NetBSD Foundation, Inc. @@ -109,5 +109,6 @@ int nvmm_vcpu_get(struct nvmm_machine *, void nvmm_vcpu_put(struct nvmm_cpu *); extern const struct nvmm_impl nvmm_x86_svm; +extern const struct nvmm_impl nvmm_x86_vmx; #endif /* _NVMM_INTERNAL_H_ */ Index: src/sys/modules/nvmm/Makefile diff -u src/sys/modules/nvmm/Makefile:1.1 src/sys/modules/nvmm/Makefile:1.2 --- src/sys/modules/nvmm/Makefile:1.1 Wed Nov 7 07:43:08 2018 +++ src/sys/modules/nvmm/Makefile Wed Feb 13 16:03:16 2019 @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.1 2018/11/07 07:43:08 maxv Exp $ +# $NetBSD: Makefile,v 1.2 2019/02/13 16:03:16 maxv Exp $ .include "../Makefile.inc" .include "../Makefile.assym" @@ -14,6 +14,7 @@ SRCS= nvmm.c .if ${MACHINE_ARCH} == "x86_64" SRCS+= nvmm_x86_svm.c nvmm_x86_svmfunc.S +SRCS+= nvmm_x86_vmx.c nvmm_x86_vmxfunc.S .endif .include <bsd.kmodule.mk> Added files: Index: src/sys/dev/nvmm/x86/nvmm_x86_vmx.c diff -u /dev/null src/sys/dev/nvmm/x86/nvmm_x86_vmx.c:1.1 --- /dev/null Wed Feb 13 16:03:16 2019 +++ src/sys/dev/nvmm/x86/nvmm_x86_vmx.c Wed Feb 13 16:03:16 2019 @@ -0,0 +1,2823 @@ +/* $NetBSD: nvmm_x86_vmx.c,v 1.1 2019/02/13 16:03:16 maxv Exp $ */ + +/* + * Copyright (c) 2018 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Maxime Villard. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_vmx.c,v 1.1 2019/02/13 16:03:16 maxv Exp $"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/kmem.h> +#include <sys/cpu.h> +#include <sys/xcall.h> + +#include <uvm/uvm.h> +#include <uvm/uvm_page.h> + +#include <x86/cputypes.h> +#include <x86/specialreg.h> +#include <x86/pmap.h> +#include <x86/dbregs.h> +#include <machine/cpuvar.h> + +#include <dev/nvmm/nvmm.h> +#include <dev/nvmm/nvmm_internal.h> +#include <dev/nvmm/x86/nvmm_x86.h> + +int _vmx_vmxon(paddr_t *pa); +int _vmx_vmxoff(void); +int _vmx_invept(uint64_t op, void *desc); +int _vmx_invvpid(uint64_t op, void *desc); +int _vmx_vmread(uint64_t op, uint64_t *val); +int _vmx_vmwrite(uint64_t op, uint64_t val); +int _vmx_vmptrld(paddr_t *pa); +int _vmx_vmptrst(paddr_t *pa); +int _vmx_vmclear(paddr_t *pa); +int vmx_vmlaunch(uint64_t *gprs); +int vmx_vmresume(uint64_t *gprs); + +#define vmx_vmxon(a) \ + if (__predict_false(_vmx_vmxon(a) != 0)) { \ + panic("%s: VMXON failed", __func__); \ + } +#define vmx_vmxoff() \ + if (__predict_false(_vmx_vmxoff() != 0)) { \ + panic("%s: VMXOFF failed", __func__); \ + } +#define vmx_invept(a, b) \ + if (__predict_false(_vmx_invept(a, b) != 0)) { \ + panic("%s: INVEPT failed", __func__); \ + } +#define vmx_invvpid(a, b) \ + if (__predict_false(_vmx_invvpid(a, b) != 0)) { \ + panic("%s: INVVPID failed", __func__); \ + } +#define vmx_vmread(a, b) \ + if (__predict_false(_vmx_vmread(a, b) != 0)) { \ + panic("%s: VMREAD failed", __func__); \ + } +#define vmx_vmwrite(a, b) \ + if (__predict_false(_vmx_vmwrite(a, b) != 0)) { \ + panic("%s: VMWRITE failed", __func__); \ + } +#define vmx_vmptrld(a) \ + if (__predict_false(_vmx_vmptrld(a) != 0)) { \ + panic("%s: VMPTRLD failed", __func__); \ + } +#define vmx_vmptrst(a) \ + if (__predict_false(_vmx_vmptrst(a) != 0)) { \ + panic("%s: VMPTRST failed", __func__); \ + } +#define vmx_vmclear(a) \ + if (__predict_false(_vmx_vmclear(a) != 0)) { \ + panic("%s: VMCLEAR failed", __func__); \ + } + +#define MSR_IA32_FEATURE_CONTROL 0x003A +#define IA32_FEATURE_CONTROL_LOCK __BIT(0) +#define IA32_FEATURE_CONTROL_IN_SMX __BIT(1) +#define IA32_FEATURE_CONTROL_OUT_SMX __BIT(2) + +#define MSR_IA32_VMX_BASIC 0x0480 +#define IA32_VMX_BASIC_IDENT __BITS(30,0) +#define IA32_VMX_BASIC_DATA_SIZE __BITS(44,32) +#define IA32_VMX_BASIC_MEM_WIDTH __BIT(48) +#define IA32_VMX_BASIC_DUAL __BIT(49) +#define IA32_VMX_BASIC_MEM_TYPE __BITS(53,50) +#define MEM_TYPE_UC 0 +#define MEM_TYPE_WB 6 +#define IA32_VMX_BASIC_IO_REPORT __BIT(54) +#define IA32_VMX_BASIC_TRUE_CTLS __BIT(55) + +#define MSR_IA32_VMX_PINBASED_CTLS 0x0481 +#define MSR_IA32_VMX_PROCBASED_CTLS 0x0482 +#define MSR_IA32_VMX_EXIT_CTLS 0x0483 +#define MSR_IA32_VMX_ENTRY_CTLS 0x0484 +#define MSR_IA32_VMX_PROCBASED_CTLS2 0x048B + +#define MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x048D +#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x048E +#define MSR_IA32_VMX_TRUE_EXIT_CTLS 0x048F +#define 
MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x0490 + +#define MSR_IA32_VMX_CR0_FIXED0 0x0486 +#define MSR_IA32_VMX_CR0_FIXED1 0x0487 +#define MSR_IA32_VMX_CR4_FIXED0 0x0488 +#define MSR_IA32_VMX_CR4_FIXED1 0x0489 + +#define MSR_IA32_VMX_EPT_VPID_CAP 0x048C +#define IA32_VMX_EPT_VPID_WALKLENGTH_4 __BIT(6) +#define IA32_VMX_EPT_VPID_UC __BIT(8) +#define IA32_VMX_EPT_VPID_WB __BIT(14) +#define IA32_VMX_EPT_VPID_INVEPT __BIT(20) +#define IA32_VMX_EPT_VPID_FLAGS_AD __BIT(21) +#define IA32_VMX_EPT_VPID_INVEPT_CONTEXT __BIT(25) +#define IA32_VMX_EPT_VPID_INVEPT_ALL __BIT(26) +#define IA32_VMX_EPT_VPID_INVVPID __BIT(32) +#define IA32_VMX_EPT_VPID_INVVPID_ADDR __BIT(40) +#define IA32_VMX_EPT_VPID_INVVPID_CONTEXT __BIT(41) +#define IA32_VMX_EPT_VPID_INVVPID_ALL __BIT(42) +#define IA32_VMX_EPT_VPID_INVVPID_CONTEXT_NOG __BIT(43) + +/* -------------------------------------------------------------------------- */ + +/* 16-bit control fields */ +#define VMCS_VPID 0x00000000 +#define VMCS_PIR_VECTOR 0x00000002 +#define VMCS_EPTP_INDEX 0x00000004 +/* 16-bit guest-state fields */ +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080A +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C +#define VMCS_GUEST_TR_SELECTOR 0x0000080E +#define VMCS_GUEST_INTR_STATUS 0x00000810 +#define VMCS_PML_INDEX 0x00000812 +/* 16-bit host-state fields */ +#define VMCS_HOST_ES_SELECTOR 0x00000C00 +#define VMCS_HOST_CS_SELECTOR 0x00000C02 +#define VMCS_HOST_SS_SELECTOR 0x00000C04 +#define VMCS_HOST_DS_SELECTOR 0x00000C06 +#define VMCS_HOST_FS_SELECTOR 0x00000C08 +#define VMCS_HOST_GS_SELECTOR 0x00000C0A +#define VMCS_HOST_TR_SELECTOR 0x00000C0C +/* 64-bit control fields */ +#define VMCS_IO_BITMAP_A 0x00002000 +#define VMCS_IO_BITMAP_B 0x00002002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_EXIT_MSR_STORE_ADDRESS 0x00002006 +#define VMCS_EXIT_MSR_LOAD_ADDRESS 0x00002008 +#define VMCS_ENTRY_MSR_LOAD_ADDRESS 0x0000200A +#define VMCS_EXECUTIVE_VMCS 0x0000200C +#define VMCS_PML_ADDRESS 0x0000200E +#define VMCS_TSC_OFFSET 0x00002010 +#define VMCS_VIRTUAL_APIC 0x00002012 +#define VMCS_APIC_ACCESS 0x00002014 +#define VMCS_PIR_DESC 0x00002016 +#define VMCS_VM_CONTROL 0x00002018 +#define VMCS_EPTP 0x0000201A +#define EPTP_TYPE __BITS(2,0) +#define EPTP_TYPE_UC 0 +#define EPTP_TYPE_WB 6 +#define EPTP_WALKLEN __BITS(5,3) +#define EPTP_FLAGS_AD __BIT(6) +#define EPTP_PHYSADDR __BITS(63,12) +#define VMCS_EOI_EXIT0 0x0000201C +#define VMCS_EOI_EXIT1 0x0000201E +#define VMCS_EOI_EXIT2 0x00002020 +#define VMCS_EOI_EXIT3 0x00002022 +#define VMCS_EPTP_LIST 0x00002024 +#define VMCS_VMREAD_BITMAP 0x00002026 +#define VMCS_VMWRITE_BITMAP 0x00002028 +#define VMCS_VIRTUAL_EXCEPTION 0x0000202A +#define VMCS_XSS_EXIT_BITMAP 0x0000202C +#define VMCS_ENCLS_EXIT_BITMAP 0x0000202E +#define VMCS_TSC_MULTIPLIER 0x00002032 +/* 64-bit read-only fields */ +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 +/* 64-bit guest-state fields */ +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_PDPTE0 0x0000280A +#define VMCS_GUEST_PDPTE1 0x0000280C +#define VMCS_GUEST_PDPTE2 0x0000280E +#define VMCS_GUEST_PDPTE3 0x00002810 +#define VMCS_GUEST_BNDCFGS 0x00002812 +/* 64-bit host-state fields */ +#define 
VMCS_HOST_IA32_PAT 0x00002C00 +#define VMCS_HOST_IA32_EFER 0x00002C02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04 +/* 32-bit control fields */ +#define VMCS_PINBASED_CTLS 0x00004000 +#define PIN_CTLS_INT_EXITING __BIT(0) +#define PIN_CTLS_NMI_EXITING __BIT(3) +#define PIN_CTLS_VIRTUAL_NMIS __BIT(5) +#define PIN_CTLS_ACTIVATE_PREEMPT_TIMER __BIT(6) +#define PIN_CTLS_PROCESS_POSTED_INTS __BIT(7) +#define VMCS_PROCBASED_CTLS 0x00004002 +#define PROC_CTLS_INT_WINDOW_EXITING __BIT(2) +#define PROC_CTLS_USE_TSC_OFFSETTING __BIT(3) +#define PROC_CTLS_HLT_EXITING __BIT(7) +#define PROC_CTLS_INVLPG_EXITING __BIT(9) +#define PROC_CTLS_MWAIT_EXITING __BIT(10) +#define PROC_CTLS_RDPMC_EXITING __BIT(11) +#define PROC_CTLS_RDTSC_EXITING __BIT(12) +#define PROC_CTLS_RCR3_EXITING __BIT(15) +#define PROC_CTLS_LCR3_EXITING __BIT(16) +#define PROC_CTLS_RCR8_EXITING __BIT(19) +#define PROC_CTLS_LCR8_EXITING __BIT(20) +#define PROC_CTLS_USE_TPR_SHADOW __BIT(21) +#define PROC_CTLS_NMI_WINDOW_EXITING __BIT(22) +#define PROC_CTLS_DR_EXITING __BIT(23) +#define PROC_CTLS_UNCOND_IO_EXITING __BIT(24) +#define PROC_CTLS_USE_IO_BITMAPS __BIT(25) +#define PROC_CTLS_MONITOR_TRAP_FLAG __BIT(27) +#define PROC_CTLS_USE_MSR_BITMAPS __BIT(28) +#define PROC_CTLS_MONITOR_EXITING __BIT(29) +#define PROC_CTLS_PAUSE_EXITING __BIT(30) +#define PROC_CTLS_ACTIVATE_CTLS2 __BIT(31) +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PF_ERROR_MASK 0x00004006 +#define VMCS_PF_ERROR_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400A +#define VMCS_EXIT_CTLS 0x0000400C +#define EXIT_CTLS_SAVE_DEBUG_CONTROLS __BIT(2) +#define EXIT_CTLS_HOST_LONG_MODE __BIT(9) +#define EXIT_CTLS_LOAD_PERFGLOBALCTRL __BIT(12) +#define EXIT_CTLS_ACK_INTERRUPT __BIT(15) +#define EXIT_CTLS_SAVE_PAT __BIT(18) +#define EXIT_CTLS_LOAD_PAT __BIT(19) +#define EXIT_CTLS_SAVE_EFER __BIT(20) +#define EXIT_CTLS_LOAD_EFER __BIT(21) +#define EXIT_CTLS_SAVE_PREEMPT_TIMER __BIT(22) +#define EXIT_CTLS_CLEAR_BNDCFGS __BIT(23) +#define EXIT_CTLS_CONCEAL_PT __BIT(24) +#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E +#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_ENTRY_CTLS 0x00004012 +#define ENTRY_CTLS_LOAD_DEBUG_CONTROLS __BIT(2) +#define ENTRY_CTLS_LONG_MODE __BIT(9) +#define ENTRY_CTLS_SMM __BIT(10) +#define ENTRY_CTLS_DISABLE_DUAL __BIT(11) +#define ENTRY_CTLS_LOAD_PERFGLOBALCTRL __BIT(13) +#define ENTRY_CTLS_LOAD_PAT __BIT(14) +#define ENTRY_CTLS_LOAD_EFER __BIT(15) +#define ENTRY_CTLS_LOAD_BNDCFGS __BIT(16) +#define ENTRY_CTLS_CONCEAL_PT __BIT(17) +#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_ENTRY_INTR_INFO 0x00004016 +#define INTR_INFO_VECTOR __BITS(7,0) +#define INTR_INFO_TYPE_EXT_INT (0 << 8) +#define INTR_INFO_TYPE_NMI (2 << 8) +#define INTR_INFO_TYPE_HW_EXC (3 << 8) +#define INTR_INFO_TYPE_SW_INT (4 << 8) +#define INTR_INFO_TYPE_PRIV_SW_EXC (5 << 8) +#define INTR_INFO_TYPE_SW_EXC (6 << 8) +#define INTR_INFO_TYPE_OTHER (7 << 8) +#define INTR_INFO_ERROR __BIT(11) +#define INTR_INFO_VALID __BIT(31) +#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018 +#define VMCS_ENTRY_INST_LENGTH 0x0000401A +#define VMCS_TPR_THRESHOLD 0x0000401C +#define VMCS_PROCBASED_CTLS2 0x0000401E +#define PROC_CTLS2_VIRT_APIC_ACCESSES __BIT(0) +#define PROC_CTLS2_ENABLE_EPT __BIT(1) +#define PROC_CTLS2_DESC_TABLE_EXITING __BIT(2) +#define PROC_CTLS2_ENABLE_RDTSCP __BIT(3) +#define PROC_CTLS2_VIRT_X2APIC __BIT(4) +#define PROC_CTLS2_ENABLE_VPID __BIT(5) +#define PROC_CTLS2_WBINVD_EXITING __BIT(6) +#define PROC_CTLS2_UNRESTRICTED_GUEST __BIT(7) +#define 
PROC_CTLS2_APIC_REG_VIRT __BIT(8) +#define PROC_CTLS2_VIRT_INT_DELIVERY __BIT(9) +#define PROC_CTLS2_PAUSE_LOOP_EXITING __BIT(10) +#define PROC_CTLS2_RDRAND_EXITING __BIT(11) +#define PROC_CTLS2_INVPCID_ENABLE __BIT(12) +#define PROC_CTLS2_VMFUNC_ENABLE __BIT(13) +#define PROC_CTLS2_VMCS_SHADOWING __BIT(14) +#define PROC_CTLS2_ENCLS_EXITING __BIT(15) +#define PROC_CTLS2_RDSEED_EXITING __BIT(16) +#define PROC_CTLS2_PML_ENABLE __BIT(17) +#define PROC_CTLS2_EPT_VIOLATION __BIT(18) +#define PROC_CTLS2_CONCEAL_VMX_FROM_PT __BIT(19) +#define PROC_CTLS2_XSAVES_ENABLE __BIT(20) +#define PROC_CTLS2_MODE_BASED_EXEC_EPT __BIT(22) +#define PROC_CTLS2_USE_TSC_SCALING __BIT(25) +#define VMCS_PLE_GAP 0x00004020 +#define VMCS_PLE_WINDOW 0x00004022 +/* 32-bit read-only data fields */ +#define VMCS_INSTRUCTION_ERROR 0x00004400 +#define VMCS_EXIT_REASON 0x00004402 +#define VMCS_EXIT_INTR_INFO 0x00004404 +#define VMCS_EXIT_INTR_ERRCODE 0x00004406 +#define VMCS_IDT_VECTORING_INFO 0x00004408 +#define VMCS_IDT_VECTORING_ERROR 0x0000440A +#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C +#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E +/* 32-bit guest-state fields */ +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480A +#define VMCS_GUEST_LDTR_LIMIT 0x0000480C +#define VMCS_GUEST_TR_LIMIT 0x0000480E +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824 +#define INT_STATE_STI __BIT(0) +#define INT_STATE_MOVSS __BIT(1) +#define INT_STATE_SMI __BIT(2) +#define INT_STATE_NMI __BIT(3) +#define INT_STATE_ENCLAVE __BIT(4) +#define VMCS_GUEST_ACTIVITY 0x00004826 +#define VMCS_GUEST_SMBASE 0x00004828 +#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A +#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E +/* 32-bit host state fields */ +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00 +/* Natural-Width control fields */ +#define VMCS_CR0_MASK 0x00006000 +#define VMCS_CR4_MASK 0x00006002 +#define VMCS_CR0_SHADOW 0x00006004 +#define VMCS_CR4_SHADOW 0x00006006 +#define VMCS_CR3_TARGET0 0x00006008 +#define VMCS_CR3_TARGET1 0x0000600A +#define VMCS_CR3_TARGET2 0x0000600C +#define VMCS_CR3_TARGET3 0x0000600E +/* Natural-Width read-only fields */ +#define VMCS_EXIT_QUALIFICATION 0x00006400 +#define VMCS_IO_RCX 0x00006402 +#define VMCS_IO_RSI 0x00006404 +#define VMCS_IO_RDI 0x00006406 +#define VMCS_IO_RIP 0x00006408 +#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A +/* Natural-Width guest-state fields */ +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680A +#define VMCS_GUEST_DS_BASE 0x0000680C +#define VMCS_GUEST_FS_BASE 0x0000680E +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define 
VMCS_GUEST_DR7 0x0000681A +#define VMCS_GUEST_RSP 0x0000681C +#define VMCS_GUEST_RIP 0x0000681E +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826 +/* Natural-Width host-state fields */ +#define VMCS_HOST_CR0 0x00006C00 +#define VMCS_HOST_CR3 0x00006C02 +#define VMCS_HOST_CR4 0x00006C04 +#define VMCS_HOST_FS_BASE 0x00006C06 +#define VMCS_HOST_GS_BASE 0x00006C08 +#define VMCS_HOST_TR_BASE 0x00006C0A +#define VMCS_HOST_GDTR_BASE 0x00006C0C +#define VMCS_HOST_IDTR_BASE 0x00006C0E +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12 +#define VMCS_HOST_RSP 0x00006C14 +#define VMCS_HOST_RIP 0x00006c16 + +/* VMX basic exit reasons. */ +#define VMCS_EXITCODE_EXC_NMI 0 +#define VMCS_EXITCODE_EXT_INT 1 +#define VMCS_EXITCODE_SHUTDOWN 2 +#define VMCS_EXITCODE_INIT 3 +#define VMCS_EXITCODE_SIPI 4 +#define VMCS_EXITCODE_SMI 5 +#define VMCS_EXITCODE_OTHER_SMI 6 +#define VMCS_EXITCODE_INT_WINDOW 7 +#define VMCS_EXITCODE_NMI_WINDOW 8 +#define VMCS_EXITCODE_TASK_SWITCH 9 +#define VMCS_EXITCODE_CPUID 10 +#define VMCS_EXITCODE_GETSEC 11 +#define VMCS_EXITCODE_HLT 12 +#define VMCS_EXITCODE_INVD 13 +#define VMCS_EXITCODE_INVLPG 14 +#define VMCS_EXITCODE_RDPMC 15 +#define VMCS_EXITCODE_RDTSC 16 +#define VMCS_EXITCODE_RSM 17 +#define VMCS_EXITCODE_VMCALL 18 +#define VMCS_EXITCODE_VMCLEAR 19 +#define VMCS_EXITCODE_VMLAUNCH 20 +#define VMCS_EXITCODE_VMPTRLD 21 +#define VMCS_EXITCODE_VMPTRST 22 +#define VMCS_EXITCODE_VMREAD 23 +#define VMCS_EXITCODE_VMRESUME 24 +#define VMCS_EXITCODE_VMWRITE 25 +#define VMCS_EXITCODE_VMXOFF 26 +#define VMCS_EXITCODE_VMXON 27 +#define VMCS_EXITCODE_CR 28 +#define VMCS_EXITCODE_DR 29 +#define VMCS_EXITCODE_IO 30 +#define VMCS_EXITCODE_RDMSR 31 +#define VMCS_EXITCODE_WRMSR 32 +#define VMCS_EXITCODE_FAIL_GUEST_INVALID 33 +#define VMCS_EXITCODE_FAIL_MSR_INVALID 34 +#define VMCS_EXITCODE_MWAIT 36 +#define VMCS_EXITCODE_TRAP_FLAG 37 +#define VMCS_EXITCODE_MONITOR 39 +#define VMCS_EXITCODE_PAUSE 40 +#define VMCS_EXITCODE_FAIL_MACHINE_CHECK 41 +#define VMCS_EXITCODE_TPR_BELOW 43 +#define VMCS_EXITCODE_APIC_ACCESS 44 +#define VMCS_EXITCODE_VEOI 45 +#define VMCS_EXITCODE_GDTR_IDTR 46 +#define VMCS_EXITCODE_LDTR_TR 47 +#define VMCS_EXITCODE_EPT_VIOLATION 48 +#define VMCS_EXITCODE_EPT_MISCONFIG 49 +#define VMCS_EXITCODE_INVEPT 50 +#define VMCS_EXITCODE_RDTSCP 51 +#define VMCS_EXITCODE_PREEMPT_TIMEOUT 52 +#define VMCS_EXITCODE_INVVPID 53 +#define VMCS_EXITCODE_WBINVD 54 +#define VMCS_EXITCODE_XSETBV 55 +#define VMCS_EXITCODE_APIC_WRITE 56 +#define VMCS_EXITCODE_RDRAND 57 +#define VMCS_EXITCODE_INVPCID 58 +#define VMCS_EXITCODE_VMFUNC 59 +#define VMCS_EXITCODE_ENCLS 60 +#define VMCS_EXITCODE_RDSEED 61 +#define VMCS_EXITCODE_PAGE_LOG_FULL 62 +#define VMCS_EXITCODE_XSAVES 63 +#define VMCS_EXITCODE_XRSTORS 64 + +/* -------------------------------------------------------------------------- */ + +#define VMX_MSRLIST_STAR 0 +#define VMX_MSRLIST_LSTAR 1 +#define VMX_MSRLIST_CSTAR 2 +#define VMX_MSRLIST_SFMASK 3 +#define VMX_MSRLIST_KERNELGSBASE 4 +#define VMX_MSRLIST_EXIT_NMSR 5 +#define VMX_MSRLIST_L1DFLUSH 5 + +/* On entry, we may do +1 to include L1DFLUSH. 
*/ +static size_t vmx_msrlist_entry_nmsr __read_mostly = VMX_MSRLIST_EXIT_NMSR; + +struct vmxon { + uint32_t ident; +#define VMXON_IDENT_REVISION __BITS(30,0) + + uint8_t data[PAGE_SIZE - 4]; +} __packed; + +CTASSERT(sizeof(struct vmxon) == PAGE_SIZE); + +struct vmxoncpu { + vaddr_t va; + paddr_t pa; +}; + +static struct vmxoncpu vmxoncpu[MAXCPUS]; + +struct vmcs { + uint32_t ident; +#define VMCS_IDENT_REVISION __BITS(30,0) +#define VMCS_IDENT_SHADOW __BIT(31) + + uint32_t abort; + uint8_t data[PAGE_SIZE - 8]; +} __packed; + +CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); + +struct msr_entry { + uint32_t msr; + uint32_t rsvd; + uint64_t val; +} __packed; + +struct ept_desc { + uint64_t eptp; + uint64_t mbz; +} __packed; + +struct vpid_desc { + uint64_t vpid; + uint64_t addr; +} __packed; + +#define VPID_MAX 0xFFFF + +/* Make sure we never run out of VPIDs. */ +CTASSERT(VPID_MAX-1 >= NVMM_MAX_MACHINES * NVMM_MAX_VCPUS); + +static uint64_t vmx_tlb_flush_op __read_mostly; +static uint64_t vmx_ept_flush_op __read_mostly; +static uint64_t vmx_eptp_type __read_mostly; + +static uint64_t vmx_pinbased_ctls __read_mostly; +static uint64_t vmx_procbased_ctls __read_mostly; +static uint64_t vmx_procbased_ctls2 __read_mostly; +static uint64_t vmx_entry_ctls __read_mostly; +static uint64_t vmx_exit_ctls __read_mostly; + +static uint64_t vmx_cr0_fixed0 __read_mostly; +static uint64_t vmx_cr0_fixed1 __read_mostly; +static uint64_t vmx_cr4_fixed0 __read_mostly; +static uint64_t vmx_cr4_fixed1 __read_mostly; + +#define VMX_PINBASED_CTLS_ONE \ + (PIN_CTLS_INT_EXITING| \ + PIN_CTLS_NMI_EXITING| \ + PIN_CTLS_VIRTUAL_NMIS) + +#define VMX_PINBASED_CTLS_ZERO 0 + +#define VMX_PROCBASED_CTLS_ONE \ + (PROC_CTLS_USE_TSC_OFFSETTING| \ + PROC_CTLS_HLT_EXITING| \ + PROC_CTLS_MWAIT_EXITING | \ + PROC_CTLS_RDPMC_EXITING | \ + PROC_CTLS_RCR8_EXITING | \ + PROC_CTLS_LCR8_EXITING | \ + PROC_CTLS_UNCOND_IO_EXITING | /* no I/O bitmap */ \ + PROC_CTLS_USE_MSR_BITMAPS | \ + PROC_CTLS_MONITOR_EXITING | \ + PROC_CTLS_ACTIVATE_CTLS2) + +#define VMX_PROCBASED_CTLS_ZERO \ + (PROC_CTLS_RCR3_EXITING| \ + PROC_CTLS_LCR3_EXITING) + +#define VMX_PROCBASED_CTLS2_ONE \ + (PROC_CTLS2_ENABLE_EPT| \ + PROC_CTLS2_ENABLE_VPID| \ + PROC_CTLS2_UNRESTRICTED_GUEST) + +#define VMX_PROCBASED_CTLS2_ZERO 0 + +#define VMX_ENTRY_CTLS_ONE \ + (ENTRY_CTLS_LOAD_DEBUG_CONTROLS| \ + ENTRY_CTLS_LOAD_EFER| \ + ENTRY_CTLS_LOAD_PAT) + +#define VMX_ENTRY_CTLS_ZERO \ + (ENTRY_CTLS_SMM| \ + ENTRY_CTLS_DISABLE_DUAL) + +#define VMX_EXIT_CTLS_ONE \ + (EXIT_CTLS_SAVE_DEBUG_CONTROLS| \ + EXIT_CTLS_HOST_LONG_MODE| \ + EXIT_CTLS_SAVE_PAT| \ + EXIT_CTLS_LOAD_PAT| \ + EXIT_CTLS_SAVE_EFER| \ + EXIT_CTLS_LOAD_EFER) + +#define VMX_EXIT_CTLS_ZERO 0 + +static uint8_t *vmx_asidmap __read_mostly; +static uint32_t vmx_maxasid __read_mostly; +static kmutex_t vmx_asidlock __cacheline_aligned; + +#define VMX_XCR0_MASK_DEFAULT (XCR0_X87|XCR0_SSE) +static uint64_t vmx_xcr0_mask __read_mostly; + +#define VMX_NCPUIDS 32 + +#define VMCS_NPAGES 1 +#define VMCS_SIZE (VMCS_NPAGES * PAGE_SIZE) + +#define MSRBM_NPAGES 1 +#define MSRBM_SIZE (MSRBM_NPAGES * PAGE_SIZE) + +#define EFER_TLB_FLUSH \ + (EFER_NXE|EFER_LMA|EFER_LME) +#define CR0_TLB_FLUSH \ + (CR0_PG|CR0_WP|CR0_CD|CR0_NW) +#define CR4_TLB_FLUSH \ + (CR4_PGE|CR4_PAE|CR4_PSE) + +/* -------------------------------------------------------------------------- */ + +struct vmx_machdata { + bool cpuidpresent[VMX_NCPUIDS]; + struct nvmm_x86_conf_cpuid cpuid[VMX_NCPUIDS]; + kcpuset_t *ept_want_flush; +}; + +static const size_t 
vmx_conf_sizes[NVMM_X86_NCONF] = { + [NVMM_X86_CONF_CPUID] = sizeof(struct nvmm_x86_conf_cpuid) +}; + +struct vmx_cpudata { + /* General */ + uint64_t asid; + bool tlb_want_flush; + + /* VMCS */ + struct vmcs *vmcs; + paddr_t vmcs_pa; + size_t vmcs_refcnt; + + /* MSR bitmap */ + uint8_t *msrbm; + paddr_t msrbm_pa; + + /* Host state */ + uint64_t hxcr0; + uint64_t star; + uint64_t lstar; + uint64_t cstar; + uint64_t sfmask; + uint64_t kernelgsbase; + bool ts_set; + struct xsave_header hfpu __aligned(64); + + /* Event state */ + bool int_window_exit; + bool nmi_window_exit; + + /* Guest state */ + struct msr_entry *gmsr; + paddr_t gmsr_pa; + uint64_t gcr2; + uint64_t gcr8; + uint64_t gxcr0; + uint64_t gprs[NVMM_X64_NGPR]; + uint64_t drs[NVMM_X64_NDR]; + uint64_t tsc_offset; + struct xsave_header gfpu __aligned(64); +}; + +static const struct { + uint16_t selector; + uint16_t attrib; + uint32_t limit; + uint64_t base; +} vmx_guest_segs[NVMM_X64_NSEG] = { + [NVMM_X64_SEG_ES] = { + VMCS_GUEST_ES_SELECTOR, + VMCS_GUEST_ES_ACCESS_RIGHTS, + VMCS_GUEST_ES_LIMIT, + VMCS_GUEST_ES_BASE + }, + [NVMM_X64_SEG_CS] = { + VMCS_GUEST_CS_SELECTOR, + VMCS_GUEST_CS_ACCESS_RIGHTS, + VMCS_GUEST_CS_LIMIT, + VMCS_GUEST_CS_BASE + }, + [NVMM_X64_SEG_SS] = { + VMCS_GUEST_SS_SELECTOR, + VMCS_GUEST_SS_ACCESS_RIGHTS, + VMCS_GUEST_SS_LIMIT, + VMCS_GUEST_SS_BASE + }, + [NVMM_X64_SEG_DS] = { + VMCS_GUEST_DS_SELECTOR, + VMCS_GUEST_DS_ACCESS_RIGHTS, + VMCS_GUEST_DS_LIMIT, + VMCS_GUEST_DS_BASE + }, + [NVMM_X64_SEG_FS] = { + VMCS_GUEST_FS_SELECTOR, + VMCS_GUEST_FS_ACCESS_RIGHTS, + VMCS_GUEST_FS_LIMIT, + VMCS_GUEST_FS_BASE + }, + [NVMM_X64_SEG_GS] = { + VMCS_GUEST_GS_SELECTOR, + VMCS_GUEST_GS_ACCESS_RIGHTS, + VMCS_GUEST_GS_LIMIT, + VMCS_GUEST_GS_BASE + }, + [NVMM_X64_SEG_GDT] = { + 0, /* doesn't exist */ + 0, /* doesn't exist */ + VMCS_GUEST_GDTR_LIMIT, + VMCS_GUEST_GDTR_BASE + }, + [NVMM_X64_SEG_IDT] = { + 0, /* doesn't exist */ + 0, /* doesn't exist */ + VMCS_GUEST_IDTR_LIMIT, + VMCS_GUEST_IDTR_BASE + }, + [NVMM_X64_SEG_LDT] = { + VMCS_GUEST_LDTR_SELECTOR, + VMCS_GUEST_LDTR_ACCESS_RIGHTS, + VMCS_GUEST_LDTR_LIMIT, + VMCS_GUEST_LDTR_BASE + }, + [NVMM_X64_SEG_TR] = { + VMCS_GUEST_TR_SELECTOR, + VMCS_GUEST_TR_ACCESS_RIGHTS, + VMCS_GUEST_TR_LIMIT, + VMCS_GUEST_TR_BASE + } +}; + +/* -------------------------------------------------------------------------- */ + +static uint64_t +vmx_get_revision(void) +{ + uint64_t msr; + + msr = rdmsr(MSR_IA32_VMX_BASIC); + msr &= IA32_VMX_BASIC_IDENT; + + return msr; +} + +static void +vmx_vmcs_enter(struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + paddr_t oldpa __diagused; + + cpudata->vmcs_refcnt++; + if (cpudata->vmcs_refcnt > 1) { +#ifdef DIAGNOSTIC + KASSERT(kpreempt_disabled()); + vmx_vmptrst(&oldpa); + KASSERT(oldpa == cpudata->vmcs_pa); +#endif + return; + } + + kpreempt_disable(); + +#ifdef DIAGNOSTIC + vmx_vmptrst(&oldpa); + KASSERT(oldpa == 0xFFFFFFFFFFFFFFFF); +#endif + + vmx_vmptrld(&cpudata->vmcs_pa); +} + +static void +vmx_vmcs_leave(struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + paddr_t oldpa __diagused; + + KASSERT(kpreempt_disabled()); + KASSERT(cpudata->vmcs_refcnt > 0); + cpudata->vmcs_refcnt--; + + if (cpudata->vmcs_refcnt > 0) { +#ifdef DIAGNOSTIC + vmx_vmptrst(&oldpa); + KASSERT(oldpa == cpudata->vmcs_pa); +#endif + return; + } + + vmx_vmclear(&cpudata->vmcs_pa); + kpreempt_enable(); +} + +/* -------------------------------------------------------------------------- */ + +static void +vmx_event_waitexit_enable(struct 
nvmm_cpu *vcpu, bool nmi) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + uint64_t ctls1; + + vmx_vmread(VMCS_PROCBASED_CTLS, &ctls1); + + if (nmi) { + // XXX INT_STATE_NMI? + ctls1 |= PROC_CTLS_NMI_WINDOW_EXITING; + cpudata->nmi_window_exit = true; + } else { + ctls1 |= PROC_CTLS_INT_WINDOW_EXITING; + cpudata->int_window_exit = true; + } + + vmx_vmwrite(VMCS_PROCBASED_CTLS, ctls1); +} + +static void +vmx_event_waitexit_disable(struct nvmm_cpu *vcpu, bool nmi) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + uint64_t ctls1; + + vmx_vmread(VMCS_PROCBASED_CTLS, &ctls1); + + if (nmi) { + ctls1 &= ~PROC_CTLS_NMI_WINDOW_EXITING; + cpudata->nmi_window_exit = false; + } else { + ctls1 &= ~PROC_CTLS_INT_WINDOW_EXITING; + cpudata->int_window_exit = false; + } + + vmx_vmwrite(VMCS_PROCBASED_CTLS, ctls1); +} + +static inline int +vmx_event_has_error(uint64_t vector) +{ + switch (vector) { + case 8: /* #DF */ + case 10: /* #TS */ + case 11: /* #NP */ + case 12: /* #SS */ + case 13: /* #GP */ + case 14: /* #PF */ + case 17: /* #AC */ + case 30: /* #SX */ + return 1; + default: + return 0; + } +} + +static int +vmx_vcpu_inject(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_event *event) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + int type = 0, err = 0, ret = 0; + uint64_t info, intstate, rflags; + + if (event->vector >= 256) { + return EINVAL; + } + + vmx_vmcs_enter(vcpu); + + switch (event->type) { + case NVMM_EVENT_INTERRUPT_HW: + type = INTR_INFO_TYPE_EXT_INT; + if (event->vector == 2) { + type = INTR_INFO_TYPE_NMI; + } + vmx_vmread(VMCS_GUEST_INTERRUPTIBILITY, &intstate); + if (type == INTR_INFO_TYPE_NMI) { + if (cpudata->nmi_window_exit) { + ret = EAGAIN; + goto out; + } + vmx_event_waitexit_enable(vcpu, true); + } else { + vmx_vmread(VMCS_GUEST_RFLAGS, &rflags); + if ((rflags & PSL_I) == 0 || + (intstate & (INT_STATE_STI|INT_STATE_MOVSS)) != 0) { + vmx_event_waitexit_enable(vcpu, false); + ret = EAGAIN; + goto out; + } + } + err = 0; + break; + case NVMM_EVENT_INTERRUPT_SW: + ret = EINVAL; + goto out; + case NVMM_EVENT_EXCEPTION: + if (event->vector == 2 || event->vector >= 32) { + ret = EINVAL; + goto out; + } + if (event->vector == 3 || event->vector == 0) { + ret = EINVAL; + goto out; + } + type = INTR_INFO_TYPE_HW_EXC; + err = vmx_event_has_error(event->vector); + break; + default: + ret = EAGAIN; + goto out; + } + + info = + __SHIFTIN(event->vector, INTR_INFO_VECTOR) | + type | + __SHIFTIN(err, INTR_INFO_ERROR) | + __SHIFTIN(1, INTR_INFO_VALID); + vmx_vmwrite(VMCS_ENTRY_INTR_INFO, info); + vmx_vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, event->u.error); + +out: + vmx_vmcs_leave(vcpu); + return ret; +} + +static void +vmx_inject_ud(struct nvmm_machine *mach, struct nvmm_cpu *vcpu) +{ + struct nvmm_event event; + int ret __diagused; + + event.type = NVMM_EVENT_EXCEPTION; + event.vector = 6; + event.u.error = 0; + + ret = vmx_vcpu_inject(mach, vcpu, &event); + KASSERT(ret == 0); +} + +static void +vmx_inject_gp(struct nvmm_machine *mach, struct nvmm_cpu *vcpu) +{ + struct nvmm_event event; + int ret __diagused; + + event.type = NVMM_EVENT_EXCEPTION; + event.vector = 13; + event.u.error = 0; + + ret = vmx_vcpu_inject(mach, vcpu, &event); + KASSERT(ret == 0); +} + +static inline void +vmx_inkernel_advance(void) +{ + uint64_t rip, inslen, intstate; + + /* + * Maybe we should also apply single-stepping and debug exceptions. + * Matters for guest-ring3, because it can execute 'cpuid' under a + * debugger. 
+ */ + vmx_vmread(VMCS_EXIT_INSTRUCTION_LENGTH, &inslen); + vmx_vmread(VMCS_GUEST_RIP, &rip); + vmx_vmwrite(VMCS_GUEST_RIP, rip + inslen); + vmx_vmread(VMCS_GUEST_INTERRUPTIBILITY, &intstate); + vmx_vmwrite(VMCS_GUEST_INTERRUPTIBILITY, + intstate & ~(INT_STATE_STI|INT_STATE_MOVSS)); +} + +static void +vmx_inkernel_handle_cpuid(struct nvmm_cpu *vcpu, uint64_t eax, uint64_t ecx) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + + switch (eax) { + case 0x00000001: + cpudata->gprs[NVMM_X64_GPR_RBX] &= ~CPUID_LOCAL_APIC_ID; + cpudata->gprs[NVMM_X64_GPR_RBX] |= __SHIFTIN(vcpu->cpuid, + CPUID_LOCAL_APIC_ID); + cpudata->gprs[NVMM_X64_GPR_RCX] &= + ~(CPUID2_VMX|CPUID2_SMX|CPUID2_EST|CPUID2_TM2|CPUID2_PDCM| + CPUID2_PCID|CPUID2_DEADLINE); + cpudata->gprs[NVMM_X64_GPR_RDX] &= + ~(CPUID_DS|CPUID_ACPI|CPUID_TM); + break; + case 0x00000005: + case 0x00000006: + cpudata->gprs[NVMM_X64_GPR_RAX] = 0; + cpudata->gprs[NVMM_X64_GPR_RBX] = 0; + cpudata->gprs[NVMM_X64_GPR_RCX] = 0; + cpudata->gprs[NVMM_X64_GPR_RDX] = 0; + break; + case 0x00000007: + cpudata->gprs[NVMM_X64_GPR_RBX] &= ~CPUID_SEF_INVPCID; + cpudata->gprs[NVMM_X64_GPR_RDX] &= + ~(CPUID_SEF_IBRS|CPUID_SEF_STIBP|CPUID_SEF_L1D_FLUSH| + CPUID_SEF_SSBD); + break; + case 0x0000000D: + if (ecx != 0 || vmx_xcr0_mask == 0) { + break; + } + cpudata->gprs[NVMM_X64_GPR_RAX] = vmx_xcr0_mask & 0xFFFFFFFF; + if (cpudata->gxcr0 & XCR0_SSE) { + cpudata->gprs[NVMM_X64_GPR_RBX] = sizeof(struct fxsave); + } else { + cpudata->gprs[NVMM_X64_GPR_RBX] = sizeof(struct save87); + } + cpudata->gprs[NVMM_X64_GPR_RBX] += 64; /* XSAVE header */ + cpudata->gprs[NVMM_X64_GPR_RCX] = sizeof(struct fxsave); + cpudata->gprs[NVMM_X64_GPR_RDX] = vmx_xcr0_mask >> 32; + break; + case 0x40000000: + cpudata->gprs[NVMM_X64_GPR_RBX] = 0; + cpudata->gprs[NVMM_X64_GPR_RCX] = 0; + cpudata->gprs[NVMM_X64_GPR_RDX] = 0; + memcpy(&cpudata->gprs[NVMM_X64_GPR_RBX], "___ ", 4); + memcpy(&cpudata->gprs[NVMM_X64_GPR_RCX], "NVMM", 4); + memcpy(&cpudata->gprs[NVMM_X64_GPR_RDX], " ___", 4); + break; + case 0x80000001: + cpudata->gprs[NVMM_X64_GPR_RDX] &= ~CPUID_RDTSCP; + break; + default: + break; + } +} + +static void +vmx_exit_cpuid(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_exit *exit) +{ + struct vmx_machdata *machdata = mach->machdata; + struct vmx_cpudata *cpudata = vcpu->cpudata; + struct nvmm_x86_conf_cpuid *cpuid; + uint64_t eax, ecx; + u_int descs[4]; + size_t i; + + eax = cpudata->gprs[NVMM_X64_GPR_RAX]; + ecx = cpudata->gprs[NVMM_X64_GPR_RCX]; + x86_cpuid2(eax, ecx, descs); + + cpudata->gprs[NVMM_X64_GPR_RAX] = descs[0]; + cpudata->gprs[NVMM_X64_GPR_RBX] = descs[1]; + cpudata->gprs[NVMM_X64_GPR_RCX] = descs[2]; + cpudata->gprs[NVMM_X64_GPR_RDX] = descs[3]; + + for (i = 0; i < VMX_NCPUIDS; i++) { + cpuid = &machdata->cpuid[i]; + if (!machdata->cpuidpresent[i]) { + continue; + } + if (cpuid->leaf != eax) { + continue; + } + + /* del */ + cpudata->gprs[NVMM_X64_GPR_RAX] &= ~cpuid->del.eax; + cpudata->gprs[NVMM_X64_GPR_RBX] &= ~cpuid->del.ebx; + cpudata->gprs[NVMM_X64_GPR_RCX] &= ~cpuid->del.ecx; + cpudata->gprs[NVMM_X64_GPR_RDX] &= ~cpuid->del.edx; + + /* set */ + cpudata->gprs[NVMM_X64_GPR_RAX] |= cpuid->set.eax; + cpudata->gprs[NVMM_X64_GPR_RBX] |= cpuid->set.ebx; + cpudata->gprs[NVMM_X64_GPR_RCX] |= cpuid->set.ecx; + cpudata->gprs[NVMM_X64_GPR_RDX] |= cpuid->set.edx; + + break; + } + + /* Overwrite non-tunable leaves. 
*/ + vmx_inkernel_handle_cpuid(vcpu, eax, ecx); + + vmx_inkernel_advance(); + exit->reason = NVMM_EXIT_NONE; +} + +static void +vmx_exit_hlt(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_exit *exit) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + uint64_t rflags; + + if (cpudata->int_window_exit) { + vmx_vmread(VMCS_GUEST_RFLAGS, &rflags); + if (rflags & PSL_I) { + vmx_event_waitexit_disable(vcpu, false); + } + } + + vmx_inkernel_advance(); + exit->reason = NVMM_EXIT_HALTED; +} + +#define VMX_QUAL_CR_NUM __BITS(3,0) +#define VMX_QUAL_CR_TYPE __BITS(5,4) +#define CR_TYPE_WRITE 0 +#define CR_TYPE_READ 1 +#define CR_TYPE_CLTS 2 +#define CR_TYPE_LMSW 3 +#define VMX_QUAL_CR_LMSW_OPMEM __BIT(6) +#define VMX_QUAL_CR_GPR __BITS(11,8) +#define VMX_QUAL_CR_LMSW_SRC __BITS(31,16) + +static inline int +vmx_check_cr(uint64_t crval, uint64_t fixed0, uint64_t fixed1) +{ + /* Bits set to 1 in fixed0 are fixed to 1. */ + if ((crval & fixed0) != fixed0) { + return -1; + } + /* Bits set to 0 in fixed1 are fixed to 0. */ + if (crval & ~fixed1) { + return -1; + } + return 0; +} + +static int +vmx_inkernel_handle_cr0(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + uint64_t qual) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + uint64_t type, gpr, cr0; + + type = __SHIFTOUT(qual, VMX_QUAL_CR_TYPE); + if (type != CR_TYPE_WRITE) { + return -1; + } + + gpr = __SHIFTOUT(qual, VMX_QUAL_CR_GPR); + KASSERT(gpr < 16); + + if (gpr == NVMM_X64_GPR_RSP) { + vmx_vmread(VMCS_GUEST_RSP, &gpr); + } else { + gpr = cpudata->gprs[gpr]; + } + + cr0 = gpr | CR0_NE | CR0_ET; + cr0 &= ~(CR0_NW|CR0_CD); + + if (vmx_check_cr(cr0, vmx_cr0_fixed0, vmx_cr0_fixed1) == -1) { + return -1; + } + + vmx_vmwrite(VMCS_GUEST_CR0, cr0); + vmx_inkernel_advance(); + return 0; +} + +static int +vmx_inkernel_handle_cr4(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + uint64_t qual) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + uint64_t type, gpr, cr4; + + type = __SHIFTOUT(qual, VMX_QUAL_CR_TYPE); + if (type != CR_TYPE_WRITE) { + return -1; + } + + gpr = __SHIFTOUT(qual, VMX_QUAL_CR_GPR); + KASSERT(gpr < 16); + + if (gpr == NVMM_X64_GPR_RSP) { + vmx_vmread(VMCS_GUEST_RSP, &gpr); + } else { + gpr = cpudata->gprs[gpr]; + } + + cr4 = gpr | CR4_VMXE; + + if (vmx_check_cr(cr4, vmx_cr4_fixed0, vmx_cr4_fixed1) == -1) { + return -1; + } + + vmx_vmwrite(VMCS_GUEST_CR4, cr4); + vmx_inkernel_advance(); + return 0; +} + +static int +vmx_inkernel_handle_cr8(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + uint64_t qual) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + uint64_t type, gpr; + bool write; + + type = __SHIFTOUT(qual, VMX_QUAL_CR_TYPE); + if (type == CR_TYPE_WRITE) { + write = true; + } else if (type == CR_TYPE_READ) { + write = false; + } else { + return -1; + } + + gpr = __SHIFTOUT(qual, VMX_QUAL_CR_GPR); + KASSERT(gpr < 16); + + if (write) { + if (gpr == NVMM_X64_GPR_RSP) { + vmx_vmread(VMCS_GUEST_RSP, &cpudata->gcr8); + } else { + cpudata->gcr8 = cpudata->gprs[gpr]; + } + } else { + if (gpr == NVMM_X64_GPR_RSP) { + vmx_vmwrite(VMCS_GUEST_RSP, cpudata->gcr8); + } else { + cpudata->gprs[gpr] = cpudata->gcr8; + } + } + + vmx_inkernel_advance(); + return 0; +} + +static void +vmx_exit_cr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_exit *exit) +{ + uint64_t qual; + int ret; + + vmx_vmread(VMCS_EXIT_QUALIFICATION, &qual); + + switch (__SHIFTOUT(qual, VMX_QUAL_CR_NUM)) { + case 0: + ret = vmx_inkernel_handle_cr0(mach, vcpu, qual); + break; + case 4: + ret = vmx_inkernel_handle_cr4(mach, vcpu, 
qual); + break; + case 8: + ret = vmx_inkernel_handle_cr8(mach, vcpu, qual); + break; + default: + ret = -1; + break; + } + + if (ret == -1) { + vmx_inject_gp(mach, vcpu); + } + + exit->reason = NVMM_EXIT_NONE; +} + +#define VMX_QUAL_IO_SIZE __BITS(2,0) +#define IO_SIZE_8 0 +#define IO_SIZE_16 1 +#define IO_SIZE_32 3 +#define VMX_QUAL_IO_IN __BIT(3) +#define VMX_QUAL_IO_STR __BIT(4) +#define VMX_QUAL_IO_REP __BIT(5) +#define VMX_QUAL_IO_DX __BIT(6) +#define VMX_QUAL_IO_PORT __BITS(31,16) + +#define VMX_INFO_IO_ADRSIZE __BITS(9,7) +#define IO_ADRSIZE_16 0 +#define IO_ADRSIZE_32 1 +#define IO_ADRSIZE_64 2 +#define VMX_INFO_IO_SEG __BITS(17,15) + +static const int seg_to_nvmm[] = { + [0] = NVMM_X64_SEG_ES, + [1] = NVMM_X64_SEG_CS, + [2] = NVMM_X64_SEG_SS, + [3] = NVMM_X64_SEG_DS, + [4] = NVMM_X64_SEG_FS, + [5] = NVMM_X64_SEG_GS +}; + +static void +vmx_exit_io(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_exit *exit) +{ + uint64_t qual, info, inslen, rip; + + vmx_vmread(VMCS_EXIT_QUALIFICATION, &qual); + vmx_vmread(VMCS_EXIT_INSTRUCTION_INFO, &info); + + exit->reason = NVMM_EXIT_IO; + + if (qual & VMX_QUAL_IO_IN) { + exit->u.io.type = NVMM_EXIT_IO_IN; + } else { + exit->u.io.type = NVMM_EXIT_IO_OUT; + } + + exit->u.io.port = __SHIFTOUT(qual, VMX_QUAL_IO_PORT); + + KASSERT(__SHIFTOUT(info, VMX_INFO_IO_SEG) < 6); + exit->u.io.seg = seg_to_nvmm[__SHIFTOUT(info, VMX_INFO_IO_SEG)]; + + if (__SHIFTOUT(info, VMX_INFO_IO_ADRSIZE) == IO_ADRSIZE_64) { + exit->u.io.address_size = 8; + } else if (__SHIFTOUT(info, VMX_INFO_IO_ADRSIZE) == IO_ADRSIZE_32) { + exit->u.io.address_size = 4; + } else if (__SHIFTOUT(info, VMX_INFO_IO_ADRSIZE) == IO_ADRSIZE_16) { + exit->u.io.address_size = 2; + } + + if (__SHIFTOUT(qual, VMX_QUAL_IO_SIZE) == IO_SIZE_32) { + exit->u.io.operand_size = 4; + } else if (__SHIFTOUT(qual, VMX_QUAL_IO_SIZE) == IO_SIZE_16) { + exit->u.io.operand_size = 2; + } else if (__SHIFTOUT(qual, VMX_QUAL_IO_SIZE) == IO_SIZE_8) { + exit->u.io.operand_size = 1; + } + + exit->u.io.rep = (qual & VMX_QUAL_IO_REP) != 0; + exit->u.io.str = (qual & VMX_QUAL_IO_STR) != 0; + + if ((exit->u.io.type == NVMM_EXIT_IO_IN) && exit->u.io.str) { + exit->u.io.seg = NVMM_X64_SEG_ES; + } + + vmx_vmread(VMCS_EXIT_INSTRUCTION_LENGTH, &inslen); + vmx_vmread(VMCS_GUEST_RIP, &rip); + exit->u.io.npc = rip + inslen; +} + +static const uint64_t msr_ignore_list[] = { + MSR_BIOS_SIGN, + MSR_IA32_PLATFORM_ID +}; + +static bool +vmx_inkernel_handle_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_exit *exit) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + uint64_t val; + size_t i; + + switch (exit->u.msr.type) { + case NVMM_EXIT_MSR_RDMSR: + if (exit->u.msr.msr == MSR_CR_PAT) { + vmx_vmread(VMCS_GUEST_IA32_PAT, &val); + cpudata->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF); + cpudata->gprs[NVMM_X64_GPR_RDX] = (val >> 32); + goto handled; + } + for (i = 0; i < __arraycount(msr_ignore_list); i++) { + if (msr_ignore_list[i] != exit->u.msr.msr) + continue; + val = 0; + cpudata->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF); + cpudata->gprs[NVMM_X64_GPR_RDX] = (val >> 32); + goto handled; + } + break; + case NVMM_EXIT_MSR_WRMSR: + if (exit->u.msr.msr == MSR_CR_PAT) { + vmx_vmwrite(VMCS_GUEST_IA32_PAT, exit->u.msr.val); + goto handled; + } + for (i = 0; i < __arraycount(msr_ignore_list); i++) { + if (msr_ignore_list[i] != exit->u.msr.msr) + continue; + goto handled; + } + break; + } + + return false; + +handled: + vmx_inkernel_advance(); + return true; +} + +static void +vmx_exit_msr(struct 
nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_exit *exit, bool rdmsr) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + uint64_t inslen, rip; + + if (rdmsr) { + exit->u.msr.type = NVMM_EXIT_MSR_RDMSR; + } else { + exit->u.msr.type = NVMM_EXIT_MSR_WRMSR; + } + + exit->u.msr.msr = (cpudata->gprs[NVMM_X64_GPR_RCX] & 0xFFFFFFFF); + + if (rdmsr) { + exit->u.msr.val = 0; + } else { + uint64_t rdx, rax; + rdx = cpudata->gprs[NVMM_X64_GPR_RDX]; + rax = cpudata->gprs[NVMM_X64_GPR_RAX]; + exit->u.msr.val = (rdx << 32) | (rax & 0xFFFFFFFF); + } + + if (vmx_inkernel_handle_msr(mach, vcpu, exit)) { + exit->reason = NVMM_EXIT_NONE; + return; + } + + exit->reason = NVMM_EXIT_MSR; + vmx_vmread(VMCS_EXIT_INSTRUCTION_LENGTH, &inslen); + vmx_vmread(VMCS_GUEST_RIP, &rip); + exit->u.msr.npc = rip + inslen; +} + +static void +vmx_exit_xsetbv(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_exit *exit) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + uint64_t val; + uint64_t ss; + + exit->reason = NVMM_EXIT_NONE; + + val = (cpudata->gprs[NVMM_X64_GPR_RDX] << 32) | + (cpudata->gprs[NVMM_X64_GPR_RAX] & 0xFFFFFFFF); + + vmx_vmread(VMCS_GUEST_SS_SELECTOR, &ss); + + if (__predict_false(cpudata->gprs[NVMM_X64_GPR_RCX] != 0)) { + goto error; + } else if (__predict_false((ss & SEL_UPL) != 0)) { + goto error; + } else if (__predict_false((val & ~vmx_xcr0_mask) != 0)) { + goto error; + } else if (__predict_false((val & XCR0_X87) == 0)) { + goto error; + } + + cpudata->gxcr0 = val; + + vmx_inkernel_advance(); + return; + +error: + vmx_inject_gp(mach, vcpu); +} + +#define VMX_EPT_VIOLATION_READ __BIT(0) +#define VMX_EPT_VIOLATION_WRITE __BIT(1) +#define VMX_EPT_VIOLATION_EXECUTE __BIT(2) + +static void +vmx_exit_epf(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_exit *exit) +{ + uint64_t perm; + gpaddr_t gpa; + int error; + + vmx_vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &gpa); + + error = uvm_fault(&mach->vm->vm_map, gpa, VM_PROT_ALL); + + if (error) { + exit->reason = NVMM_EXIT_MEMORY; + vmx_vmread(VMCS_EXIT_QUALIFICATION, &perm); + if (perm & VMX_EPT_VIOLATION_WRITE) + exit->u.mem.perm = NVMM_EXIT_MEMORY_WRITE; + else if (perm & VMX_EPT_VIOLATION_EXECUTE) + exit->u.mem.perm = NVMM_EXIT_MEMORY_EXEC; + else + exit->u.mem.perm = NVMM_EXIT_MEMORY_READ; + exit->u.mem.gpa = gpa; + exit->u.mem.inst_len = 0; + } else { + exit->reason = NVMM_EXIT_NONE; + } +} + +static void +vmx_vcpu_guest_fpu_enter(struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + + cpudata->ts_set = (rcr0() & CR0_TS) != 0; + + fpu_area_save(&cpudata->hfpu, vmx_xcr0_mask); + fpu_area_restore(&cpudata->gfpu, vmx_xcr0_mask); + + if (vmx_xcr0_mask != 0) { + cpudata->hxcr0 = rdxcr(0); + wrxcr(0, cpudata->gxcr0); + } +} + +static void +vmx_vcpu_guest_fpu_leave(struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + + if (vmx_xcr0_mask != 0) { + cpudata->gxcr0 = rdxcr(0); + wrxcr(0, cpudata->hxcr0); + } + + fpu_area_save(&cpudata->gfpu, vmx_xcr0_mask); + fpu_area_restore(&cpudata->hfpu, vmx_xcr0_mask); + + if (cpudata->ts_set) { + stts(); + } +} + +static void +vmx_vcpu_guest_dbregs_enter(struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + + x86_dbregs_save(curlwp); + + ldr7(0); + + ldr0(cpudata->drs[NVMM_X64_DR_DR0]); + ldr1(cpudata->drs[NVMM_X64_DR_DR1]); + ldr2(cpudata->drs[NVMM_X64_DR_DR2]); + ldr3(cpudata->drs[NVMM_X64_DR_DR3]); + ldr6(cpudata->drs[NVMM_X64_DR_DR6]); +} + +static void +vmx_vcpu_guest_dbregs_leave(struct nvmm_cpu *vcpu) +{ + struct 
vmx_cpudata *cpudata = vcpu->cpudata; + + cpudata->drs[NVMM_X64_DR_DR0] = rdr0(); + cpudata->drs[NVMM_X64_DR_DR1] = rdr1(); + cpudata->drs[NVMM_X64_DR_DR2] = rdr2(); + cpudata->drs[NVMM_X64_DR_DR3] = rdr3(); + cpudata->drs[NVMM_X64_DR_DR6] = rdr6(); + + x86_dbregs_restore(curlwp); +} + +static void +vmx_vcpu_guest_misc_enter(struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + + /* This gets restored automatically by the CPU. */ + vmx_vmwrite(VMCS_HOST_FS_BASE, rdmsr(MSR_FSBASE)); + vmx_vmwrite(VMCS_HOST_CR3, rcr3()); + vmx_vmwrite(VMCS_HOST_CR4, rcr4()); + + /* Note: MSR_LSTAR is not static, because of SVS. */ + cpudata->lstar = rdmsr(MSR_LSTAR); + cpudata->kernelgsbase = rdmsr(MSR_KERNELGSBASE); +} + +static void +vmx_vcpu_guest_misc_leave(struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + + wrmsr(MSR_STAR, cpudata->star); + wrmsr(MSR_LSTAR, cpudata->lstar); + wrmsr(MSR_CSTAR, cpudata->cstar); + wrmsr(MSR_SFMASK, cpudata->sfmask); + wrmsr(MSR_KERNELGSBASE, cpudata->kernelgsbase); +} + +#define VMX_INVVPID_ADDRESS 0 +#define VMX_INVVPID_CONTEXT 1 +#define VMX_INVVPID_ALL 2 +#define VMX_INVVPID_CONTEXT_NOGLOBAL 3 + +#define VMX_INVEPT_CONTEXT 1 +#define VMX_INVEPT_ALL 2 + +static int +vmx_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_exit *exit) +{ + struct vmx_machdata *machdata = mach->machdata; + struct vmx_cpudata *cpudata = vcpu->cpudata; + bool tlb_need_flush = false; + struct vpid_desc vpid_desc; + struct ept_desc ept_desc; + struct cpu_info *ci; + uint64_t exitcode; + uint64_t intstate; + int hcpu, s, ret; + bool launched = false; + + vmx_vmcs_enter(vcpu); + ci = curcpu(); + hcpu = cpu_number(); + + if (__predict_false(kcpuset_isset(machdata->ept_want_flush, hcpu))) { + vmx_vmread(VMCS_EPTP, &ept_desc.eptp); + ept_desc.mbz = 0; + vmx_invept(vmx_ept_flush_op, &ept_desc); + kcpuset_clear(machdata->ept_want_flush, hcpu); + } + + if (vcpu->hcpu_last != hcpu) { + tlb_need_flush = true; + } + + if (vcpu->hcpu_last != hcpu) { + vmx_vmwrite(VMCS_HOST_TR_SELECTOR, ci->ci_tss_sel); + vmx_vmwrite(VMCS_HOST_TR_BASE, (uint64_t)ci->ci_tss); + vmx_vmwrite(VMCS_HOST_GDTR_BASE, (uint64_t)ci->ci_gdt); + vmx_vmwrite(VMCS_HOST_GS_BASE, rdmsr(MSR_GSBASE)); + vmx_vmwrite(VMCS_TSC_OFFSET, cpudata->tsc_offset + + curcpu()->ci_data.cpu_cc_skew); + vcpu->hcpu_last = hcpu; + } + + vmx_vcpu_guest_dbregs_enter(vcpu); + vmx_vcpu_guest_misc_enter(vcpu); + + while (1) { + if (cpudata->tlb_want_flush || tlb_need_flush) { + vpid_desc.vpid = cpudata->asid; + vpid_desc.addr = 0; + vmx_invvpid(vmx_tlb_flush_op, &vpid_desc); + cpudata->tlb_want_flush = false; + tlb_need_flush = false; + } + + s = splhigh(); + vmx_vcpu_guest_fpu_enter(vcpu); + lcr2(cpudata->gcr2); + if (launched) { + ret = vmx_vmresume(cpudata->gprs); + } else { + ret = vmx_vmlaunch(cpudata->gprs); + } + cpudata->gcr2 = rcr2(); + vmx_vcpu_guest_fpu_leave(vcpu); + splx(s); + + if (__predict_false(ret != 0)) { + exit->reason = NVMM_EXIT_INVALID; + break; + } + + launched = true; + + vmx_vmread(VMCS_EXIT_REASON, &exitcode); + exitcode &= __BITS(15,0); + + switch (exitcode) { + case VMCS_EXITCODE_EXT_INT: + exit->reason = NVMM_EXIT_NONE; + break; + case VMCS_EXITCODE_CPUID: + vmx_exit_cpuid(mach, vcpu, exit); + break; + case VMCS_EXITCODE_HLT: + vmx_exit_hlt(mach, vcpu, exit); + break; + case VMCS_EXITCODE_CR: + vmx_exit_cr(mach, vcpu, exit); + break; + case VMCS_EXITCODE_IO: + vmx_exit_io(mach, vcpu, exit); + break; + case VMCS_EXITCODE_RDMSR: + vmx_exit_msr(mach, vcpu, exit, true); + 
break; + case VMCS_EXITCODE_WRMSR: + vmx_exit_msr(mach, vcpu, exit, false); + break; + case VMCS_EXITCODE_SHUTDOWN: + exit->reason = NVMM_EXIT_SHUTDOWN; + break; + case VMCS_EXITCODE_MONITOR: + exit->reason = NVMM_EXIT_MONITOR; + break; + case VMCS_EXITCODE_MWAIT: + exit->reason = NVMM_EXIT_MWAIT; + break; + case VMCS_EXITCODE_XSETBV: + vmx_exit_xsetbv(mach, vcpu, exit); + break; + case VMCS_EXITCODE_RDPMC: + case VMCS_EXITCODE_RDTSCP: + case VMCS_EXITCODE_INVVPID: + case VMCS_EXITCODE_INVEPT: + case VMCS_EXITCODE_VMCALL: + case VMCS_EXITCODE_VMCLEAR: + case VMCS_EXITCODE_VMLAUNCH: + case VMCS_EXITCODE_VMPTRLD: + case VMCS_EXITCODE_VMPTRST: + case VMCS_EXITCODE_VMREAD: + case VMCS_EXITCODE_VMRESUME: + case VMCS_EXITCODE_VMWRITE: + case VMCS_EXITCODE_VMXOFF: + case VMCS_EXITCODE_VMXON: + vmx_inject_ud(mach, vcpu); + exit->reason = NVMM_EXIT_NONE; + break; + case VMCS_EXITCODE_EPT_VIOLATION: + vmx_exit_epf(mach, vcpu, exit); + break; + case VMCS_EXITCODE_INT_WINDOW: + vmx_event_waitexit_disable(vcpu, false); + exit->reason = NVMM_EXIT_INT_READY; + break; + case VMCS_EXITCODE_NMI_WINDOW: + vmx_event_waitexit_disable(vcpu, true); + exit->reason = NVMM_EXIT_NMI_READY; + break; + default: + exit->reason = NVMM_EXIT_INVALID; + break; + } + + /* If no reason to return to userland, keep rolling. */ + if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) { + break; + } + if (curcpu()->ci_data.cpu_softints != 0) { + break; + } + if (curlwp->l_flag & LW_USERRET) { + break; + } + if (exit->reason != NVMM_EXIT_NONE) { + break; + } + } + + vmx_vcpu_guest_misc_leave(vcpu); + vmx_vcpu_guest_dbregs_leave(vcpu); + + exit->exitstate[NVMM_X64_EXITSTATE_CR8] = cpudata->gcr8; + vmx_vmread(VMCS_GUEST_RFLAGS, + &exit->exitstate[NVMM_X64_EXITSTATE_RFLAGS]); + vmx_vmread(VMCS_GUEST_INTERRUPTIBILITY, &intstate); + exit->exitstate[NVMM_X64_EXITSTATE_INT_SHADOW] = + (intstate & (INT_STATE_STI|INT_STATE_MOVSS)) != 0; + exit->exitstate[NVMM_X64_EXITSTATE_INT_WINDOW_EXIT] = + cpudata->int_window_exit; + exit->exitstate[NVMM_X64_EXITSTATE_NMI_WINDOW_EXIT] = + cpudata->nmi_window_exit; + + vmx_vmcs_leave(vcpu); + + return 0; +} + +/* -------------------------------------------------------------------------- */ + +static int +vmx_memalloc(paddr_t *pa, vaddr_t *va, size_t npages) +{ + struct pglist pglist; + paddr_t _pa; + vaddr_t _va; + size_t i; + int ret; + + ret = uvm_pglistalloc(npages * PAGE_SIZE, 0, ~0UL, PAGE_SIZE, 0, + &pglist, 1, 0); + if (ret != 0) + return ENOMEM; + _pa = TAILQ_FIRST(&pglist)->phys_addr; + _va = uvm_km_alloc(kernel_map, npages * PAGE_SIZE, 0, + UVM_KMF_VAONLY | UVM_KMF_NOWAIT); + if (_va == 0) + goto error; + + for (i = 0; i < npages; i++) { + pmap_kenter_pa(_va + i * PAGE_SIZE, _pa + i * PAGE_SIZE, + VM_PROT_READ | VM_PROT_WRITE, PMAP_WRITE_BACK); + } + pmap_update(pmap_kernel()); + + memset((void *)_va, 0, npages * PAGE_SIZE); + + *pa = _pa; + *va = _va; + return 0; + +error: + for (i = 0; i < npages; i++) { + uvm_pagefree(PHYS_TO_VM_PAGE(_pa + i * PAGE_SIZE)); + } + return ENOMEM; +} + +static void +vmx_memfree(paddr_t pa, vaddr_t va, size_t npages) +{ + size_t i; + + pmap_kremove(va, npages * PAGE_SIZE); + pmap_update(pmap_kernel()); + uvm_km_free(kernel_map, va, npages * PAGE_SIZE, UVM_KMF_VAONLY); + for (i = 0; i < npages; i++) { + uvm_pagefree(PHYS_TO_VM_PAGE(pa + i * PAGE_SIZE)); + } +} + +/* -------------------------------------------------------------------------- */ + +static void +vmx_asid_alloc(struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + size_t i, 
oct, bit; + + mutex_enter(&vmx_asidlock); + + for (i = 0; i < vmx_maxasid; i++) { + oct = i / 8; + bit = i % 8; + + if (vmx_asidmap[oct] & __BIT(bit)) { + continue; + } + + cpudata->asid = i; + + vmx_asidmap[oct] |= __BIT(bit); + vmx_vmwrite(VMCS_VPID, i); + mutex_exit(&vmx_asidlock); + return; + } + + mutex_exit(&vmx_asidlock); + + panic("%s: impossible", __func__); +} + +static void +vmx_asid_free(struct nvmm_cpu *vcpu) +{ + size_t oct, bit; + uint64_t asid; + + vmx_vmread(VMCS_VPID, &asid); + + oct = asid / 8; + bit = asid % 8; + + mutex_enter(&vmx_asidlock); + vmx_asidmap[oct] &= ~__BIT(bit); + mutex_exit(&vmx_asidlock); +} + +static void +vmx_init_asid(uint32_t maxasid) +{ + size_t allocsz; + + mutex_init(&vmx_asidlock, MUTEX_DEFAULT, IPL_NONE); + + vmx_maxasid = maxasid; + allocsz = roundup(maxasid, 8) / 8; + vmx_asidmap = kmem_zalloc(allocsz, KM_SLEEP); + + /* ASID 0 is reserved for the host. */ + vmx_asidmap[0] |= __BIT(0); +} + +static void +vmx_vcpu_msr_allow(uint8_t *bitmap, uint64_t msr, bool read, bool write) +{ + uint64_t byte; + uint8_t bitoff; + + if (msr < 0x00002000) { + /* Range 1 */ + byte = ((msr - 0x00000000) / 8) + 0; + } else if (msr >= 0xC0000000 && msr < 0xC0002000) { + /* Range 2 */ + byte = ((msr - 0xC0000000) / 8) + 1024; + } else { + panic("%s: wrong range", __func__); + } + + bitoff = (msr & 0x7); + + if (read) { + bitmap[byte] &= ~__BIT(bitoff); + } + if (write) { + bitmap[2048 + byte] &= ~__BIT(bitoff); + } +} + +static void +vmx_vcpu_init(struct nvmm_machine *mach, struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + struct vmcs *vmcs = cpudata->vmcs; + struct msr_entry *gmsr = cpudata->gmsr; + extern uint8_t vmx_resume_rip; + uint64_t rev, eptp; + + rev = vmx_get_revision(); + + memset(vmcs, 0, VMCS_SIZE); + vmcs->ident = __SHIFTIN(rev, VMCS_IDENT_REVISION); + vmcs->abort = 0; + + vmx_vmcs_enter(vcpu); + + /* No link pointer. */ + vmx_vmwrite(VMCS_LINK_POINTER, 0xFFFFFFFFFFFFFFFF); + + /* Install the CTLSs. */ + vmx_vmwrite(VMCS_PINBASED_CTLS, vmx_pinbased_ctls); + vmx_vmwrite(VMCS_PROCBASED_CTLS, vmx_procbased_ctls); + vmx_vmwrite(VMCS_PROCBASED_CTLS2, vmx_procbased_ctls2); + vmx_vmwrite(VMCS_ENTRY_CTLS, vmx_entry_ctls); + vmx_vmwrite(VMCS_EXIT_CTLS, vmx_exit_ctls); + + /* Allow direct access to certain MSRs. */ + memset(cpudata->msrbm, 0xFF, MSRBM_SIZE); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_EFER, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_STAR, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_LSTAR, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_CSTAR, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_SFMASK, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_KERNELGSBASE, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_CS, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_ESP, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_EIP, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_FSBASE, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_GSBASE, true, true); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_TSC, true, false); + vmx_vcpu_msr_allow(cpudata->msrbm, MSR_IA32_ARCH_CAPABILITIES, + true, false); + vmx_vmwrite(VMCS_MSR_BITMAP, (uint64_t)cpudata->msrbm_pa); + + /* + * List of Guest MSRs loaded on VMENTRY, saved on VMEXIT. This + * includes the L1D_FLUSH MSR, to mitigate L1TF. 
+ */ + gmsr[VMX_MSRLIST_STAR].msr = MSR_STAR; + gmsr[VMX_MSRLIST_STAR].val = 0; + gmsr[VMX_MSRLIST_LSTAR].msr = MSR_LSTAR; + gmsr[VMX_MSRLIST_LSTAR].val = 0; + gmsr[VMX_MSRLIST_CSTAR].msr = MSR_CSTAR; + gmsr[VMX_MSRLIST_CSTAR].val = 0; + gmsr[VMX_MSRLIST_SFMASK].msr = MSR_SFMASK; + gmsr[VMX_MSRLIST_SFMASK].val = 0; + gmsr[VMX_MSRLIST_KERNELGSBASE].msr = MSR_KERNELGSBASE; + gmsr[VMX_MSRLIST_KERNELGSBASE].val = 0; + gmsr[VMX_MSRLIST_L1DFLUSH].msr = MSR_IA32_FLUSH_CMD; + gmsr[VMX_MSRLIST_L1DFLUSH].val = IA32_FLUSH_CMD_L1D_FLUSH; + vmx_vmwrite(VMCS_ENTRY_MSR_LOAD_ADDRESS, cpudata->gmsr_pa); + vmx_vmwrite(VMCS_EXIT_MSR_STORE_ADDRESS, cpudata->gmsr_pa); + vmx_vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, vmx_msrlist_entry_nmsr); + vmx_vmwrite(VMCS_EXIT_MSR_STORE_COUNT, VMX_MSRLIST_EXIT_NMSR); + + /* Force CR0_NW and CR0_CD to zero, CR0_ET to one. */ + vmx_vmwrite(VMCS_CR0_MASK, CR0_NW|CR0_CD); + vmx_vmwrite(VMCS_CR0_SHADOW, CR0_ET); + + /* Force CR4_VMXE to zero. */ + vmx_vmwrite(VMCS_CR4_MASK, CR4_VMXE); + + /* Set the Host state for resuming. */ + vmx_vmwrite(VMCS_HOST_RIP, (uint64_t)&vmx_resume_rip); + vmx_vmwrite(VMCS_HOST_CS_SELECTOR, GSEL(GCODE_SEL, SEL_KPL)); + vmx_vmwrite(VMCS_HOST_SS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)); + vmx_vmwrite(VMCS_HOST_DS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)); + vmx_vmwrite(VMCS_HOST_ES_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)); + vmx_vmwrite(VMCS_HOST_FS_SELECTOR, 0); + vmx_vmwrite(VMCS_HOST_GS_SELECTOR, 0); + vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_CS, 0); + vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_ESP, 0); + vmx_vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, 0); + vmx_vmwrite(VMCS_HOST_IDTR_BASE, (uint64_t)idt); + vmx_vmwrite(VMCS_HOST_IA32_PAT, rdmsr(MSR_CR_PAT)); + vmx_vmwrite(VMCS_HOST_IA32_EFER, rdmsr(MSR_EFER)); + vmx_vmwrite(VMCS_HOST_CR0, rcr0()); + + /* Generate ASID. */ + vmx_asid_alloc(vcpu); + + /* Enable Extended Paging, 4-Level. */ + eptp = + __SHIFTIN(vmx_eptp_type, EPTP_TYPE) | + __SHIFTIN(4-1, EPTP_WALKLEN) | + EPTP_FLAGS_AD | + mach->vm->vm_map.pmap->pm_pdirpa[0]; + vmx_vmwrite(VMCS_EPTP, eptp); + + /* Must always be set. */ + vmx_vmwrite(VMCS_GUEST_CR4, CR4_VMXE); + vmx_vmwrite(VMCS_GUEST_CR0, CR0_NE); + cpudata->gxcr0 = XCR0_X87; + + /* Init XSAVE header. */ + cpudata->gfpu.xsh_xstate_bv = vmx_xcr0_mask; + cpudata->gfpu.xsh_xcomp_bv = 0; + + /* Bluntly hide the host TSC. */ + cpudata->tsc_offset = rdtsc(); + + /* These MSRs are static. */ + cpudata->star = rdmsr(MSR_STAR); + cpudata->cstar = rdmsr(MSR_CSTAR); + cpudata->sfmask = rdmsr(MSR_SFMASK); + + vmx_vmcs_leave(vcpu); +} + +static int +vmx_vcpu_create(struct nvmm_machine *mach, struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata; + int error; + + /* Allocate the VMX cpudata. */ + cpudata = (struct vmx_cpudata *)uvm_km_alloc(kernel_map, + roundup(sizeof(*cpudata), PAGE_SIZE), 0, + UVM_KMF_WIRED|UVM_KMF_ZERO); + vcpu->cpudata = cpudata; + + /* VMCS */ + error = vmx_memalloc(&cpudata->vmcs_pa, (vaddr_t *)&cpudata->vmcs, + VMCS_NPAGES); + if (error) + goto error; + + /* MSR Bitmap */ + error = vmx_memalloc(&cpudata->msrbm_pa, (vaddr_t *)&cpudata->msrbm, + MSRBM_NPAGES); + if (error) + goto error; + + /* Guest MSR List */ + error = vmx_memalloc(&cpudata->gmsr_pa, (vaddr_t *)&cpudata->gmsr, 1); + if (error) + goto error; + + /* Init the VCPU info. 
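+ * All the per-VCPU pages (VMCS, MSR bitmap, guest MSR list) have been + * allocated by the caller at this point, so the VMCS fields can be filled.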
*/ + vmx_vcpu_init(mach, vcpu); + + return 0; + +error: + if (cpudata->vmcs_pa) { + vmx_memfree(cpudata->vmcs_pa, (vaddr_t)cpudata->vmcs, + VMCS_NPAGES); + } + if (cpudata->msrbm_pa) { + vmx_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm, + MSRBM_NPAGES); + } + if (cpudata->gmsr_pa) { + vmx_memfree(cpudata->gmsr_pa, (vaddr_t)cpudata->gmsr, 1); + } + + kmem_free(cpudata, sizeof(*cpudata)); + return error; +} + +static void +vmx_vcpu_destroy(struct nvmm_machine *mach, struct nvmm_cpu *vcpu) +{ + struct vmx_cpudata *cpudata = vcpu->cpudata; + + vmx_vmcs_enter(vcpu); + vmx_asid_free(vcpu); + vmx_vmcs_leave(vcpu); + + vmx_memfree(cpudata->vmcs_pa, (vaddr_t)cpudata->vmcs, VMCS_NPAGES); + vmx_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm, MSRBM_NPAGES); + vmx_memfree(cpudata->gmsr_pa, (vaddr_t)cpudata->gmsr, 1); + uvm_km_free(kernel_map, (vaddr_t)cpudata, + roundup(sizeof(*cpudata), PAGE_SIZE), UVM_KMF_WIRED); +} + +#define VMX_SEG_ATTRIB_TYPE __BITS(4,0) +#define VMX_SEG_ATTRIB_DPL __BITS(6,5) +#define VMX_SEG_ATTRIB_P __BIT(7) +#define VMX_SEG_ATTRIB_AVL __BIT(12) +#define VMX_SEG_ATTRIB_LONG __BIT(13) +#define VMX_SEG_ATTRIB_DEF32 __BIT(14) +#define VMX_SEG_ATTRIB_GRAN __BIT(15) +#define VMX_SEG_ATTRIB_UNUSABLE __BIT(16) + +static void +vmx_vcpu_setstate_seg(struct nvmm_x64_state_seg *segs, int idx) +{ + uint64_t attrib; + + attrib = + __SHIFTIN(segs[idx].attrib.type, VMX_SEG_ATTRIB_TYPE) | + __SHIFTIN(segs[idx].attrib.dpl, VMX_SEG_ATTRIB_DPL) | + __SHIFTIN(segs[idx].attrib.p, VMX_SEG_ATTRIB_P) | + __SHIFTIN(segs[idx].attrib.avl, VMX_SEG_ATTRIB_AVL) | + __SHIFTIN(segs[idx].attrib.lng, VMX_SEG_ATTRIB_LONG) | + __SHIFTIN(segs[idx].attrib.def32, VMX_SEG_ATTRIB_DEF32) | + __SHIFTIN(segs[idx].attrib.gran, VMX_SEG_ATTRIB_GRAN); + + if (idx != NVMM_X64_SEG_GDT && idx != NVMM_X64_SEG_IDT) { + vmx_vmwrite(vmx_guest_segs[idx].selector, segs[idx].selector); + vmx_vmwrite(vmx_guest_segs[idx].attrib, attrib); + } + vmx_vmwrite(vmx_guest_segs[idx].limit, segs[idx].limit); + vmx_vmwrite(vmx_guest_segs[idx].base, segs[idx].base); +} + +static void +vmx_vcpu_getstate_seg(struct nvmm_x64_state_seg *segs, int idx) +{ + uint64_t attrib = 0; + + if (idx != NVMM_X64_SEG_GDT && idx != NVMM_X64_SEG_IDT) { + vmx_vmread(vmx_guest_segs[idx].selector, &segs[idx].selector); + vmx_vmread(vmx_guest_segs[idx].attrib, &attrib); + } + vmx_vmread(vmx_guest_segs[idx].limit, &segs[idx].limit); + vmx_vmread(vmx_guest_segs[idx].base, &segs[idx].base); + + segs[idx].attrib.type = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_TYPE); + segs[idx].attrib.dpl = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_DPL); + segs[idx].attrib.p = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_P); + segs[idx].attrib.avl = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_AVL); + segs[idx].attrib.lng = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_LONG); + segs[idx].attrib.def32 = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_DEF32); + segs[idx].attrib.gran = __SHIFTOUT(attrib, VMX_SEG_ATTRIB_GRAN); +} + +static inline bool +vmx_state_tlb_flush(struct nvmm_x64_state *state, uint64_t flags) +{ + uint64_t cr0, cr3, cr4, efer; + + if (flags & NVMM_X64_STATE_CRS) { + vmx_vmread(VMCS_GUEST_CR0, &cr0); + if ((cr0 ^ state->crs[NVMM_X64_CR_CR0]) & CR0_TLB_FLUSH) { + return true; + } + vmx_vmread(VMCS_GUEST_CR3, &cr3); + if (cr3 != state->crs[NVMM_X64_CR_CR3]) { + return true; + } + vmx_vmread(VMCS_GUEST_CR4, &cr4); + if ((cr4 ^ state->crs[NVMM_X64_CR_CR4]) & CR4_TLB_FLUSH) { + return true; + } + } + + if (flags & NVMM_X64_STATE_MSRS) { + vmx_vmread(VMCS_GUEST_IA32_EFER, &efer); + if ((efer ^ + state->msrs[NVMM_X64_MSR_EFER]) 
& EFER_TLB_FLUSH) { + return true; + } + } + + return false; +} + +static void +vmx_vcpu_setstate(struct nvmm_cpu *vcpu, void *data, uint64_t flags) +{ + struct nvmm_x64_state *state = (struct nvmm_x64_state *)data; + struct vmx_cpudata *cpudata = vcpu->cpudata; + struct fxsave *fpustate; + uint64_t ctls1, intstate; + + vmx_vmcs_enter(vcpu); + + if (vmx_state_tlb_flush(state, flags)) { + cpudata->tlb_want_flush = true; + } + + if (flags & NVMM_X64_STATE_SEGS) { + vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_CS); + vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_DS); + vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_ES); + vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_FS); + vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_GS); + vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_SS); + vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_GDT); + vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_IDT); + vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_LDT); + vmx_vcpu_setstate_seg(state->segs, NVMM_X64_SEG_TR); + } + + CTASSERT(sizeof(cpudata->gprs) == sizeof(state->gprs)); + if (flags & NVMM_X64_STATE_GPRS) { + memcpy(cpudata->gprs, state->gprs, sizeof(state->gprs)); + + vmx_vmwrite(VMCS_GUEST_RIP, state->gprs[NVMM_X64_GPR_RIP]); + vmx_vmwrite(VMCS_GUEST_RSP, state->gprs[NVMM_X64_GPR_RSP]); + vmx_vmwrite(VMCS_GUEST_RFLAGS, state->gprs[NVMM_X64_GPR_RFLAGS]); + } + + if (flags & NVMM_X64_STATE_CRS) { + /* These bits are mandatory. */ + state->crs[NVMM_X64_CR_CR4] |= CR4_VMXE; + state->crs[NVMM_X64_CR_CR0] |= CR0_NE; + + vmx_vmwrite(VMCS_GUEST_CR0, state->crs[NVMM_X64_CR_CR0]); + cpudata->gcr2 = state->crs[NVMM_X64_CR_CR2]; + vmx_vmwrite(VMCS_GUEST_CR3, state->crs[NVMM_X64_CR_CR3]); // XXX PDPTE? + vmx_vmwrite(VMCS_GUEST_CR4, state->crs[NVMM_X64_CR_CR4]); + cpudata->gcr8 = state->crs[NVMM_X64_CR_CR8]; + + if (vmx_xcr0_mask != 0) { + /* Clear illegal XCR0 bits, set mandatory X87 bit. */ + cpudata->gxcr0 = state->crs[NVMM_X64_CR_XCR0]; + cpudata->gxcr0 &= vmx_xcr0_mask; + cpudata->gxcr0 |= XCR0_X87; + } + } + + CTASSERT(sizeof(cpudata->drs) == sizeof(state->drs)); + if (flags & NVMM_X64_STATE_DRS) { + memcpy(cpudata->drs, state->drs, sizeof(state->drs)); + + cpudata->drs[NVMM_X64_DR_DR6] &= 0xFFFFFFFF; + vmx_vmwrite(VMCS_GUEST_DR7, cpudata->drs[NVMM_X64_DR_DR7]); + } + + if (flags & NVMM_X64_STATE_MSRS) { + cpudata->gmsr[VMX_MSRLIST_STAR].val = + state->msrs[NVMM_X64_MSR_STAR]; + cpudata->gmsr[VMX_MSRLIST_LSTAR].val = + state->msrs[NVMM_X64_MSR_LSTAR]; + cpudata->gmsr[VMX_MSRLIST_CSTAR].val = + state->msrs[NVMM_X64_MSR_CSTAR]; + cpudata->gmsr[VMX_MSRLIST_SFMASK].val = + state->msrs[NVMM_X64_MSR_SFMASK]; + cpudata->gmsr[VMX_MSRLIST_KERNELGSBASE].val = + state->msrs[NVMM_X64_MSR_KERNELGSBASE]; + + vmx_vmwrite(VMCS_GUEST_IA32_EFER, + state->msrs[NVMM_X64_MSR_EFER]); + vmx_vmwrite(VMCS_GUEST_IA32_PAT, + state->msrs[NVMM_X64_MSR_PAT]); + vmx_vmwrite(VMCS_GUEST_IA32_SYSENTER_CS, + state->msrs[NVMM_X64_MSR_SYSENTER_CS]); + vmx_vmwrite(VMCS_GUEST_IA32_SYSENTER_ESP, + state->msrs[NVMM_X64_MSR_SYSENTER_ESP]); + vmx_vmwrite(VMCS_GUEST_IA32_SYSENTER_EIP, + state->msrs[NVMM_X64_MSR_SYSENTER_EIP]); + + /* ENTRY_CTLS_LONG_MODE must match EFER_LMA. */ + vmx_vmread(VMCS_ENTRY_CTLS, &ctls1); + if (state->msrs[NVMM_X64_MSR_EFER] & EFER_LMA) { + ctls1 |= ENTRY_CTLS_LONG_MODE; + } else { + ctls1 &= ~ENTRY_CTLS_LONG_MODE; + } + vmx_vmwrite(VMCS_ENTRY_CTLS, ctls1); + } + + if (flags & NVMM_X64_STATE_MISC) { + // XXX CPL? 
not sure + + vmx_vmread(VMCS_GUEST_INTERRUPTIBILITY, &intstate); + intstate &= ~(INT_STATE_STI|INT_STATE_MOVSS); + if (state->misc[NVMM_X64_MISC_INT_SHADOW]) { + intstate |= INT_STATE_MOVSS; + } + vmx_vmwrite(VMCS_GUEST_INTERRUPTIBILITY, intstate); + + if (state->misc[NVMM_X64_MISC_INT_WINDOW_EXIT]) { + vmx_event_waitexit_enable(vcpu, false); + } else { + vmx_event_waitexit_disable(vcpu, false); + } + + if (state->misc[NVMM_X64_MISC_NMI_WINDOW_EXIT]) { + vmx_event_waitexit_enable(vcpu, true); + } else { + vmx_event_waitexit_disable(vcpu, true); + } + } + + CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(state->fpu)); + if (flags & NVMM_X64_STATE_FPU) { + memcpy(cpudata->gfpu.xsh_fxsave, &state->fpu, + sizeof(state->fpu)); + + fpustate = (struct fxsave *)cpudata->gfpu.xsh_fxsave; + fpustate->fx_mxcsr_mask &= x86_fpu_mxcsr_mask; + fpustate->fx_mxcsr &= fpustate->fx_mxcsr_mask; + + if (vmx_xcr0_mask != 0) { + /* Reset XSTATE_BV, to force a reload. */ + cpudata->gfpu.xsh_xstate_bv = vmx_xcr0_mask; + } + } + + vmx_vmcs_leave(vcpu); +} + +static void +vmx_vcpu_getstate(struct nvmm_cpu *vcpu, void *data, uint64_t flags) +{ + struct nvmm_x64_state *state = (struct nvmm_x64_state *)data; + struct vmx_cpudata *cpudata = vcpu->cpudata; + uint64_t intstate; + + vmx_vmcs_enter(vcpu); + + if (flags & NVMM_X64_STATE_SEGS) { + vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_CS); + vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_DS); + vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_ES); + vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_FS); + vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_GS); + vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_SS); + vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_GDT); + vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_IDT); + vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_LDT); + vmx_vcpu_getstate_seg(state->segs, NVMM_X64_SEG_TR); + } + + CTASSERT(sizeof(cpudata->gprs) == sizeof(state->gprs)); + if (flags & NVMM_X64_STATE_GPRS) { + memcpy(state->gprs, cpudata->gprs, sizeof(state->gprs)); + + vmx_vmread(VMCS_GUEST_RIP, &state->gprs[NVMM_X64_GPR_RIP]); + vmx_vmread(VMCS_GUEST_RSP, &state->gprs[NVMM_X64_GPR_RSP]); + vmx_vmread(VMCS_GUEST_RFLAGS, &state->gprs[NVMM_X64_GPR_RFLAGS]); + } + + if (flags & NVMM_X64_STATE_CRS) { + vmx_vmread(VMCS_GUEST_CR0, &state->crs[NVMM_X64_CR_CR0]); + state->crs[NVMM_X64_CR_CR2] = cpudata->gcr2; + vmx_vmread(VMCS_GUEST_CR3, &state->crs[NVMM_X64_CR_CR3]); + vmx_vmread(VMCS_GUEST_CR4, &state->crs[NVMM_X64_CR_CR4]); + state->crs[NVMM_X64_CR_CR8] = cpudata->gcr8; + state->crs[NVMM_X64_CR_XCR0] = cpudata->gxcr0; + + /* Hide VMXE. 
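+ * CR4_VMXE is forced to one in the guest, and trapped through the CR4 + * mask installed at VCPU init, so it must not appear in the state we + * expose to the virtualizer.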
*/ + state->crs[NVMM_X64_CR_CR4] &= ~CR4_VMXE; + } + + CTASSERT(sizeof(cpudata->drs) == sizeof(state->drs)); + if (flags & NVMM_X64_STATE_DRS) { + memcpy(state->drs, cpudata->drs, sizeof(state->drs)); + + vmx_vmread(VMCS_GUEST_DR7, &state->drs[NVMM_X64_DR_DR7]); + } + + if (flags & NVMM_X64_STATE_MSRS) { + state->msrs[NVMM_X64_MSR_STAR] = + cpudata->gmsr[VMX_MSRLIST_STAR].val; + state->msrs[NVMM_X64_MSR_LSTAR] = + cpudata->gmsr[VMX_MSRLIST_LSTAR].val; + state->msrs[NVMM_X64_MSR_CSTAR] = + cpudata->gmsr[VMX_MSRLIST_CSTAR].val; + state->msrs[NVMM_X64_MSR_SFMASK] = + cpudata->gmsr[VMX_MSRLIST_SFMASK].val; + state->msrs[NVMM_X64_MSR_KERNELGSBASE] = + cpudata->gmsr[VMX_MSRLIST_KERNELGSBASE].val; + + vmx_vmread(VMCS_GUEST_IA32_EFER, + &state->msrs[NVMM_X64_MSR_EFER]); + vmx_vmread(VMCS_GUEST_IA32_PAT, + &state->msrs[NVMM_X64_MSR_PAT]); + vmx_vmread(VMCS_GUEST_IA32_SYSENTER_CS, + &state->msrs[NVMM_X64_MSR_SYSENTER_CS]); + vmx_vmread(VMCS_GUEST_IA32_SYSENTER_ESP, + &state->msrs[NVMM_X64_MSR_SYSENTER_ESP]); + vmx_vmread(VMCS_GUEST_IA32_SYSENTER_EIP, + &state->msrs[NVMM_X64_MSR_SYSENTER_EIP]); + } + + if (flags & NVMM_X64_STATE_MISC) { + // XXX CPL? not sure + + vmx_vmread(VMCS_GUEST_INTERRUPTIBILITY, &intstate); + state->misc[NVMM_X64_MISC_INT_SHADOW] = + (intstate & (INT_STATE_STI|INT_STATE_MOVSS)) != 0; + + state->misc[NVMM_X64_MISC_INT_WINDOW_EXIT] = + cpudata->int_window_exit; + state->misc[NVMM_X64_MISC_NMI_WINDOW_EXIT] = + cpudata->nmi_window_exit; + } + + CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(state->fpu)); + if (flags & NVMM_X64_STATE_FPU) { + memcpy(&state->fpu, cpudata->gfpu.xsh_fxsave, + sizeof(state->fpu)); + } + + vmx_vmcs_leave(vcpu); +} + +/* -------------------------------------------------------------------------- */ + +static void +vmx_tlb_flush(struct pmap *pm) +{ + struct nvmm_machine *mach = pm->pm_data; + struct vmx_machdata *machdata = mach->machdata; + struct nvmm_cpu *vcpu; + int error; + size_t i; + + kcpuset_atomicly_merge(machdata->ept_want_flush, kcpuset_running); + + /* + * Not as dumb as it seems. We want to make sure that when we leave + * this function, each VCPU got halted at some point, and possibly + * resumed with the updated kcpuset. + */ + for (i = 0; i < NVMM_MAX_VCPUS; i++) { + error = nvmm_vcpu_get(mach, i, &vcpu); + if (error) + continue; + nvmm_vcpu_put(vcpu); + } +} + +static void +vmx_machine_create(struct nvmm_machine *mach) +{ + struct pmap *pmap = mach->vm->vm_map.pmap; + struct vmx_machdata *machdata; + + /* Convert to EPT. */ + pmap_ept_transform(pmap); + + /* Fill in pmap info. */ + pmap->pm_data = (void *)mach; + pmap->pm_tlb_flush = vmx_tlb_flush; + + machdata = kmem_zalloc(sizeof(struct vmx_machdata), KM_SLEEP); + kcpuset_create(&machdata->ept_want_flush, true); + mach->machdata = machdata; + + /* Start with an EPT flush everywhere. 
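+ * Each CPU is expected to perform the EPT flush for this machine, and to + * clear its bit in the set, before it first enters a VCPU of the machine.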
*/ + kcpuset_copy(machdata->ept_want_flush, kcpuset_running); +} + +static void +vmx_machine_destroy(struct nvmm_machine *mach) +{ + struct vmx_machdata *machdata = mach->machdata; + + kcpuset_destroy(machdata->ept_want_flush); + kmem_free(machdata, sizeof(struct vmx_machdata)); +} + +static int +vmx_machine_configure(struct nvmm_machine *mach, uint64_t op, void *data) +{ + struct nvmm_x86_conf_cpuid *cpuid = data; + struct vmx_machdata *machdata = (struct vmx_machdata *)mach->machdata; + size_t i; + + if (__predict_false(op != NVMM_X86_CONF_CPUID)) { + return EINVAL; + } + + if (__predict_false((cpuid->set.eax & cpuid->del.eax) || + (cpuid->set.ebx & cpuid->del.ebx) || + (cpuid->set.ecx & cpuid->del.ecx) || + (cpuid->set.edx & cpuid->del.edx))) { + return EINVAL; + } + + /* If already here, replace. */ + for (i = 0; i < VMX_NCPUIDS; i++) { + if (!machdata->cpuidpresent[i]) { + continue; + } + if (machdata->cpuid[i].leaf == cpuid->leaf) { + memcpy(&machdata->cpuid[i], cpuid, + sizeof(struct nvmm_x86_conf_cpuid)); + return 0; + } + } + + /* Not here, insert. */ + for (i = 0; i < VMX_NCPUIDS; i++) { + if (!machdata->cpuidpresent[i]) { + machdata->cpuidpresent[i] = true; + memcpy(&machdata->cpuid[i], cpuid, + sizeof(struct nvmm_x86_conf_cpuid)); + return 0; + } + } + + return ENOBUFS; +} + +/* -------------------------------------------------------------------------- */ + +static int +vmx_init_ctls(uint64_t msr_ctls, uint64_t msr_true_ctls, + uint64_t set_one, uint64_t set_zero, uint64_t *res) +{ + uint64_t basic, val, true_val; + bool one_allowed, zero_allowed, has_true; + size_t i; + + basic = rdmsr(MSR_IA32_VMX_BASIC); + has_true = (basic & IA32_VMX_BASIC_TRUE_CTLS) != 0; + + val = rdmsr(msr_ctls); + if (has_true) { + true_val = rdmsr(msr_true_ctls); + } else { + true_val = val; + } + +#define ONE_ALLOWED(msrval, bitoff) \ + ((msrval & __BIT(32 + bitoff)) != 0) +#define ZERO_ALLOWED(msrval, bitoff) \ + ((msrval & __BIT(bitoff)) == 0) + + for (i = 0; i < 32; i++) { + one_allowed = ONE_ALLOWED(true_val, i); + zero_allowed = ZERO_ALLOWED(true_val, i); + + if (zero_allowed && !one_allowed) { + if (set_one & __BIT(i)) + return -1; + *res &= ~__BIT(i); + } else if (one_allowed && !zero_allowed) { + if (set_zero & __BIT(i)) + return -1; + *res |= __BIT(i); + } else { + if (set_zero & __BIT(i)) { + *res &= ~__BIT(i); + } else if (set_one & __BIT(i)) { + *res |= __BIT(i); + } else if (!has_true) { + *res &= ~__BIT(i); + } else if (ZERO_ALLOWED(val, i)) { + *res &= ~__BIT(i); + } else if (ONE_ALLOWED(val, i)) { + *res |= __BIT(i); + } else { + return -1; + } + } + } + + return 0; +} + +static bool +vmx_ident(void) +{ + uint64_t msr; + int ret; + + if (!(cpu_feature[1] & CPUID2_VMX)) { + return false; + } + + msr = rdmsr(MSR_IA32_FEATURE_CONTROL); + if ((msr & IA32_FEATURE_CONTROL_LOCK) == 0) { + return false; + } + + msr = rdmsr(MSR_IA32_VMX_BASIC); + if ((msr & IA32_VMX_BASIC_IO_REPORT) == 0) { + return false; + } + if (__SHIFTOUT(msr, IA32_VMX_BASIC_MEM_TYPE) != MEM_TYPE_WB) { + return false; + } + + msr = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP); + if ((msr & IA32_VMX_EPT_VPID_WALKLENGTH_4) == 0) { + return false; + } + if ((msr & IA32_VMX_EPT_VPID_INVEPT) == 0) { + return false; + } + if ((msr & IA32_VMX_EPT_VPID_INVVPID) == 0) { + return false; + } + if ((msr & IA32_VMX_EPT_VPID_FLAGS_AD) == 0) { + return false; + } + if (!(msr & IA32_VMX_EPT_VPID_UC) && !(msr & IA32_VMX_EPT_VPID_WB)) { + return false; + } + + /* PG and PE are reported, even if Unrestricted Guests is supported. 
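+ * Clear them in FIXED0 and set them in FIXED1, so that both the + * (PG=0,PE=0) and the (PG=1,PE=1) configurations pass vmx_check_cr.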
*/ + vmx_cr0_fixed0 = rdmsr(MSR_IA32_VMX_CR0_FIXED0) & ~(CR0_PG|CR0_PE); + vmx_cr0_fixed1 = rdmsr(MSR_IA32_VMX_CR0_FIXED1) | (CR0_PG|CR0_PE); + ret = vmx_check_cr(rcr0(), vmx_cr0_fixed0, vmx_cr0_fixed1); + if (ret == -1) { + return false; + } + + vmx_cr4_fixed0 = rdmsr(MSR_IA32_VMX_CR4_FIXED0); + vmx_cr4_fixed1 = rdmsr(MSR_IA32_VMX_CR4_FIXED1); + ret = vmx_check_cr(rcr4() | CR4_VMXE, vmx_cr4_fixed0, vmx_cr4_fixed1); + if (ret == -1) { + return false; + } + + /* Init the CTLSs right now, and check for errors. */ + ret = vmx_init_ctls( + MSR_IA32_VMX_PINBASED_CTLS, MSR_IA32_VMX_TRUE_PINBASED_CTLS, + VMX_PINBASED_CTLS_ONE, VMX_PINBASED_CTLS_ZERO, + &vmx_pinbased_ctls); + if (ret == -1) { + return false; + } + ret = vmx_init_ctls( + MSR_IA32_VMX_PROCBASED_CTLS, MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + VMX_PROCBASED_CTLS_ONE, VMX_PROCBASED_CTLS_ZERO, + &vmx_procbased_ctls); + if (ret == -1) { + return false; + } + ret = vmx_init_ctls( + MSR_IA32_VMX_PROCBASED_CTLS2, MSR_IA32_VMX_PROCBASED_CTLS2, + VMX_PROCBASED_CTLS2_ONE, VMX_PROCBASED_CTLS2_ZERO, + &vmx_procbased_ctls2); + if (ret == -1) { + return false; + } + ret = vmx_init_ctls( + MSR_IA32_VMX_ENTRY_CTLS, MSR_IA32_VMX_TRUE_ENTRY_CTLS, + VMX_ENTRY_CTLS_ONE, VMX_ENTRY_CTLS_ZERO, + &vmx_entry_ctls); + if (ret == -1) { + return false; + } + ret = vmx_init_ctls( + MSR_IA32_VMX_EXIT_CTLS, MSR_IA32_VMX_TRUE_EXIT_CTLS, + VMX_EXIT_CTLS_ONE, VMX_EXIT_CTLS_ZERO, + &vmx_exit_ctls); + if (ret == -1) { + return false; + } + + return true; +} + +static void +vmx_change_cpu(void *arg1, void *arg2) +{ + struct cpu_info *ci = curcpu(); + bool enable = (bool)arg1; + uint64_t cr4; + + if (!enable) { + vmx_vmxoff(); + } + + cr4 = rcr4(); + if (enable) { + cr4 |= CR4_VMXE; + } else { + cr4 &= ~CR4_VMXE; + } + lcr4(cr4); + + if (enable) { + vmx_vmxon(&vmxoncpu[cpu_index(ci)].pa); + } +} + +static void +vmx_init_l1tf(void) +{ + u_int descs[4]; + uint64_t msr; + + if (cpuid_level < 7) { + return; + } + + x86_cpuid(7, descs); + + if (descs[3] & CPUID_SEF_ARCH_CAP) { + msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES); + if (msr & IA32_ARCH_SKIP_L1DFL_VMENTRY) { + /* No mitigation needed. */ + return; + } + } + + if (descs[3] & CPUID_SEF_L1D_FLUSH) { + /* Enable hardware mitigation. */ + vmx_msrlist_entry_nmsr += 1; + } +} + +static void +vmx_init(void) +{ + CPU_INFO_ITERATOR cii; + struct cpu_info *ci; + uint64_t xc, msr; + struct vmxon *vmxon; + uint32_t revision; + paddr_t pa; + vaddr_t va; + int error; + + /* Init the ASID bitmap (VPID). */ + vmx_init_asid(VPID_MAX); + + /* Init the XCR0 mask. */ + vmx_xcr0_mask = VMX_XCR0_MASK_DEFAULT & x86_xsave_features; + + /* Init the TLB flush op, the EPT flush op and the EPTP type. */ + msr = rdmsr(MSR_IA32_VMX_EPT_VPID_CAP); + if ((msr & IA32_VMX_EPT_VPID_INVVPID_CONTEXT) != 0) { + vmx_tlb_flush_op = VMX_INVVPID_CONTEXT; + } else { + vmx_tlb_flush_op = VMX_INVVPID_ALL; + } + if ((msr & IA32_VMX_EPT_VPID_INVEPT_CONTEXT) != 0) { + vmx_ept_flush_op = VMX_INVEPT_CONTEXT; + } else { + vmx_ept_flush_op = VMX_INVEPT_ALL; + } + if ((msr & IA32_VMX_EPT_VPID_WB) != 0) { + vmx_eptp_type = EPTP_TYPE_WB; + } else { + vmx_eptp_type = EPTP_TYPE_UC; + } + + /* Init the L1TF mitigation. 
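+ * Nothing is needed if the CPU reports SKIP_L1DFL_VMENTRY; otherwise, + * when the L1D_FLUSH feature is present, the flush MSR gets appended to + * the guest MSR entry-load list.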
*/ + vmx_init_l1tf(); + + memset(vmxoncpu, 0, sizeof(vmxoncpu)); + revision = vmx_get_revision(); + + for (CPU_INFO_FOREACH(cii, ci)) { + error = vmx_memalloc(&pa, &va, 1); + if (error) { + panic("%s: out of memory", __func__); + } + vmxoncpu[cpu_index(ci)].pa = pa; + vmxoncpu[cpu_index(ci)].va = va; + + vmxon = (struct vmxon *)vmxoncpu[cpu_index(ci)].va; + vmxon->ident = __SHIFTIN(revision, VMXON_IDENT_REVISION); + } + + xc = xc_broadcast(0, vmx_change_cpu, (void *)true, NULL); + xc_wait(xc); +} + +static void +vmx_fini_asid(void) +{ + size_t allocsz; + + allocsz = roundup(vmx_maxasid, 8) / 8; + kmem_free(vmx_asidmap, allocsz); + + mutex_destroy(&vmx_asidlock); +} + +static void +vmx_fini(void) +{ + uint64_t xc; + size_t i; + + xc = xc_broadcast(0, vmx_change_cpu, (void *)false, NULL); + xc_wait(xc); + + for (i = 0; i < MAXCPUS; i++) { + if (vmxoncpu[i].pa != 0) + vmx_memfree(vmxoncpu[i].pa, vmxoncpu[i].va, 1); + } + + vmx_fini_asid(); +} + +static void +vmx_capability(struct nvmm_capability *cap) +{ + cap->u.x86.xcr0_mask = vmx_xcr0_mask; + cap->u.x86.mxcsr_mask = x86_fpu_mxcsr_mask; + cap->u.x86.conf_cpuid_maxops = VMX_NCPUIDS; +} + +const struct nvmm_impl nvmm_x86_vmx = { + .ident = vmx_ident, + .init = vmx_init, + .fini = vmx_fini, + .capability = vmx_capability, + .conf_max = NVMM_X86_NCONF, + .conf_sizes = vmx_conf_sizes, + .state_size = sizeof(struct nvmm_x64_state), + .machine_create = vmx_machine_create, + .machine_destroy = vmx_machine_destroy, + .machine_configure = vmx_machine_configure, + .vcpu_create = vmx_vcpu_create, + .vcpu_destroy = vmx_vcpu_destroy, + .vcpu_setstate = vmx_vcpu_setstate, + .vcpu_getstate = vmx_vcpu_getstate, + .vcpu_inject = vmx_vcpu_inject, + .vcpu_run = vmx_vcpu_run +}; Index: src/sys/dev/nvmm/x86/nvmm_x86_vmxfunc.S diff -u /dev/null src/sys/dev/nvmm/x86/nvmm_x86_vmxfunc.S:1.1 --- /dev/null Wed Feb 13 16:03:16 2019 +++ src/sys/dev/nvmm/x86/nvmm_x86_vmxfunc.S Wed Feb 13 16:03:16 2019 @@ -0,0 +1,357 @@ +/* $NetBSD: nvmm_x86_vmxfunc.S,v 1.1 2019/02/13 16:03:16 maxv Exp $ */ + +/* + * Copyright (c) 2018 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Maxime Villard. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* Override user-land alignment before including asm.h */ +#define ALIGN_DATA .align 8 +#define ALIGN_TEXT .align 16,0x90 +#define _ALIGN_TEXT ALIGN_TEXT + +#define _LOCORE +#include "assym.h" +#include <machine/asm.h> +#include <machine/segments.h> +#include <x86/specialreg.h> + +#define ASM_NVMM +#include <dev/nvmm/x86/nvmm_x86.h> + + .text + +/* + * %rdi = *pa + */ +ENTRY(_vmx_vmxon) + vmxon (%rdi) + jz .Lfail_vmxon + jc .Lfail_vmxon + xorq %rax,%rax + retq +.Lfail_vmxon: + movq $-1,%rax + retq +END(_vmx_vmxon) + +/* + * no arg + */ +ENTRY(_vmx_vmxoff) + vmxoff + jz .Lfail_vmxoff + jc .Lfail_vmxoff + xorq %rax,%rax + retq +.Lfail_vmxoff: + movq $-1,%rax + retq +END(_vmx_vmxoff) + +/* + * %rdi = op + * %rsi = *descriptor + */ +ENTRY(_vmx_invept) + invept (%rsi),%rdi + jz .Linvept_failvalid + jc .Linvept_failinvalid + xorq %rax,%rax + retq +.Linvept_failvalid: + movq $-1,%rax + retq +.Linvept_failinvalid: + movq $-2,%rax + retq +END(_vmx_invept) + +/* + * %rdi = op + * %rsi = *descriptor + */ +ENTRY(_vmx_invvpid) + invvpid (%rsi),%rdi + jz .Linvvpid_failvalid + jc .Linvvpid_failinvalid + xorq %rax,%rax + retq +.Linvvpid_failvalid: + movq $-1,%rax + retq +.Linvvpid_failinvalid: + movq $-2,%rax + retq +END(_vmx_invvpid) + +/* + * %rdi = op + * %rsi = *val + */ +ENTRY(_vmx_vmread) + vmread %rdi,(%rsi) + jz .Lvmread_failvalid + jc .Lvmread_failinvalid + xorq %rax,%rax + retq +.Lvmread_failvalid: + movq $-1,%rax + retq +.Lvmread_failinvalid: + movq $-2,%rax + retq +END(_vmx_vmread) + +/* + * %rdi = op + * %rsi = val + */ +ENTRY(_vmx_vmwrite) + vmwrite %rsi,%rdi + jz .Lvmwrite_failvalid + jc .Lvmwrite_failinvalid + xorq %rax,%rax + retq +.Lvmwrite_failvalid: + movq $-1,%rax + retq +.Lvmwrite_failinvalid: + movq $-2,%rax + retq +END(_vmx_vmwrite) + +/* + * %rdi = *pa + */ +ENTRY(_vmx_vmptrld) + vmptrld (%rdi) + jz .Lfail_vmptrld + jc .Lfail_vmptrld + xorq %rax,%rax + retq +.Lfail_vmptrld: + movq $-1,%rax + retq +END(_vmx_vmptrld) + +/* + * %rdi = *pa + */ +ENTRY(_vmx_vmptrst) + vmptrst (%rdi) + jz .Lfail_vmptrst + jc .Lfail_vmptrst + xorq %rax,%rax + retq +.Lfail_vmptrst: + movq $-1,%rax + retq +END(_vmx_vmptrst) + +/* + * %rdi = pa + */ +ENTRY(_vmx_vmclear) + vmclear (%rdi) + jz .Lfail_vmclear + jc .Lfail_vmclear + xorq %rax,%rax + retq +.Lfail_vmclear: + movq $-1,%rax + retq +END(_vmx_vmclear) + +/* redef */ +#define VMCS_HOST_RSP 0x00006C14 + +#define HOST_SAVE_GPRS \ + pushq %rbx ;\ + pushq %rbp ;\ + pushq %r12 ;\ + pushq %r13 ;\ + pushq %r14 ;\ + pushq %r15 + +#define HOST_RESTORE_GPRS \ + popq %r15 ;\ + popq %r14 ;\ + popq %r13 ;\ + popq %r12 ;\ + popq %rbp ;\ + popq %rbx + +#define HOST_SAVE_RAX \ + pushq %rax + +#define HOST_RESTORE_RAX \ + popq %rax + +#define HOST_SAVE_LDT \ + sldtw %ax ;\ + pushw %ax + +#define HOST_RESTORE_LDT \ + popw %ax ;\ + lldtw %ax + +/* + * We don't save RAX (done manually), but we do restore it. 
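+ * RAX carries the VA of the guest GPR area across the switch, so + * GUEST_SAVE_GPRS leaves it out (vmx_resume_rip saves it by hand) while + * GUEST_RESTORE_GPRS loads it last, once the base register is no longer + * needed.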
+ */ + +#define GUEST_SAVE_GPRS(reg) \ + movq %rbx,(NVMM_X64_GPR_RBX * 8)(reg) ;\ + movq %rcx,(NVMM_X64_GPR_RCX * 8)(reg) ;\ + movq %rdx,(NVMM_X64_GPR_RDX * 8)(reg) ;\ + movq %r8,(NVMM_X64_GPR_R8 * 8)(reg) ;\ + movq %r9,(NVMM_X64_GPR_R9 * 8)(reg) ;\ + movq %r10,(NVMM_X64_GPR_R10 * 8)(reg) ;\ + movq %r11,(NVMM_X64_GPR_R11 * 8)(reg) ;\ + movq %r12,(NVMM_X64_GPR_R12 * 8)(reg) ;\ + movq %r13,(NVMM_X64_GPR_R13 * 8)(reg) ;\ + movq %r14,(NVMM_X64_GPR_R14 * 8)(reg) ;\ + movq %r15,(NVMM_X64_GPR_R15 * 8)(reg) ;\ + movq %rbp,(NVMM_X64_GPR_RBP * 8)(reg) ;\ + movq %rdi,(NVMM_X64_GPR_RDI * 8)(reg) ;\ + movq %rsi,(NVMM_X64_GPR_RSI * 8)(reg) + +#define GUEST_RESTORE_GPRS(reg) \ + movq (NVMM_X64_GPR_RBX * 8)(reg),%rbx ;\ + movq (NVMM_X64_GPR_RCX * 8)(reg),%rcx ;\ + movq (NVMM_X64_GPR_RDX * 8)(reg),%rdx ;\ + movq (NVMM_X64_GPR_R8 * 8)(reg),%r8 ;\ + movq (NVMM_X64_GPR_R9 * 8)(reg),%r9 ;\ + movq (NVMM_X64_GPR_R10 * 8)(reg),%r10 ;\ + movq (NVMM_X64_GPR_R11 * 8)(reg),%r11 ;\ + movq (NVMM_X64_GPR_R12 * 8)(reg),%r12 ;\ + movq (NVMM_X64_GPR_R13 * 8)(reg),%r13 ;\ + movq (NVMM_X64_GPR_R14 * 8)(reg),%r14 ;\ + movq (NVMM_X64_GPR_R15 * 8)(reg),%r15 ;\ + movq (NVMM_X64_GPR_RBP * 8)(reg),%rbp ;\ + movq (NVMM_X64_GPR_RDI * 8)(reg),%rdi ;\ + movq (NVMM_X64_GPR_RSI * 8)(reg),%rsi ;\ + movq (NVMM_X64_GPR_RAX * 8)(reg),%rax + +/* + * %rdi = VA of guest GPR state + */ +ENTRY(vmx_vmlaunch) + /* Save the Host GPRs. */ + HOST_SAVE_GPRS + + /* Disable Host interrupts. */ + cli + + /* Save the Host LDT. */ + HOST_SAVE_LDT + + /* Save the Host RAX. */ + movq %rdi,%rax + pushq %rax + + /* Save the Host RSP. */ + movq $VMCS_HOST_RSP,%rdi + movq %rsp,%rsi + vmwrite %rsi,%rdi + + /* Restore the Guest GPRs. */ + GUEST_RESTORE_GPRS(%rax) + + /* Run the VM. */ + vmlaunch + + /* Failure. */ + addq $8,%rsp + HOST_RESTORE_LDT + sti + HOST_RESTORE_GPRS + movq $-1,%rax + retq +END(vmx_vmlaunch) + +/* + * %rdi = VA of guest GPR state + */ +ENTRY(vmx_vmresume) + /* Save the Host GPRs. */ + HOST_SAVE_GPRS + + /* Disable Host interrupts. */ + cli + + /* Save the Host LDT. */ + HOST_SAVE_LDT + + /* Save the Host RAX. */ + movq %rdi,%rax + pushq %rax + + /* Save the Host RSP. */ + movq $VMCS_HOST_RSP,%rdi + movq %rsp,%rsi + vmwrite %rsi,%rdi + + /* Restore the Guest GPRs. */ + GUEST_RESTORE_GPRS(%rax) + + /* Run the VM. */ + vmresume + + /* Failure. */ + addq $8,%rsp + HOST_RESTORE_LDT + sti + HOST_RESTORE_GPRS + movq $-1,%rax + retq +END(vmx_vmresume) + +/* + * The CPU jumps here after a #VMEXIT. + */ +ENTRY(vmx_resume_rip) + /* Save the Guest GPRs. RAX done manually. */ + pushq %rax + movq 8(%rsp),%rax + GUEST_SAVE_GPRS(%rax) + popq %rbx + movq %rbx,(NVMM_X64_GPR_RAX * 8)(%rax) + addq $8,%rsp + + /* Restore the Host LDT. */ + HOST_RESTORE_LDT + + /* Enable Host interrupts. */ + sti + + /* Restore the Host GPRs. */ + HOST_RESTORE_GPRS + + xorq %rax,%rax + retq +END(vmx_resume_rip)