Module Name:    src
Committed By:   maxv
Date:           Sun Jan 6 16:10:51 UTC 2019
Modified Files:
    src/lib/libnvmm: libnvmm.3 libnvmm_x86.c nvmm.h
    src/sys/dev/nvmm: nvmm.c nvmm.h
    src/sys/dev/nvmm/x86: nvmm_x86.h nvmm_x86_svm.c

Log Message:
Improvements and fixes in NVMM.

Kernel driver:

 * Don't take an extra (unneeded) reference to the UAO.

 * Provide npc for HLT. I'm not really happy with it right now, will likely be revisited.

 * Add the INT_SHADOW, INT_WINDOW_EXIT and NMI_WINDOW_EXIT states. Provide them in the exitstate too.

 * Don't take the TPR into account when processing INTs. The virtualizer can do that itself (Qemu already does).

 * Provide a hypervisor signature in CPUID, and hide SVM.

 * Ignore certain MSRs. One special case is MSR_NB_CFG, in which we set NB_CFG_INITAPICCPUIDLO. Allow reads of MSR_TSC.

 * If the LWP has pending signals or softints, leave, rather than waiting for a rescheduling to happen later. This reduces interrupt processing time in the guest (Qemu sends a signal to the thread, and now we leave right away). This could be improved even more by sending an actual IPI to the CPU, but I'll see later.

Libnvmm:

 * Fix the MMU translation of large pages; we need to add the lower bits too.

 * Change the IO and Mem structures to take a pointer rather than a static array. This provides more flexibility.

 * Batch together the str+rep IO transactions. We do one big memory read/write, and then send the IO commands to the hypervisor all at once. This considerably increases performance.

 * Decode MOVZX.

With these changes in place, Qemu+NVMM works. I can install NetBSD 8.0 in a VM with multiple VCPUs, connect to the network, etc.
To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/lib/libnvmm/libnvmm.3
cvs rdiff -u -r1.9 -r1.10 src/lib/libnvmm/libnvmm_x86.c
cvs rdiff -u -r1.4 -r1.5 src/lib/libnvmm/nvmm.h
cvs rdiff -u -r1.4 -r1.5 src/sys/dev/nvmm/nvmm.c
cvs rdiff -u -r1.1 -r1.2 src/sys/dev/nvmm/nvmm.h
cvs rdiff -u -r1.2 -r1.3 src/sys/dev/nvmm/x86/nvmm_x86.h
cvs rdiff -u -r1.9 -r1.10 src/sys/dev/nvmm/x86/nvmm_x86_svm.c

Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/lib/libnvmm/libnvmm.3 diff -u src/lib/libnvmm/libnvmm.3:1.6 src/lib/libnvmm/libnvmm.3:1.7 --- src/lib/libnvmm/libnvmm.3:1.6 Thu Dec 27 07:22:31 2018 +++ src/lib/libnvmm/libnvmm.3 Sun Jan 6 16:10:51 2019 @@ -1,4 +1,4 @@ -.\" $NetBSD: libnvmm.3,v 1.6 2018/12/27 07:22:31 maxv Exp $ +.\" $NetBSD: libnvmm.3,v 1.7 2019/01/06 16:10:51 maxv Exp $ .\" .\" Copyright (c) 2018 The NetBSD Foundation, Inc. .\" All rights reserved. @@ -27,7 +27,7 @@ .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. .\" -.Dd December 26, 2018 +.Dd January 06, 2019 .Dt LIBNVMM 3 .Os .Sh NAME @@ -242,8 +242,6 @@ on CPU .Fa cpuid from machine .Fa mach . -.Fa cb -will be called to handle the transaction. See .Sx I/O Assist below for details. @@ -255,8 +253,6 @@ on CPU .Fa cpuid from machine .Fa mach . -.Fa cb -will be called to handle the transaction. See .Sx Mem Assist below for details. @@ -415,7 +411,7 @@ struct nvmm_io { uint64_t port; bool in; size_t size; - uint8_t data[8]; + uint8_t *data; }; .Ed .Pp @@ -463,7 +459,7 @@ struct nvmm_mem { gpaddr_t gpa; bool write; size_t size; - uint8_t data[8]; + uint8_t *data; }; .Ed .Pp Index: src/lib/libnvmm/libnvmm_x86.c diff -u src/lib/libnvmm/libnvmm_x86.c:1.9 src/lib/libnvmm/libnvmm_x86.c:1.10 --- src/lib/libnvmm/libnvmm_x86.c:1.9 Fri Jan 4 10:25:39 2019 +++ src/lib/libnvmm/libnvmm_x86.c Sun Jan 6 16:10:51 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: libnvmm_x86.c,v 1.9 2019/01/04 10:25:39 maxv Exp $ */ +/* $NetBSD: libnvmm_x86.c,v 1.10 2019/01/06 16:10:51 maxv Exp $ */ /* * Copyright (c) 2018 The NetBSD Foundation, Inc. @@ -45,6 +45,8 @@ #include "nvmm.h" +#define MIN(X, Y) (((X) < (Y)) ? 
(X) : (Y)) + #include <x86/specialreg.h> extern struct nvmm_callbacks __callbacks; @@ -83,6 +85,11 @@ nvmm_vcpu_dump(struct nvmm_machine *mach (void *)state.segs[i].limit, state.segs[i].attrib.p, state.segs[i].attrib.def32); } + printf("| -> MSR_EFER=%p\n", (void *)state.msrs[NVMM_X64_MSR_EFER]); + printf("| -> CR0=%p\n", (void *)state.crs[NVMM_X64_CR_CR0]); + printf("| -> CR3=%p\n", (void *)state.crs[NVMM_X64_CR_CR3]); + printf("| -> CR4=%p\n", (void *)state.crs[NVMM_X64_CR_CR4]); + printf("| -> CR8=%p\n", (void *)state.crs[NVMM_X64_CR_CR8]); printf("| -> CPL=%p\n", (void *)state.misc[NVMM_X64_MISC_CPL]); return 0; @@ -131,6 +138,7 @@ x86_gva_to_gpa_32bit(struct nvmm_machine return -1; if (pte & PG_PS) { *gpa = (pte & PTE32_L2_FRAME); + *gpa = *gpa + (gva & PTE32_L1_MASK); return 0; } @@ -215,6 +223,7 @@ x86_gva_to_gpa_32bit_pae(struct nvmm_mac return -1; if (pte & PG_PS) { *gpa = (pte & PTE32_PAE_L2_FRAME); + *gpa = *gpa + (gva & PTE32_PAE_L1_MASK); return 0; } @@ -320,6 +329,7 @@ x86_gva_to_gpa_64bit(struct nvmm_machine return -1; if (pte & PG_PS) { *gpa = (pte & PTE64_L3_FRAME); + *gpa = *gpa + (gva & (PTE64_L2_MASK|PTE64_L1_MASK)); return 0; } @@ -341,6 +351,7 @@ x86_gva_to_gpa_64bit(struct nvmm_machine return -1; if (pte & PG_PS) { *gpa = (pte & PTE64_L2_FRAME); + *gpa = *gpa + (gva & PTE64_L1_MASK); return 0; } @@ -500,13 +511,34 @@ mask_from_adsize(size_t adsize) } static uint64_t +rep_get_cnt(struct nvmm_x64_state *state, size_t adsize) +{ + uint64_t mask, cnt; + + mask = mask_from_adsize(adsize); + cnt = state->gprs[NVMM_X64_GPR_RCX] & mask; + + return cnt; +} + +static void +rep_set_cnt(struct nvmm_x64_state *state, size_t adsize, uint64_t cnt) +{ + uint64_t mask; + + mask = mask_from_adsize(adsize); + state->gprs[NVMM_X64_GPR_RCX] &= ~mask; + state->gprs[NVMM_X64_GPR_RCX] |= cnt; +} + +static uint64_t rep_dec_apply(struct nvmm_x64_state *state, size_t adsize) { uint64_t mask, cnt; mask = mask_from_adsize(adsize); - cnt = state->gprs[NVMM_X64_GPR_RCX] & 
mask; + cnt = state->gprs[NVMM_X64_GPR_RCX] & mask; cnt -= 1; cnt &= mask; @@ -521,6 +553,7 @@ read_guest_memory(struct nvmm_machine *m gvaddr_t gva, uint8_t *data, size_t size) { struct nvmm_mem mem; + uint8_t membuf[8]; nvmm_prot_t prot; gpaddr_t gpa; uintptr_t hva; @@ -547,6 +580,7 @@ read_guest_memory(struct nvmm_machine *m is_mmio = (ret == -1); if (is_mmio) { + mem.data = membuf; mem.gva = gva; mem.gpa = gpa; mem.write = false; @@ -572,6 +606,7 @@ write_guest_memory(struct nvmm_machine * gvaddr_t gva, uint8_t *data, size_t size) { struct nvmm_mem mem; + uint8_t membuf[8]; nvmm_prot_t prot; gpaddr_t gpa; uintptr_t hva; @@ -598,6 +633,7 @@ write_guest_memory(struct nvmm_machine * is_mmio = (ret == -1); if (is_mmio) { + mem.data = membuf; mem.gva = gva; mem.gpa = gpa; mem.write = true; @@ -622,16 +658,55 @@ write_guest_memory(struct nvmm_machine * static int fetch_segment(struct nvmm_machine *, struct nvmm_x64_state *); +#define NVMM_IO_BATCH_SIZE 32 + +static int +assist_io_batch(struct nvmm_machine *mach, struct nvmm_x64_state *state, + struct nvmm_io *io, gvaddr_t gva, uint64_t cnt) +{ + uint8_t iobuf[NVMM_IO_BATCH_SIZE]; + size_t i, iosize, iocnt; + int ret; + + cnt = MIN(cnt, NVMM_IO_BATCH_SIZE); + iosize = MIN(io->size * cnt, NVMM_IO_BATCH_SIZE); + iocnt = iosize / io->size; + + io->data = iobuf; + + if (!io->in) { + ret = read_guest_memory(mach, state, gva, iobuf, iosize); + if (ret == -1) + return -1; + } + + for (i = 0; i < iocnt; i++) { + (*__callbacks.io)(io); + io->data += io->size; + } + + if (io->in) { + ret = write_guest_memory(mach, state, gva, iobuf, iosize); + if (ret == -1) + return -1; + } + + return iocnt; +} + int nvmm_assist_io(struct nvmm_machine *mach, nvmm_cpuid_t cpuid, struct nvmm_exit *exit) { struct nvmm_x64_state state; struct nvmm_io io; - uint64_t cnt; + uint64_t cnt = 0; /* GCC */ + uint8_t iobuf[8]; + int iocnt = 1; gvaddr_t gva; int reg = 0; /* GCC */ int ret, seg; + bool psld = false; if (__predict_false(exit->reason != 
NVMM_EXIT_IO)) { errno = EINVAL; @@ -641,6 +716,7 @@ nvmm_assist_io(struct nvmm_machine *mach io.port = exit->u.io.port; io.in = (exit->u.io.type == NVMM_EXIT_IO_IN); io.size = exit->u.io.operand_size; + io.data = iobuf; ret = nvmm_vcpu_getstate(mach, cpuid, &state, NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS | @@ -648,6 +724,17 @@ nvmm_assist_io(struct nvmm_machine *mach if (ret == -1) return -1; + if (exit->u.io.rep) { + cnt = rep_get_cnt(&state, exit->u.io.address_size); + if (__predict_false(cnt == 0)) { + return 0; + } + } + + if (__predict_false(state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D)) { + psld = true; + } + /* * Determine GVA. */ @@ -678,6 +765,13 @@ nvmm_assist_io(struct nvmm_machine *mach if (ret == -1) return -1; } + + if (exit->u.io.rep && !psld) { + iocnt = assist_io_batch(mach, &state, &io, gva, cnt); + if (iocnt == -1) + return -1; + goto done; + } } if (!io.in) { @@ -704,16 +798,18 @@ nvmm_assist_io(struct nvmm_machine *mach } } +done: if (exit->u.io.str) { - if (state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) { - state.gprs[reg] -= io.size; + if (__predict_false(psld)) { + state.gprs[reg] -= iocnt * io.size; } else { - state.gprs[reg] += io.size; + state.gprs[reg] += iocnt * io.size; } } if (exit->u.io.rep) { - cnt = rep_dec_apply(&state, exit->u.io.address_size); + cnt -= iocnt; + rep_set_cnt(&state, exit->u.io.address_size, cnt); if (cnt == 0) { state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc; } @@ -858,6 +954,7 @@ struct x86_instr { struct x86_rexpref rexpref; size_t operand_size; size_t address_size; + uint64_t zeroextend_mask; struct x86_regmodrm regmodrm; @@ -912,6 +1009,7 @@ struct x86_group_entry { #define OPSIZE_QUAD 0x08 /* 8 bytes */ #define FLAG_z 0x02 +#define FLAG_e 0x10 static const struct x86_group_entry group11[8] = { [0] = { .emul = x86_emul_mov } @@ -1230,6 +1328,34 @@ static const struct x86_opcode primary_o }, }; +static const struct x86_opcode secondary_opcode_table[] = { + /* + * MOVZX + */ + { + /* Gv, Eb */ + .byte = 0xB6, + .regmodrm 
= true, + .regtorm = false, + .szoverride = true, + .defsize = OPSIZE_BYTE, + .allsize = OPSIZE_WORD|OPSIZE_DOUB|OPSIZE_QUAD, + .flags = FLAG_e, + .emul = x86_emul_mov + }, + { + /* Gv, Ew */ + .byte = 0xB7, + .regmodrm = true, + .regtorm = false, + .szoverride = true, + .defsize = OPSIZE_WORD, + .allsize = OPSIZE_WORD|OPSIZE_DOUB|OPSIZE_QUAD, + .flags = FLAG_e, + .emul = x86_emul_mov + }, +}; + static const struct x86_reg gpr_map__rip = { NVMM_X64_GPR_RIP, 0xFFFFFFFFFFFFFFFF }; /* [REX-present][enc][opsize] */ @@ -2059,6 +2185,67 @@ node_primary_opcode(struct x86_decode_fs return 0; } +static uint64_t +size_to_mask(size_t size) +{ + switch (size) { + case 1: + return 0x00000000000000FF; + case 2: + return 0x000000000000FFFF; + case 4: + return 0x00000000FFFFFFFF; + case 8: + default: + return 0xFFFFFFFFFFFFFFFF; + } +} + +static int +node_secondary_opcode(struct x86_decode_fsm *fsm, struct x86_instr *instr) +{ + const struct x86_opcode *opcode; + uint8_t byte; + size_t i, n; + + if (fsm_read(fsm, &byte, sizeof(byte)) == -1) { + return -1; + } + + n = sizeof(secondary_opcode_table) / sizeof(secondary_opcode_table[0]); + for (i = 0; i < n; i++) { + if (secondary_opcode_table[i].byte == byte) + break; + } + if (i == n) { + return -1; + } + opcode = &secondary_opcode_table[i]; + + instr->opcode = opcode; + instr->emul = opcode->emul; + instr->operand_size = get_operand_size(fsm, instr); + instr->address_size = get_address_size(fsm, instr); + + if (opcode->flags & FLAG_e) { + /* + * Compute the mask for zero-extend. Update the operand size, + * we move fewer bytes. 
+ */ + instr->zeroextend_mask = size_to_mask(instr->operand_size); + instr->zeroextend_mask &= ~size_to_mask(opcode->defsize); + instr->operand_size = opcode->defsize; + } + + if (opcode->regmodrm) { + fsm_advance(fsm, 1, node_regmodrm); + } else { + return -1; + } + + return 0; +} + static int node_main(struct x86_decode_fsm *fsm, struct x86_instr *instr) { @@ -2078,7 +2265,7 @@ node_main(struct x86_decode_fsm *fsm, st * after being introduced. */ if (byte == ESCAPE) { - return -1; + fsm_advance(fsm, 1, node_secondary_opcode); } else if (!instr->rexpref.present) { if (byte == VEX_1) { return -1; @@ -2600,10 +2787,12 @@ assist_mem_single(struct nvmm_machine *m struct x86_instr *instr) { struct nvmm_mem mem; + uint8_t membuf[8]; uint64_t val; int ret; memset(&mem, 0, sizeof(mem)); + mem.data = membuf; switch (instr->src.type) { case STORE_REG: @@ -2703,6 +2892,7 @@ assist_mem_single(struct nvmm_machine *m val = __SHIFTIN(val, instr->dst.u.reg->mask); state->gprs[instr->dst.u.reg->num] &= ~instr->dst.u.reg->mask; state->gprs[instr->dst.u.reg->num] |= val; + state->gprs[instr->dst.u.reg->num] &= ~instr->zeroextend_mask; } return 0; Index: src/lib/libnvmm/nvmm.h diff -u src/lib/libnvmm/nvmm.h:1.4 src/lib/libnvmm/nvmm.h:1.5 --- src/lib/libnvmm/nvmm.h:1.4 Thu Dec 27 07:22:31 2018 +++ src/lib/libnvmm/nvmm.h Sun Jan 6 16:10:51 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: nvmm.h,v 1.4 2018/12/27 07:22:31 maxv Exp $ */ +/* $NetBSD: nvmm.h,v 1.5 2019/01/06 16:10:51 maxv Exp $ */ /* * Copyright (c) 2018 The NetBSD Foundation, Inc. 
@@ -50,7 +50,7 @@ struct nvmm_io { uint64_t port; bool in; size_t size; - uint8_t data[8]; + uint8_t *data; }; struct nvmm_mem { @@ -58,7 +58,7 @@ struct nvmm_mem { gpaddr_t gpa; bool write; size_t size; - uint8_t data[8]; + uint8_t *data; }; struct nvmm_callbacks { Index: src/sys/dev/nvmm/nvmm.c diff -u src/sys/dev/nvmm/nvmm.c:1.4 src/sys/dev/nvmm/nvmm.c:1.5 --- src/sys/dev/nvmm/nvmm.c:1.4 Sat Dec 15 13:39:43 2018 +++ src/sys/dev/nvmm/nvmm.c Sun Jan 6 16:10:51 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: nvmm.c,v 1.4 2018/12/15 13:39:43 maxv Exp $ */ +/* $NetBSD: nvmm.c,v 1.5 2019/01/06 16:10:51 maxv Exp $ */ /* * Copyright (c) 2018 The NetBSD Foundation, Inc. @@ -30,7 +30,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.4 2018/12/15 13:39:43 maxv Exp $"); +__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.5 2019/01/06 16:10:51 maxv Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -644,9 +644,6 @@ nvmm_hva_map(struct nvmm_ioc_hva_map *ar seg->uobj = uao_create(seg->size, 0); uva = seg->hva; - /* Take a reference for the kernel. */ - uao_reference(seg->uobj); - /* Take a reference for the user. */ uao_reference(seg->uobj); Index: src/sys/dev/nvmm/nvmm.h diff -u src/sys/dev/nvmm/nvmm.h:1.1 src/sys/dev/nvmm/nvmm.h:1.2 --- src/sys/dev/nvmm/nvmm.h:1.1 Wed Nov 7 07:43:08 2018 +++ src/sys/dev/nvmm/nvmm.h Sun Jan 6 16:10:51 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: nvmm.h,v 1.1 2018/11/07 07:43:08 maxv Exp $ */ +/* $NetBSD: nvmm.h,v 1.2 2019/01/06 16:10:51 maxv Exp $ */ /* * Copyright (c) 2018 The NetBSD Foundation, Inc. 
@@ -106,12 +106,17 @@ struct nvmm_exit_msr { uint64_t npc; }; +struct nvmm_exit_hlt { + uint64_t npc; +}; + struct nvmm_exit { enum nvmm_exit_reason reason; union { struct nvmm_exit_memory mem; struct nvmm_exit_io io; struct nvmm_exit_msr msr; + struct nvmm_exit_hlt hlt; } u; uint64_t exitstate[8]; }; Index: src/sys/dev/nvmm/x86/nvmm_x86.h diff -u src/sys/dev/nvmm/x86/nvmm_x86.h:1.2 src/sys/dev/nvmm/x86/nvmm_x86.h:1.3 --- src/sys/dev/nvmm/x86/nvmm_x86.h:1.2 Sun Nov 25 14:09:57 2018 +++ src/sys/dev/nvmm/x86/nvmm_x86.h Sun Jan 6 16:10:51 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: nvmm_x86.h,v 1.2 2018/11/25 14:09:57 maxv Exp $ */ +/* $NetBSD: nvmm_x86.h,v 1.3 2019/01/06 16:10:51 maxv Exp $ */ /* * Copyright (c) 2018 The NetBSD Foundation, Inc. @@ -99,7 +99,10 @@ /* Misc. */ #define NVMM_X64_MISC_CPL 0 -#define NVMM_X64_NMISC 1 +#define NVMM_X64_MISC_INT_SHADOW 1 +#define NVMM_X64_MISC_INT_WINDOW_EXIT 2 +#define NVMM_X64_MISC_NMI_WINDOW_EXIT 3 +#define NVMM_X64_NMISC 4 #ifndef ASM_NVMM @@ -123,8 +126,11 @@ struct nvmm_x64_state_seg { }; /* VM exit state indexes. */ -#define NVMM_X64_EXITSTATE_CR8 0 -#define NVMM_X64_EXITSTATE_RFLAGS 1 +#define NVMM_X64_EXITSTATE_CR8 0 +#define NVMM_X64_EXITSTATE_RFLAGS 1 +#define NVMM_X64_EXITSTATE_INT_SHADOW 2 +#define NVMM_X64_EXITSTATE_INT_WINDOW_EXIT 3 +#define NVMM_X64_EXITSTATE_NMI_WINDOW_EXIT 4 /* Flags. */ #define NVMM_X64_STATE_SEGS 0x01 Index: src/sys/dev/nvmm/x86/nvmm_x86_svm.c diff -u src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.9 src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.10 --- src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.9 Thu Jan 3 08:02:49 2019 +++ src/sys/dev/nvmm/x86/nvmm_x86_svm.c Sun Jan 6 16:10:51 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: nvmm_x86_svm.c,v 1.9 2019/01/03 08:02:49 maxv Exp $ */ +/* $NetBSD: nvmm_x86_svm.c,v 1.10 2019/01/06 16:10:51 maxv Exp $ */ /* * Copyright (c) 2018 The NetBSD Foundation, Inc. 
@@ -30,7 +30,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_svm.c,v 1.9 2019/01/03 08:02:49 maxv Exp $"); +__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_svm.c,v 1.10 2019/01/06 16:10:51 maxv Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -518,8 +518,11 @@ struct svm_cpudata { bool ts_set; struct xsave_header hfpu __aligned(16); + /* Event state */ + bool int_window_exit; + bool nmi_window_exit; + /* Guest state */ - bool in_nmi; uint64_t tsc_offset; struct xsave_header gfpu __aligned(16); }; @@ -530,26 +533,34 @@ struct svm_cpudata { #define SVM_EVENT_TYPE_SW_INT 4 static void -svm_event_waitexit_enable(struct vmcb *vmcb, bool nmi) +svm_event_waitexit_enable(struct nvmm_cpu *vcpu, bool nmi) { + struct svm_cpudata *cpudata = vcpu->cpudata; + struct vmcb *vmcb = cpudata->vmcb; + if (nmi) { vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_IRET; + cpudata->nmi_window_exit = true; } else { vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_VINTR; - vmcb->ctrl.v |= (VMCB_CTRL_V_IRQ | - __SHIFTIN(0, VMCB_CTRL_V_INTR_VECTOR)); + vmcb->ctrl.v |= (VMCB_CTRL_V_IRQ | VMCB_CTRL_V_IGN_TPR); + cpudata->int_window_exit = true; } } static void -svm_event_waitexit_disable(struct vmcb *vmcb, bool nmi) +svm_event_waitexit_disable(struct nvmm_cpu *vcpu, bool nmi) { + struct svm_cpudata *cpudata = vcpu->cpudata; + struct vmcb *vmcb = cpudata->vmcb; + if (nmi) { vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_IRET; + cpudata->nmi_window_exit = false; } else { vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_VINTR; - vmcb->ctrl.v &= ~(VMCB_CTRL_V_IRQ | - __SHIFTIN(0, VMCB_CTRL_V_INTR_VECTOR)); + vmcb->ctrl.v &= ~(VMCB_CTRL_V_IRQ | VMCB_CTRL_V_IGN_TPR); + cpudata->int_window_exit = false; } } @@ -577,9 +588,7 @@ svm_vcpu_inject(struct nvmm_machine *mac { struct svm_cpudata *cpudata = vcpu->cpudata; struct vmcb *vmcb = cpudata->vmcb; - uint64_t rflags = vmcb->state.rflags; int type = 0, err = 0; - uint64_t tpr; if (event->vector >= 256) { return EINVAL; @@ 
-592,15 +601,14 @@ svm_vcpu_inject(struct nvmm_machine *mac type = SVM_EVENT_TYPE_NMI; } if (type == SVM_EVENT_TYPE_NMI) { - if (cpudata->in_nmi) { - svm_event_waitexit_enable(vmcb, true); + if (cpudata->nmi_window_exit) { return EAGAIN; } - cpudata->in_nmi = true; + svm_event_waitexit_enable(vcpu, true); } else { - tpr = __SHIFTOUT(vmcb->ctrl.v, VMCB_CTRL_V_TPR); - if ((rflags & PSL_I) == 0 || event->u.prio <= tpr) { - svm_event_waitexit_enable(vmcb, false); + if (((vmcb->state.rflags & PSL_I) == 0) || + ((vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0)) { + svm_event_waitexit_enable(vcpu, false); return EAGAIN; } } @@ -698,6 +706,14 @@ svm_inkernel_handle_cpuid(struct nvmm_cp state->gprs[NVMM_X64_GPR_RCX] = sizeof(struct fxsave); state->gprs[NVMM_X64_GPR_RDX] = svm_xcr0_mask >> 32; break; + case 0x40000000: + memcpy(&state->gprs[NVMM_X64_GPR_RBX], "___ ", 4); + memcpy(&state->gprs[NVMM_X64_GPR_RCX], "NVMM", 4); + memcpy(&state->gprs[NVMM_X64_GPR_RDX], " ___", 4); + break; + case 0x80000001: /* No SVM in ECX. The rest is tunable. 
*/ + state->gprs[NVMM_X64_GPR_RCX] &= ~CPUID_SVM; + break; default: break; } @@ -760,6 +776,16 @@ svm_exit_cpuid(struct nvmm_machine *mach exit->reason = NVMM_EXIT_NONE; } +static void +svm_exit_hlt(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, + struct nvmm_exit *exit) +{ + struct svm_cpudata *cpudata = vcpu->cpudata; + + exit->reason = NVMM_EXIT_HLT; + exit->u.hlt.npc = cpudata->vmcb->ctrl.nrip; +} + #define SVM_EXIT_IO_PORT __BITS(31,16) #define SVM_EXIT_IO_SEG __BITS(12,10) #define SVM_EXIT_IO_A64 __BIT(9) @@ -827,20 +853,42 @@ svm_exit_io(struct nvmm_machine *mach, s exit->u.io.npc = nextpc; } +static const uint64_t msr_ignore_list[] = { + 0xc0010055, /* MSR_CMPHALT */ + MSR_DE_CFG, + MSR_IC_CFG, + MSR_UCODE_AMD_PATCHLEVEL +}; + static bool svm_inkernel_handle_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu, struct nvmm_exit *exit) { struct svm_cpudata *cpudata = vcpu->cpudata; struct nvmm_x64_state *state = &cpudata->state; - uint64_t pat; + uint64_t val; + size_t i; switch (exit->u.msr.type) { case NVMM_EXIT_MSR_RDMSR: if (exit->u.msr.msr == MSR_CR_PAT) { - pat = cpudata->vmcb->state.g_pat; - cpudata->vmcb->state.rax = (pat & 0xFFFFFFFF); - state->gprs[NVMM_X64_GPR_RDX] = (pat >> 32); + val = cpudata->vmcb->state.g_pat; + cpudata->vmcb->state.rax = (val & 0xFFFFFFFF); + state->gprs[NVMM_X64_GPR_RDX] = (val >> 32); + goto handled; + } + if (exit->u.msr.msr == MSR_NB_CFG) { + val = NB_CFG_INITAPICCPUIDLO; + cpudata->vmcb->state.rax = (val & 0xFFFFFFFF); + state->gprs[NVMM_X64_GPR_RDX] = (val >> 32); + goto handled; + } + for (i = 0; i < __arraycount(msr_ignore_list); i++) { + if (msr_ignore_list[i] != exit->u.msr.msr) + continue; + val = 0; + cpudata->vmcb->state.rax = (val & 0xFFFFFFFF); + state->gprs[NVMM_X64_GPR_RDX] = (val >> 32); goto handled; } break; @@ -861,6 +909,11 @@ svm_inkernel_handle_msr(struct nvmm_mach cpudata->vmcb->state.g_pat = exit->u.msr.val; goto handled; } + for (i = 0; i < __arraycount(msr_ignore_list); i++) { + if 
(msr_ignore_list[i] != exit->u.msr.msr) + continue; + goto handled; + } break; } @@ -1128,19 +1181,18 @@ svm_vcpu_run(struct nvmm_machine *mach, exit->reason = NVMM_EXIT_NONE; break; case VMCB_EXITCODE_VINTR: - svm_event_waitexit_disable(vmcb, false); + svm_event_waitexit_disable(vcpu, false); exit->reason = NVMM_EXIT_INT_READY; break; case VMCB_EXITCODE_IRET: - svm_event_waitexit_disable(vmcb, true); - cpudata->in_nmi = false; + svm_event_waitexit_disable(vcpu, true); exit->reason = NVMM_EXIT_NMI_READY; break; case VMCB_EXITCODE_CPUID: svm_exit_cpuid(mach, vcpu, exit); break; case VMCB_EXITCODE_HLT: - exit->reason = NVMM_EXIT_HLT; + svm_exit_hlt(mach, vcpu, exit); break; case VMCB_EXITCODE_IOIO: svm_exit_io(mach, vcpu, exit); @@ -1186,10 +1238,20 @@ svm_vcpu_run(struct nvmm_machine *mach, break; } + if (vmcb->ctrl.exitintinfo & VMCB_CTRL_EXITINTINFO_V) { + printf("WAS PROCESSING!\n"); + } + /* If no reason to return to userland, keep rolling. */ if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) { break; } + if (curcpu()->ci_data.cpu_softints != 0) { + break; + } + if (curlwp->l_flag & LW_USERRET) { + break; + } if (exit->reason != NVMM_EXIT_NONE) { break; } @@ -1204,6 +1266,13 @@ svm_vcpu_run(struct nvmm_machine *mach, VMCB_CTRL_V_TPR); exit->exitstate[NVMM_X64_EXITSTATE_RFLAGS] = vmcb->state.rflags; + exit->exitstate[NVMM_X64_EXITSTATE_INT_SHADOW] = + ((vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0); + exit->exitstate[NVMM_X64_EXITSTATE_INT_WINDOW_EXIT] = + cpudata->int_window_exit; + exit->exitstate[NVMM_X64_EXITSTATE_NMI_WINDOW_EXIT] = + cpudata->nmi_window_exit; + return 0; } @@ -1437,6 +1506,7 @@ svm_vcpu_init(struct nvmm_machine *mach, * - SYSENTER_EIP [read, write] * - FSBASE [read, write] * - GSBASE [read, write] + * - TSC [read] * * Intercept the rest. 
*/ @@ -1452,6 +1522,7 @@ svm_vcpu_init(struct nvmm_machine *mach, svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_EIP, true, true); svm_vcpu_msr_allow(cpudata->msrbm, MSR_FSBASE, true, true); svm_vcpu_msr_allow(cpudata->msrbm, MSR_GSBASE, true, true); + svm_vcpu_msr_allow(cpudata->msrbm, MSR_TSC, true, false); vmcb->ctrl.msrpm_base_pa = cpudata->msrbm_pa; /* Generate ASID. */ @@ -1712,6 +1783,24 @@ svm_vcpu_setstate(struct nvmm_cpu *vcpu, memcpy(cstate->misc, nstate->misc, sizeof(nstate->misc)); vmcb->state.cpl = cstate->misc[NVMM_X64_MISC_CPL]; + + if (cstate->misc[NVMM_X64_MISC_INT_SHADOW]) { + vmcb->ctrl.intr |= VMCB_CTRL_INTR_SHADOW; + } else { + vmcb->ctrl.intr &= ~VMCB_CTRL_INTR_SHADOW; + } + + if (cstate->misc[NVMM_X64_MISC_INT_WINDOW_EXIT]) { + svm_event_waitexit_enable(vcpu, false); + } else { + svm_event_waitexit_disable(vcpu, false); + } + + if (cstate->misc[NVMM_X64_MISC_NMI_WINDOW_EXIT]) { + svm_event_waitexit_enable(vcpu, true); + } else { + svm_event_waitexit_disable(vcpu, true); + } } CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(cstate->fpu)); @@ -1812,6 +1901,13 @@ svm_vcpu_getstate(struct nvmm_cpu *vcpu, if (flags & NVMM_X64_STATE_MISC) { cstate->misc[NVMM_X64_MISC_CPL] = vmcb->state.cpl; + cstate->misc[NVMM_X64_MISC_INT_SHADOW] = + (vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0; + cstate->misc[NVMM_X64_MISC_INT_WINDOW_EXIT] = + cpudata->int_window_exit; + cstate->misc[NVMM_X64_MISC_NMI_WINDOW_EXIT] = + cpudata->nmi_window_exit; + memcpy(nstate->misc, cstate->misc, sizeof(cstate->misc)); }