Module Name:    src
Committed By:   maxv
Date:           Sun Jan  6 16:10:51 UTC 2019

Modified Files:
        src/lib/libnvmm: libnvmm.3 libnvmm_x86.c nvmm.h
        src/sys/dev/nvmm: nvmm.c nvmm.h
        src/sys/dev/nvmm/x86: nvmm_x86.h nvmm_x86_svm.c

Log Message:
Improvements and fixes in NVMM.

Kernel driver:

 * Don't take an extra (unneeded) reference to the UAO.

 * Provide npc for HLT. I'm not really happy with it right now; it will
   likely be revisited.

 * Add the INT_SHADOW, INT_WINDOW_EXIT and NMI_WINDOW_EXIT states. Provide
   them in the exitstate too (see the first sketch after this list).

 * Don't take the TPR into account when processing INTs. The virtualizer
   can do that itself (Qemu already does).

 * Provide a hypervisor signature in CPUID, and hide SVM (a guest-side
   detection sketch follows this list).

 * Ignore certain MSRs. One special case is MSR_NB_CFG, where reads
   return NB_CFG_INITAPICCPUIDLO. Allow reads of MSR_TSC.

 * If the LWP has pending signals or softints, leave right away rather
   than waiting for a rescheduling to happen later. This reduces interrupt
   processing time in the guest (Qemu sends a signal to the thread, and
   now we leave immediately). This could be improved further by sending an
   actual IPI to the CPU, but I'll look into that later.
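
As an illustration of how a virtualizer can consume the new exit
state, here is a minimal sketch that gates event injection on the
interrupt shadow. It is hypothetical glue code, not part of this
commit; only the NVMM_X64_EXITSTATE_* indexes and their meaning come
from the sources below.

    #include <stdbool.h>
    #include <stdint.h>

    /* Index values from sys/dev/nvmm/x86/nvmm_x86.h (this commit). */
    #define NVMM_X64_EXITSTATE_INT_SHADOW       2
    #define NVMM_X64_EXITSTATE_INT_WINDOW_EXIT  3

    /*
     * Hypothetical virtualizer-side check: can a pending external
     * interrupt be injected right now? 'exitstate' is the
     * exit->exitstate[] array of struct nvmm_exit.
     */
    static bool
    can_inject_interrupt(const uint64_t *exitstate)
    {
        /* The guest is in an interrupt shadow (e.g. right after
         * STI or a MOV to SS): hold the event back for now. */
        if (exitstate[NVMM_X64_EXITSTATE_INT_SHADOW])
            return false;

        /* An INT window exit is already armed; the kernel will
         * report NVMM_EXIT_INT_READY once injection is possible. */
        if (exitstate[NVMM_X64_EXITSTATE_INT_WINDOW_EXIT])
            return false;

        return true;
    }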
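
For the CPUID signature, a guest can now identify NVMM through the
0x40000000 leaf. A minimal guest-side sketch, assuming GCC's
<cpuid.h>; the signature bytes come from the
svm_inkernel_handle_cpuid() hunk below, the rest is illustrative:

    #include <stdbool.h>
    #include <string.h>
    #include <cpuid.h>      /* GCC's __cpuid() macro */

    static bool
    guest_running_on_nvmm(void)
    {
        unsigned int eax, ebx, ecx, edx;
        char sig[12];

        /* Hypervisor leaves sit above the basic CPUID range, so
         * query 0x40000000 directly rather than via __get_cpuid(),
         * which would bounce it against the basic maximum leaf. */
        __cpuid(0x40000000, eax, ebx, ecx, edx);

        /* The kernel fills EBX/ECX/EDX with "___ ", "NVMM", " ___". */
        memcpy(&sig[0], &ebx, 4);
        memcpy(&sig[4], &ecx, 4);
        memcpy(&sig[8], &edx, 4);

        return memcmp(sig, "___ NVMM ___", 12) == 0;
    }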

Libnvmm:

 * Fix the MMU translation of large pages: we need to add the low bits
   of the GVA too (see the sketch after this list).

 * Change the IO and Mem structures to take a pointer rather than a
   static array. This provides more flexibility.

 * Batch together the str+rep IO transactions. We do one big memory
   read/write, and then send the IO commands to the hypervisor all at
   once. This considerably increases performance.

 * Decode MOVZX.
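
The large-page fix deserves a word: the PTE frame gives only the base
of the large page, and the code previously dropped the untranslated
low bits of the GVA, so every access within a large page resolved to
the page base. A minimal sketch of the 32-bit 4MB case; the constants
are stand-ins for the PTE32_* macros used in libnvmm_x86.c:

    #include <stdint.h>

    #define PS4M_FRAME   0xFFC00000U  /* stand-in for PTE32_L2_FRAME */
    #define PS4M_OFFSET  0x003FFFFFU  /* stand-in for PTE32_L1_MASK */

    /* Translate a GVA that hit a 4MB (PG_PS) L2 PTE. */
    static uint64_t
    large_page_gpa(uint32_t pte, uint32_t gva)
    {
        uint64_t gpa;

        gpa = (pte & PS4M_FRAME);    /* base of the 4MB page */
        gpa += (gva & PS4M_OFFSET);  /* the bits that used to be dropped */
        return gpa;
    }

For example, with a PTE whose frame is 0x00c00000 and a GVA of
0x00d23456, the old code returned 0x00c00000; the fixed code returns
0x00c00000 + 0x123456 = 0x00d23456. As for MOVZX, it is decoded like a
regular MOV, except that a zeroextend_mask records the destination
bits above the source width; those bits are cleared after the move,
matching the instruction's zero-extension semantics.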

With these changes in place, Qemu+NVMM works. I can install NetBSD 8.0
in a VM with multiple VCPUs, connect to the network, etc.


To generate a diff of this commit:
cvs rdiff -u -r1.6 -r1.7 src/lib/libnvmm/libnvmm.3
cvs rdiff -u -r1.9 -r1.10 src/lib/libnvmm/libnvmm_x86.c
cvs rdiff -u -r1.4 -r1.5 src/lib/libnvmm/nvmm.h
cvs rdiff -u -r1.4 -r1.5 src/sys/dev/nvmm/nvmm.c
cvs rdiff -u -r1.1 -r1.2 src/sys/dev/nvmm/nvmm.h
cvs rdiff -u -r1.2 -r1.3 src/sys/dev/nvmm/x86/nvmm_x86.h
cvs rdiff -u -r1.9 -r1.10 src/sys/dev/nvmm/x86/nvmm_x86_svm.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/lib/libnvmm/libnvmm.3
diff -u src/lib/libnvmm/libnvmm.3:1.6 src/lib/libnvmm/libnvmm.3:1.7
--- src/lib/libnvmm/libnvmm.3:1.6	Thu Dec 27 07:22:31 2018
+++ src/lib/libnvmm/libnvmm.3	Sun Jan  6 16:10:51 2019
@@ -1,4 +1,4 @@
-.\"	$NetBSD: libnvmm.3,v 1.6 2018/12/27 07:22:31 maxv Exp $
+.\"	$NetBSD: libnvmm.3,v 1.7 2019/01/06 16:10:51 maxv Exp $
 .\"
 .\" Copyright (c) 2018 The NetBSD Foundation, Inc.
 .\" All rights reserved.
@@ -27,7 +27,7 @@
 .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 .\" POSSIBILITY OF SUCH DAMAGE.
 .\"
-.Dd December 26, 2018
+.Dd January 06, 2019
 .Dt LIBNVMM 3
 .Os
 .Sh NAME
@@ -242,8 +242,6 @@ on CPU
 .Fa cpuid
 from machine
 .Fa mach .
-.Fa cb
-will be called to handle the transaction.
 See
 .Sx I/O Assist
 below for details.
@@ -255,8 +253,6 @@ on CPU
 .Fa cpuid
 from machine
 .Fa mach .
-.Fa cb
-will be called to handle the transaction.
 See
 .Sx Mem Assist
 below for details.
@@ -415,7 +411,7 @@ struct nvmm_io {
 	uint64_t port;
 	bool in;
 	size_t size;
-	uint8_t data[8];
+	uint8_t *data;
 };
 .Ed
 .Pp
@@ -463,7 +459,7 @@ struct nvmm_mem {
 	gpaddr_t gpa;
 	bool write;
 	size_t size;
-	uint8_t data[8];
+	uint8_t *data;
 };
 .Ed
 .Pp

Index: src/lib/libnvmm/libnvmm_x86.c
diff -u src/lib/libnvmm/libnvmm_x86.c:1.9 src/lib/libnvmm/libnvmm_x86.c:1.10
--- src/lib/libnvmm/libnvmm_x86.c:1.9	Fri Jan  4 10:25:39 2019
+++ src/lib/libnvmm/libnvmm_x86.c	Sun Jan  6 16:10:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: libnvmm_x86.c,v 1.9 2019/01/04 10:25:39 maxv Exp $	*/
+/*	$NetBSD: libnvmm_x86.c,v 1.10 2019/01/06 16:10:51 maxv Exp $	*/
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -45,6 +45,8 @@
 
 #include "nvmm.h"
 
+#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
+
 #include <x86/specialreg.h>
 
 extern struct nvmm_callbacks __callbacks;
@@ -83,6 +85,11 @@ nvmm_vcpu_dump(struct nvmm_machine *mach
 		    (void *)state.segs[i].limit,
 		    state.segs[i].attrib.p, state.segs[i].attrib.def32);
 	}
+	printf("| -> MSR_EFER=%p\n", (void *)state.msrs[NVMM_X64_MSR_EFER]);
+	printf("| -> CR0=%p\n", (void *)state.crs[NVMM_X64_CR_CR0]);
+	printf("| -> CR3=%p\n", (void *)state.crs[NVMM_X64_CR_CR3]);
+	printf("| -> CR4=%p\n", (void *)state.crs[NVMM_X64_CR_CR4]);
+	printf("| -> CR8=%p\n", (void *)state.crs[NVMM_X64_CR_CR8]);
 	printf("| -> CPL=%p\n", (void *)state.misc[NVMM_X64_MISC_CPL]);
 
 	return 0;
@@ -131,6 +138,7 @@ x86_gva_to_gpa_32bit(struct nvmm_machine
 		return -1;
 	if (pte & PG_PS) {
 		*gpa = (pte & PTE32_L2_FRAME);
+		*gpa = *gpa + (gva & PTE32_L1_MASK);
 		return 0;
 	}
 
@@ -215,6 +223,7 @@ x86_gva_to_gpa_32bit_pae(struct nvmm_mac
 		return -1;
 	if (pte & PG_PS) {
 		*gpa = (pte & PTE32_PAE_L2_FRAME);
+		*gpa = *gpa + (gva & PTE32_PAE_L1_MASK);
 		return 0;
 	}
 
@@ -320,6 +329,7 @@ x86_gva_to_gpa_64bit(struct nvmm_machine
 		return -1;
 	if (pte & PG_PS) {
 		*gpa = (pte & PTE64_L3_FRAME);
+		*gpa = *gpa + (gva & (PTE64_L2_MASK|PTE64_L1_MASK));
 		return 0;
 	}
 
@@ -341,6 +351,7 @@ x86_gva_to_gpa_64bit(struct nvmm_machine
 		return -1;
 	if (pte & PG_PS) {
 		*gpa = (pte & PTE64_L2_FRAME);
+		*gpa = *gpa + (gva & PTE64_L1_MASK);
 		return 0;
 	}
 
@@ -500,13 +511,34 @@ mask_from_adsize(size_t adsize)
 }
 
 static uint64_t
+rep_get_cnt(struct nvmm_x64_state *state, size_t adsize)
+{
+	uint64_t mask, cnt;
+
+	mask = mask_from_adsize(adsize);
+	cnt = state->gprs[NVMM_X64_GPR_RCX] & mask;
+
+	return cnt;
+}
+
+static void
+rep_set_cnt(struct nvmm_x64_state *state, size_t adsize, uint64_t cnt)
+{
+	uint64_t mask;
+
+	mask = mask_from_adsize(adsize);
+	state->gprs[NVMM_X64_GPR_RCX] &= ~mask;
+	state->gprs[NVMM_X64_GPR_RCX] |= cnt;
+}
+
+static uint64_t
 rep_dec_apply(struct nvmm_x64_state *state, size_t adsize)
 {
 	uint64_t mask, cnt;
 
 	mask = mask_from_adsize(adsize);
 
-	cnt = state->gprs[NVMM_X64_GPR_RCX] & mask; 
+	cnt = state->gprs[NVMM_X64_GPR_RCX] & mask;
 	cnt -= 1;
 	cnt &= mask;
 
@@ -521,6 +553,7 @@ read_guest_memory(struct nvmm_machine *m
     gvaddr_t gva, uint8_t *data, size_t size)
 {
 	struct nvmm_mem mem;
+	uint8_t membuf[8];
 	nvmm_prot_t prot;
 	gpaddr_t gpa;
 	uintptr_t hva;
@@ -547,6 +580,7 @@ read_guest_memory(struct nvmm_machine *m
 	is_mmio = (ret == -1);
 
 	if (is_mmio) {
+		mem.data = membuf;
 		mem.gva = gva;
 		mem.gpa = gpa;
 		mem.write = false;
@@ -572,6 +606,7 @@ write_guest_memory(struct nvmm_machine *
     gvaddr_t gva, uint8_t *data, size_t size)
 {
 	struct nvmm_mem mem;
+	uint8_t membuf[8];
 	nvmm_prot_t prot;
 	gpaddr_t gpa;
 	uintptr_t hva;
@@ -598,6 +633,7 @@ write_guest_memory(struct nvmm_machine *
 	is_mmio = (ret == -1);
 
 	if (is_mmio) {
+		mem.data = membuf;
 		mem.gva = gva;
 		mem.gpa = gpa;
 		mem.write = true;
@@ -622,16 +658,55 @@ write_guest_memory(struct nvmm_machine *
 
 static int fetch_segment(struct nvmm_machine *, struct nvmm_x64_state *);
 
+#define NVMM_IO_BATCH_SIZE	32
+
+static int
+assist_io_batch(struct nvmm_machine *mach, struct nvmm_x64_state *state,
+    struct nvmm_io *io, gvaddr_t gva, uint64_t cnt)
+{
+	uint8_t iobuf[NVMM_IO_BATCH_SIZE];
+	size_t i, iosize, iocnt;
+	int ret;
+
+	cnt = MIN(cnt, NVMM_IO_BATCH_SIZE);
+	iosize = MIN(io->size * cnt, NVMM_IO_BATCH_SIZE);
+	iocnt = iosize / io->size;
+
+	io->data = iobuf;
+
+	if (!io->in) {
+		ret = read_guest_memory(mach, state, gva, iobuf, iosize);
+		if (ret == -1)
+			return -1;
+	}
+
+	for (i = 0; i < iocnt; i++) {
+		(*__callbacks.io)(io);
+		io->data += io->size;
+	}
+
+	if (io->in) {
+		ret = write_guest_memory(mach, state, gva, iobuf, iosize);
+		if (ret == -1)
+			return -1;
+	}
+
+	return iocnt;
+}
+
 int
 nvmm_assist_io(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
     struct nvmm_exit *exit)
 {
 	struct nvmm_x64_state state;
 	struct nvmm_io io;
-	uint64_t cnt;
+	uint64_t cnt = 0; /* GCC */
+	uint8_t iobuf[8];
+	int iocnt = 1;
 	gvaddr_t gva;
 	int reg = 0; /* GCC */
 	int ret, seg;
+	bool psld = false;
 
 	if (__predict_false(exit->reason != NVMM_EXIT_IO)) {
 		errno = EINVAL;
@@ -641,6 +716,7 @@ nvmm_assist_io(struct nvmm_machine *mach
 	io.port = exit->u.io.port;
 	io.in = (exit->u.io.type == NVMM_EXIT_IO_IN);
 	io.size = exit->u.io.operand_size;
+	io.data = iobuf;
 
 	ret = nvmm_vcpu_getstate(mach, cpuid, &state,
 	    NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
@@ -648,6 +724,17 @@ nvmm_assist_io(struct nvmm_machine *mach
 	if (ret == -1)
 		return -1;
 
+	if (exit->u.io.rep) {
+		cnt = rep_get_cnt(&state, exit->u.io.address_size);
+		if (__predict_false(cnt == 0)) {
+			return 0;
+		}
+	}
+
+	if (__predict_false(state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D)) {
+		psld = true;
+	}
+
 	/*
 	 * Determine GVA.
 	 */
@@ -678,6 +765,13 @@ nvmm_assist_io(struct nvmm_machine *mach
 			if (ret == -1)
 				return -1;
 		}
+
+		if (exit->u.io.rep && !psld) {
+			iocnt = assist_io_batch(mach, &state, &io, gva, cnt);
+			if (iocnt == -1)
+				return -1;
+			goto done;
+		}
 	}
 
 	if (!io.in) {
@@ -704,16 +798,18 @@ nvmm_assist_io(struct nvmm_machine *mach
 		}
 	}
 
+done:
 	if (exit->u.io.str) {
-		if (state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) {
-			state.gprs[reg] -= io.size;
+		if (__predict_false(psld)) {
+			state.gprs[reg] -= iocnt * io.size;
 		} else {
-			state.gprs[reg] += io.size;
+			state.gprs[reg] += iocnt * io.size;
 		}
 	}
 
 	if (exit->u.io.rep) {
-		cnt = rep_dec_apply(&state, exit->u.io.address_size);
+		cnt -= iocnt;
+		rep_set_cnt(&state, exit->u.io.address_size, cnt);
 		if (cnt == 0) {
 			state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
 		}
@@ -858,6 +954,7 @@ struct x86_instr {
 	struct x86_rexpref rexpref;
 	size_t operand_size;
 	size_t address_size;
+	uint64_t zeroextend_mask;
 
 	struct x86_regmodrm regmodrm;
 
@@ -912,6 +1009,7 @@ struct x86_group_entry {
 #define OPSIZE_QUAD 0x08 /* 8 bytes */
 
 #define FLAG_z	0x02
+#define FLAG_e	0x10
 
 static const struct x86_group_entry group11[8] = {
 	[0] = { .emul = x86_emul_mov }
@@ -1230,6 +1328,34 @@ static const struct x86_opcode primary_o
 	},
 };
 
+static const struct x86_opcode secondary_opcode_table[] = {
+	/*
+	 * MOVZX
+	 */
+	{
+		/* Gv, Eb */
+		.byte = 0xB6,
+		.regmodrm = true,
+		.regtorm = false,
+		.szoverride = true,
+		.defsize = OPSIZE_BYTE,
+		.allsize = OPSIZE_WORD|OPSIZE_DOUB|OPSIZE_QUAD,
+		.flags = FLAG_e,
+		.emul = x86_emul_mov
+	},
+	{
+		/* Gv, Ew */
+		.byte = 0xB7,
+		.regmodrm = true,
+		.regtorm = false,
+		.szoverride = true,
+		.defsize = OPSIZE_WORD,
+		.allsize = OPSIZE_WORD|OPSIZE_DOUB|OPSIZE_QUAD,
+		.flags = FLAG_e,
+		.emul = x86_emul_mov
+	},
+};
+
 static const struct x86_reg gpr_map__rip = { NVMM_X64_GPR_RIP, 0xFFFFFFFFFFFFFFFF };
 
 /* [REX-present][enc][opsize] */
@@ -2059,6 +2185,67 @@ node_primary_opcode(struct x86_decode_fs
 	return 0;
 }
 
+static uint64_t
+size_to_mask(size_t size)
+{
+	switch (size) {
+	case 1:
+		return 0x00000000000000FF;
+	case 2:
+		return 0x000000000000FFFF;
+	case 4:
+		return 0x00000000FFFFFFFF;
+	case 8:
+	default:
+		return 0xFFFFFFFFFFFFFFFF;
+	}
+}
+
+static int
+node_secondary_opcode(struct x86_decode_fsm *fsm, struct x86_instr *instr)
+{
+	const struct x86_opcode *opcode;
+	uint8_t byte;
+	size_t i, n;
+
+	if (fsm_read(fsm, &byte, sizeof(byte)) == -1) {
+		return -1;
+	}
+
+	n = sizeof(secondary_opcode_table) / sizeof(secondary_opcode_table[0]);
+	for (i = 0; i < n; i++) {
+		if (secondary_opcode_table[i].byte == byte)
+			break;
+	}
+	if (i == n) {
+		return -1;
+	}
+	opcode = &secondary_opcode_table[i];
+
+	instr->opcode = opcode;
+	instr->emul = opcode->emul;
+	instr->operand_size = get_operand_size(fsm, instr);
+	instr->address_size = get_address_size(fsm, instr);
+
+	if (opcode->flags & FLAG_e) {
+		/*
+		 * Compute the mask for zero-extend. Update the operand size,
+		 * we move fewer bytes.
+		 */
+		instr->zeroextend_mask = size_to_mask(instr->operand_size);
+		instr->zeroextend_mask &= ~size_to_mask(opcode->defsize);
+		instr->operand_size = opcode->defsize;
+	}
+
+	if (opcode->regmodrm) {
+		fsm_advance(fsm, 1, node_regmodrm);
+	} else {
+		return -1;
+	}
+
+	return 0;
+}
+
 static int
 node_main(struct x86_decode_fsm *fsm, struct x86_instr *instr)
 {
@@ -2078,7 +2265,7 @@ node_main(struct x86_decode_fsm *fsm, st
 	 * after being introduced.
 	 */
 	if (byte == ESCAPE) {
-		return -1;
+		fsm_advance(fsm, 1, node_secondary_opcode);
 	} else if (!instr->rexpref.present) {
 		if (byte == VEX_1) {
 			return -1;
@@ -2600,10 +2787,12 @@ assist_mem_single(struct nvmm_machine *m
     struct x86_instr *instr)
 {
 	struct nvmm_mem mem;
+	uint8_t membuf[8];
 	uint64_t val;
 	int ret;
 
 	memset(&mem, 0, sizeof(mem));
+	mem.data = membuf;
 
 	switch (instr->src.type) {
 	case STORE_REG:
@@ -2703,6 +2892,7 @@ assist_mem_single(struct nvmm_machine *m
 		val = __SHIFTIN(val, instr->dst.u.reg->mask);
 		state->gprs[instr->dst.u.reg->num] &= ~instr->dst.u.reg->mask;
 		state->gprs[instr->dst.u.reg->num] |= val;
+		state->gprs[instr->dst.u.reg->num] &= ~instr->zeroextend_mask;
 	}
 
 	return 0;

Index: src/lib/libnvmm/nvmm.h
diff -u src/lib/libnvmm/nvmm.h:1.4 src/lib/libnvmm/nvmm.h:1.5
--- src/lib/libnvmm/nvmm.h:1.4	Thu Dec 27 07:22:31 2018
+++ src/lib/libnvmm/nvmm.h	Sun Jan  6 16:10:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: nvmm.h,v 1.4 2018/12/27 07:22:31 maxv Exp $	*/
+/*	$NetBSD: nvmm.h,v 1.5 2019/01/06 16:10:51 maxv Exp $	*/
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -50,7 +50,7 @@ struct nvmm_io {
 	uint64_t port;
 	bool in;
 	size_t size;
-	uint8_t data[8];
+	uint8_t *data;
 };
 
 struct nvmm_mem {
@@ -58,7 +58,7 @@ struct nvmm_mem {
 	gpaddr_t gpa;
 	bool write;
 	size_t size;
-	uint8_t data[8];
+	uint8_t *data;
 };
 
 struct nvmm_callbacks {

Index: src/sys/dev/nvmm/nvmm.c
diff -u src/sys/dev/nvmm/nvmm.c:1.4 src/sys/dev/nvmm/nvmm.c:1.5
--- src/sys/dev/nvmm/nvmm.c:1.4	Sat Dec 15 13:39:43 2018
+++ src/sys/dev/nvmm/nvmm.c	Sun Jan  6 16:10:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: nvmm.c,v 1.4 2018/12/15 13:39:43 maxv Exp $	*/
+/*	$NetBSD: nvmm.c,v 1.5 2019/01/06 16:10:51 maxv Exp $	*/
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.4 2018/12/15 13:39:43 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: nvmm.c,v 1.5 2019/01/06 16:10:51 maxv Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -644,9 +644,6 @@ nvmm_hva_map(struct nvmm_ioc_hva_map *ar
 	seg->uobj = uao_create(seg->size, 0);
 	uva = seg->hva;
 
-	/* Take a reference for the kernel. */
-	uao_reference(seg->uobj);
-
 	/* Take a reference for the user. */
 	uao_reference(seg->uobj);
 

Index: src/sys/dev/nvmm/nvmm.h
diff -u src/sys/dev/nvmm/nvmm.h:1.1 src/sys/dev/nvmm/nvmm.h:1.2
--- src/sys/dev/nvmm/nvmm.h:1.1	Wed Nov  7 07:43:08 2018
+++ src/sys/dev/nvmm/nvmm.h	Sun Jan  6 16:10:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: nvmm.h,v 1.1 2018/11/07 07:43:08 maxv Exp $	*/
+/*	$NetBSD: nvmm.h,v 1.2 2019/01/06 16:10:51 maxv Exp $	*/
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -106,12 +106,17 @@ struct nvmm_exit_msr {
 	uint64_t npc;
 };
 
+struct nvmm_exit_hlt {
+	uint64_t npc;
+};
+
 struct nvmm_exit {
 	enum nvmm_exit_reason reason;
 	union {
 		struct nvmm_exit_memory mem;
 		struct nvmm_exit_io io;
 		struct nvmm_exit_msr msr;
+		struct nvmm_exit_hlt hlt;
 	} u;
 	uint64_t exitstate[8];
 };

Index: src/sys/dev/nvmm/x86/nvmm_x86.h
diff -u src/sys/dev/nvmm/x86/nvmm_x86.h:1.2 src/sys/dev/nvmm/x86/nvmm_x86.h:1.3
--- src/sys/dev/nvmm/x86/nvmm_x86.h:1.2	Sun Nov 25 14:09:57 2018
+++ src/sys/dev/nvmm/x86/nvmm_x86.h	Sun Jan  6 16:10:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: nvmm_x86.h,v 1.2 2018/11/25 14:09:57 maxv Exp $	*/
+/*	$NetBSD: nvmm_x86.h,v 1.3 2019/01/06 16:10:51 maxv Exp $	*/
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -99,7 +99,10 @@
 
 /* Misc. */
 #define NVMM_X64_MISC_CPL		0
-#define NVMM_X64_NMISC			1
+#define NVMM_X64_MISC_INT_SHADOW	1
+#define NVMM_X64_MISC_INT_WINDOW_EXIT	2
+#define NVMM_X64_MISC_NMI_WINDOW_EXIT	3
+#define NVMM_X64_NMISC			4
 
 #ifndef ASM_NVMM
 
@@ -123,8 +126,11 @@ struct nvmm_x64_state_seg {
 };
 
 /* VM exit state indexes. */
-#define NVMM_X64_EXITSTATE_CR8		0
-#define NVMM_X64_EXITSTATE_RFLAGS	1
+#define NVMM_X64_EXITSTATE_CR8			0
+#define NVMM_X64_EXITSTATE_RFLAGS		1
+#define NVMM_X64_EXITSTATE_INT_SHADOW		2
+#define NVMM_X64_EXITSTATE_INT_WINDOW_EXIT	3
+#define NVMM_X64_EXITSTATE_NMI_WINDOW_EXIT	4
 
 /* Flags. */
 #define NVMM_X64_STATE_SEGS	0x01

Index: src/sys/dev/nvmm/x86/nvmm_x86_svm.c
diff -u src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.9 src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.10
--- src/sys/dev/nvmm/x86/nvmm_x86_svm.c:1.9	Thu Jan  3 08:02:49 2019
+++ src/sys/dev/nvmm/x86/nvmm_x86_svm.c	Sun Jan  6 16:10:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: nvmm_x86_svm.c,v 1.9 2019/01/03 08:02:49 maxv Exp $	*/
+/*	$NetBSD: nvmm_x86_svm.c,v 1.10 2019/01/06 16:10:51 maxv Exp $	*/
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_svm.c,v 1.9 2019/01/03 08:02:49 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: nvmm_x86_svm.c,v 1.10 2019/01/06 16:10:51 maxv Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -518,8 +518,11 @@ struct svm_cpudata {
 	bool ts_set;
 	struct xsave_header hfpu __aligned(16);
 
+	/* Event state */
+	bool int_window_exit;
+	bool nmi_window_exit;
+
 	/* Guest state */
-	bool in_nmi;
 	uint64_t tsc_offset;
 	struct xsave_header gfpu __aligned(16);
 };
@@ -530,26 +533,34 @@ struct svm_cpudata {
 #define SVM_EVENT_TYPE_SW_INT	4
 
 static void
-svm_event_waitexit_enable(struct vmcb *vmcb, bool nmi)
+svm_event_waitexit_enable(struct nvmm_cpu *vcpu, bool nmi)
 {
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+
 	if (nmi) {
 		vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_IRET;
+		cpudata->nmi_window_exit = true;
 	} else {
 		vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_VINTR;
-		vmcb->ctrl.v |= (VMCB_CTRL_V_IRQ |
-		    __SHIFTIN(0, VMCB_CTRL_V_INTR_VECTOR));
+		vmcb->ctrl.v |= (VMCB_CTRL_V_IRQ | VMCB_CTRL_V_IGN_TPR);
+		cpudata->int_window_exit = true;
 	}
 }
 
 static void
-svm_event_waitexit_disable(struct vmcb *vmcb, bool nmi)
+svm_event_waitexit_disable(struct nvmm_cpu *vcpu, bool nmi)
 {
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+	struct vmcb *vmcb = cpudata->vmcb;
+
 	if (nmi) {
 		vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_IRET;
+		cpudata->nmi_window_exit = false;
 	} else {
 		vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_VINTR;
-		vmcb->ctrl.v &= ~(VMCB_CTRL_V_IRQ |
-		    __SHIFTIN(0, VMCB_CTRL_V_INTR_VECTOR));
+		vmcb->ctrl.v &= ~(VMCB_CTRL_V_IRQ | VMCB_CTRL_V_IGN_TPR);
+		cpudata->int_window_exit = false;
 	}
 }
 
@@ -577,9 +588,7 @@ svm_vcpu_inject(struct nvmm_machine *mac
 {
 	struct svm_cpudata *cpudata = vcpu->cpudata;
 	struct vmcb *vmcb = cpudata->vmcb;
-	uint64_t rflags = vmcb->state.rflags;
 	int type = 0, err = 0;
-	uint64_t tpr;
 
 	if (event->vector >= 256) {
 		return EINVAL;
@@ -592,15 +601,14 @@ svm_vcpu_inject(struct nvmm_machine *mac
 			type = SVM_EVENT_TYPE_NMI;
 		}
 		if (type == SVM_EVENT_TYPE_NMI) {
-			if (cpudata->in_nmi) {
-				svm_event_waitexit_enable(vmcb, true);
+			if (cpudata->nmi_window_exit) {
 				return EAGAIN;
 			}
-			cpudata->in_nmi = true;
+			svm_event_waitexit_enable(vcpu, true);
 		} else {
-			tpr = __SHIFTOUT(vmcb->ctrl.v, VMCB_CTRL_V_TPR);
-			if ((rflags & PSL_I) == 0 || event->u.prio <= tpr) {
-				svm_event_waitexit_enable(vmcb, false);
+			if (((vmcb->state.rflags & PSL_I) == 0) ||
+			    ((vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0)) {
+				svm_event_waitexit_enable(vcpu, false);
 				return EAGAIN;
 			}
 		}
@@ -698,6 +706,14 @@ svm_inkernel_handle_cpuid(struct nvmm_cp
 		state->gprs[NVMM_X64_GPR_RCX] = sizeof(struct fxsave);
 		state->gprs[NVMM_X64_GPR_RDX] = svm_xcr0_mask >> 32;
 		break;
+	case 0x40000000:
+		memcpy(&state->gprs[NVMM_X64_GPR_RBX], "___ ", 4);
+		memcpy(&state->gprs[NVMM_X64_GPR_RCX], "NVMM", 4);
+		memcpy(&state->gprs[NVMM_X64_GPR_RDX], " ___", 4);
+		break;
+	case 0x80000001: /* No SVM in ECX. The rest is tunable. */
+		state->gprs[NVMM_X64_GPR_RCX] &= ~CPUID_SVM;
+		break;
 	default:
 		break;
 	}
@@ -760,6 +776,16 @@ svm_exit_cpuid(struct nvmm_machine *mach
 	exit->reason = NVMM_EXIT_NONE;
 }
 
+static void
+svm_exit_hlt(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
+    struct nvmm_exit *exit)
+{
+	struct svm_cpudata *cpudata = vcpu->cpudata;
+
+	exit->reason = NVMM_EXIT_HLT;
+	exit->u.hlt.npc = cpudata->vmcb->ctrl.nrip;
+}
+
 #define SVM_EXIT_IO_PORT	__BITS(31,16)
 #define SVM_EXIT_IO_SEG		__BITS(12,10)
 #define SVM_EXIT_IO_A64		__BIT(9)
@@ -827,20 +853,42 @@ svm_exit_io(struct nvmm_machine *mach, s
 	exit->u.io.npc = nextpc;
 }
 
+static const uint64_t msr_ignore_list[] = {
+	0xc0010055, /* MSR_CMPHALT */
+	MSR_DE_CFG,
+	MSR_IC_CFG,
+	MSR_UCODE_AMD_PATCHLEVEL
+};
+
 static bool
 svm_inkernel_handle_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
     struct nvmm_exit *exit)
 {
 	struct svm_cpudata *cpudata = vcpu->cpudata;
 	struct nvmm_x64_state *state = &cpudata->state;
-	uint64_t pat;
+	uint64_t val;
+	size_t i;
 
 	switch (exit->u.msr.type) {
 	case NVMM_EXIT_MSR_RDMSR:
 		if (exit->u.msr.msr == MSR_CR_PAT) {
-			pat = cpudata->vmcb->state.g_pat;
-			cpudata->vmcb->state.rax = (pat & 0xFFFFFFFF);
-			state->gprs[NVMM_X64_GPR_RDX] = (pat >> 32);
+			val = cpudata->vmcb->state.g_pat;
+			cpudata->vmcb->state.rax = (val & 0xFFFFFFFF);
+			state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
+			goto handled;
+		}
+		if (exit->u.msr.msr == MSR_NB_CFG) {
+			val = NB_CFG_INITAPICCPUIDLO;
+			cpudata->vmcb->state.rax = (val & 0xFFFFFFFF);
+			state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
+			goto handled;
+		}
+		for (i = 0; i < __arraycount(msr_ignore_list); i++) {
+			if (msr_ignore_list[i] != exit->u.msr.msr)
+				continue;
+			val = 0;
+			cpudata->vmcb->state.rax = (val & 0xFFFFFFFF);
+			state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
 			goto handled;
 		}
 		break;
@@ -861,6 +909,11 @@ svm_inkernel_handle_msr(struct nvmm_mach
 			cpudata->vmcb->state.g_pat = exit->u.msr.val;
 			goto handled;
 		}
+		for (i = 0; i < __arraycount(msr_ignore_list); i++) {
+			if (msr_ignore_list[i] != exit->u.msr.msr)
+				continue;
+			goto handled;
+		}
 		break;
 	}
 
@@ -1128,19 +1181,18 @@ svm_vcpu_run(struct nvmm_machine *mach, 
 			exit->reason = NVMM_EXIT_NONE;
 			break;
 		case VMCB_EXITCODE_VINTR:
-			svm_event_waitexit_disable(vmcb, false);
+			svm_event_waitexit_disable(vcpu, false);
 			exit->reason = NVMM_EXIT_INT_READY;
 			break;
 		case VMCB_EXITCODE_IRET:
-			svm_event_waitexit_disable(vmcb, true);
-			cpudata->in_nmi = false;
+			svm_event_waitexit_disable(vcpu, true);
 			exit->reason = NVMM_EXIT_NMI_READY;
 			break;
 		case VMCB_EXITCODE_CPUID:
 			svm_exit_cpuid(mach, vcpu, exit);
 			break;
 		case VMCB_EXITCODE_HLT:
-			exit->reason = NVMM_EXIT_HLT;
+			svm_exit_hlt(mach, vcpu, exit);
 			break;
 		case VMCB_EXITCODE_IOIO:
 			svm_exit_io(mach, vcpu, exit);
@@ -1186,10 +1238,20 @@ svm_vcpu_run(struct nvmm_machine *mach, 
 			break;
 		}
 
+		if (vmcb->ctrl.exitintinfo & VMCB_CTRL_EXITINTINFO_V) {
+			printf("WAS PROCESSING!\n");
+		}
+
 		/* If no reason to return to userland, keep rolling. */
 		if (curcpu()->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) {
 			break;
 		}
+		if (curcpu()->ci_data.cpu_softints != 0) {
+			break;
+		}
+		if (curlwp->l_flag & LW_USERRET) {
+			break;
+		}
 		if (exit->reason != NVMM_EXIT_NONE) {
 			break;
 		}
@@ -1204,6 +1266,13 @@ svm_vcpu_run(struct nvmm_machine *mach, 
 	    VMCB_CTRL_V_TPR);
 	exit->exitstate[NVMM_X64_EXITSTATE_RFLAGS] = vmcb->state.rflags;
 
+	exit->exitstate[NVMM_X64_EXITSTATE_INT_SHADOW] =
+	    ((vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0);
+	exit->exitstate[NVMM_X64_EXITSTATE_INT_WINDOW_EXIT] =
+	    cpudata->int_window_exit;
+	exit->exitstate[NVMM_X64_EXITSTATE_NMI_WINDOW_EXIT] =
+	    cpudata->nmi_window_exit;
+
 	return 0;
 }
 
@@ -1437,6 +1506,7 @@ svm_vcpu_init(struct nvmm_machine *mach,
 	 *  - SYSENTER_EIP [read, write]
 	 *  - FSBASE [read, write]
 	 *  - GSBASE [read, write]
+	 *  - TSC [read]
 	 *
 	 * Intercept the rest.
 	 */
@@ -1452,6 +1522,7 @@ svm_vcpu_init(struct nvmm_machine *mach,
 	svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_EIP, true, true);
 	svm_vcpu_msr_allow(cpudata->msrbm, MSR_FSBASE, true, true);
 	svm_vcpu_msr_allow(cpudata->msrbm, MSR_GSBASE, true, true);
+	svm_vcpu_msr_allow(cpudata->msrbm, MSR_TSC, true, false);
 	vmcb->ctrl.msrpm_base_pa = cpudata->msrbm_pa;
 
 	/* Generate ASID. */
@@ -1712,6 +1783,24 @@ svm_vcpu_setstate(struct nvmm_cpu *vcpu,
 		memcpy(cstate->misc, nstate->misc, sizeof(nstate->misc));
 
 		vmcb->state.cpl = cstate->misc[NVMM_X64_MISC_CPL];
+
+		if (cstate->misc[NVMM_X64_MISC_INT_SHADOW]) {
+			vmcb->ctrl.intr |= VMCB_CTRL_INTR_SHADOW;
+		} else {
+			vmcb->ctrl.intr &= ~VMCB_CTRL_INTR_SHADOW;
+		}
+
+		if (cstate->misc[NVMM_X64_MISC_INT_WINDOW_EXIT]) {
+			svm_event_waitexit_enable(vcpu, false);
+		} else {
+			svm_event_waitexit_disable(vcpu, false);
+		}
+
+		if (cstate->misc[NVMM_X64_MISC_NMI_WINDOW_EXIT]) {
+			svm_event_waitexit_enable(vcpu, true);
+		} else {
+			svm_event_waitexit_disable(vcpu, true);
+		}
 	}
 
 	CTASSERT(sizeof(cpudata->gfpu.xsh_fxsave) == sizeof(cstate->fpu));
@@ -1812,6 +1901,13 @@ svm_vcpu_getstate(struct nvmm_cpu *vcpu,
 	if (flags & NVMM_X64_STATE_MISC) {
 		cstate->misc[NVMM_X64_MISC_CPL] = vmcb->state.cpl;
 
+		cstate->misc[NVMM_X64_MISC_INT_SHADOW] =
+		    (vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0;
+		cstate->misc[NVMM_X64_MISC_INT_WINDOW_EXIT] =
+		    cpudata->int_window_exit;
+		cstate->misc[NVMM_X64_MISC_NMI_WINDOW_EXIT] =
+		    cpudata->nmi_window_exit;
+
 		memcpy(nstate->misc, cstate->misc, sizeof(cstate->misc));
 	}
 
