Module Name:    src
Committed By:   maxv
Date:           Sat Nov 10 09:28:56 UTC 2018

Modified Files:
        src/distrib/sets/lists/comp: md.amd64
        src/lib: Makefile
Added Files:
        src/lib/libnvmm: Makefile libnvmm.3 libnvmm.c libnvmm_x86.c nvmm.h
            shlib_version

Log Message:
Add libnvmm, NetBSD's new virtualization API. It provides a way for VMM
software to effortlessly create and manage virtual machines via NVMM.

It is mostly complete; only nvmm_assist_mem remains to be filled in -- I
have a draft for it, but it needs some more care. This Mem Assist should
not be needed when emulating a system in x2apic mode, so theoretically
the current form of libnvmm is sufficient to emulate a whole class of
systems.

Generally speaking, there are so many modes in x86 that it is difficult
to handle each corner case without introducing a ton of checks that just
slow down the common-case execution. Currently we check a limited number
of things; we may add more checks in the future if they turn out to be
needed, but that's rather low priority.

Libnvmm is compiled and installed only on amd64. A man page (reviewed by
wiz@) is provided.
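
For reference, here is a minimal sketch of the intended call sequence. It is
not part of this commit; the guest setup, the GPA layout, the map flags and
the I/O device model are hypothetical placeholders, and error handling is
omitted:

	#include <sys/mman.h>
	#include <string.h>
	#include <nvmm.h>

	static void
	io_callback(struct nvmm_io *io)
	{
		/* Hypothetical device: inputs read as zeroes, outputs dropped. */
		if (io->in)
			memset(io->data, 0, io->size);
	}

	int
	main(void)
	{
		struct nvmm_machine mach;
		struct nvmm_exit exit;
		uintptr_t hva;

		/* Backing memory for one page of guest RAM, placed at GPA 0. */
		hva = (uintptr_t)mmap(NULL, 4096, PROT_READ|PROT_WRITE,
		    MAP_ANON|MAP_PRIVATE, -1, 0);

		nvmm_machine_create(&mach);
		nvmm_vcpu_create(&mach, 0);
		nvmm_gpa_map(&mach, hva, 0, 4096, 0);

		/* ... load guest code into the page, set the VCPU state ... */

		for (;;) {
			nvmm_vcpu_run(&mach, 0, &exit);
			if (exit.reason == NVMM_EXIT_IO)
				nvmm_assist_io(&mach, 0, &exit, io_callback);
			else if (exit.reason == NVMM_EXIT_SHUTDOWN)
				break;
		}

		nvmm_machine_destroy(&mach);
		return 0;
	}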


To generate a diff of this commit:
cvs rdiff -u -r1.260 -r1.261 src/distrib/sets/lists/comp/md.amd64
cvs rdiff -u -r1.261 -r1.262 src/lib/Makefile
cvs rdiff -u -r0 -r1.1 src/lib/libnvmm/Makefile src/lib/libnvmm/libnvmm.3 \
    src/lib/libnvmm/libnvmm.c src/lib/libnvmm/libnvmm_x86.c \
    src/lib/libnvmm/nvmm.h src/lib/libnvmm/shlib_version

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/distrib/sets/lists/comp/md.amd64
diff -u src/distrib/sets/lists/comp/md.amd64:1.260 src/distrib/sets/lists/comp/md.amd64:1.261
--- src/distrib/sets/lists/comp/md.amd64:1.260	Wed Nov  7 07:43:07 2018
+++ src/distrib/sets/lists/comp/md.amd64	Sat Nov 10 09:28:56 2018
@@ -1,4 +1,4 @@
-# $NetBSD: md.amd64,v 1.260 2018/11/07 07:43:07 maxv Exp $
+# $NetBSD: md.amd64,v 1.261 2018/11/10 09:28:56 maxv Exp $
 
 ./usr/include/amd64				comp-c-include
 ./usr/include/amd64/ansi.h			comp-c-include
@@ -682,6 +682,7 @@
 ./usr/include/dev/nvmm/nvmm_ioctl.h		comp-c-include
 ./usr/include/dev/nvmm/x86			comp-c-include
 ./usr/include/dev/nvmm/x86/nvmm_x86.h		comp-c-include
+./usr/include/nvmm.h				comp-c-include
 ./usr/include/pmmintrin.h			comp-obsolete		obsolete
 ./usr/include/x64_64				comp-obsolete		obsolete
 ./usr/include/x64_64/ansi.h			comp-obsolete		obsolete
@@ -783,6 +784,12 @@
 ./usr/lib/i386/libi386.so			comp-sys-shlib		compat,pic
 ./usr/lib/i386/libi386_p.a			comp-c-proflib		compat,profile
 ./usr/lib/i386/libi386_pic.a			comp-c-piclib		compat,pic,picinstall
+./usr/lib/libnvmm.a				comp-c-lib		compatfile
+./usr/lib/libnvmm.so				comp-sys-shlib		compat,pic
+./usr/lib/libnvmm.so.0				comp-sys-shlib		compat,pic
+./usr/lib/libnvmm.so.0.1			comp-sys-shlib		compat,pic
+./usr/lib/libnvmm_p.a				comp-c-proflib		compatfile,profile
+./usr/lib/libnvmm_pic.a				comp-c-piclib		compat,pic,picinstall
 ./usr/lib/libx86_64.a				comp-c-lib
 ./usr/lib/libx86_64_p.a				comp-c-proflib		profile
 ./usr/lib/libx86_64_pic.a			comp-c-piclib		pic,picinstall
@@ -888,3 +895,6 @@
 ./usr/share/ldscripts/i386nbsd.xn		comp-obsolete		obsolete
 ./usr/share/ldscripts/i386nbsd.xr		comp-obsolete		obsolete
 ./usr/share/ldscripts/i386nbsd.xu		comp-obsolete		obsolete
+./usr/share/man/cat3/libnvmm.0			comp-c-catman		.cat
+./usr/share/man/html3/libnvmm.html		comp-c-htmlman		html
+./usr/share/man/man3/libnvmm.3			comp-c-man		.man

Index: src/lib/Makefile
diff -u src/lib/Makefile:1.261 src/lib/Makefile:1.262
--- src/lib/Makefile:1.261	Sat Sep  8 14:11:41 2018
+++ src/lib/Makefile	Sat Nov 10 09:28:56 2018
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile,v 1.261 2018/09/08 14:11:41 christos Exp $
+#	$NetBSD: Makefile,v 1.262 2018/11/10 09:28:56 maxv Exp $
 #	from: @(#)Makefile	5.25.1.1 (Berkeley) 5/7/91
 
 .include <bsd.own.mk>
@@ -50,6 +50,10 @@ SUBDIR+=	librumpclient
 SUBDIR+=	libskey
 .endif
 
+.if ${MACHINE_ARCH} == "x86_64"
+SUBDIR+=	libnvmm
+.endif
+
 .if (${MKMDNS} != "no")
 SUBDIR+=	../external/apache2/mDNSResponder/lib
 .endif

Added files:

Index: src/lib/libnvmm/Makefile
diff -u /dev/null src/lib/libnvmm/Makefile:1.1
--- /dev/null	Sat Nov 10 09:28:56 2018
+++ src/lib/libnvmm/Makefile	Sat Nov 10 09:28:56 2018
@@ -0,0 +1,17 @@
+# $NetBSD: Makefile,v 1.1 2018/11/10 09:28:56 maxv Exp $
+
+USE_SHLIBDIR=   yes
+
+.include <bsd.own.mk>
+
+LIB=		nvmm
+MAN=		libnvmm.3
+
+SRCS=		libnvmm.c libnvmm_x86.c
+
+INCS=		nvmm.h
+INCSDIR=	/usr/include
+
+WARNS=		5
+
+.include <bsd.lib.mk>
Index: src/lib/libnvmm/libnvmm.3
diff -u /dev/null src/lib/libnvmm/libnvmm.3:1.1
--- /dev/null	Sat Nov 10 09:28:56 2018
+++ src/lib/libnvmm/libnvmm.3	Sat Nov 10 09:28:56 2018
@@ -0,0 +1,484 @@
+.Dd September 12, 2018
+.Dt LIBNVMM 3
+.Os
+.Sh NAME
+.Nm libnvmm
+.Nd NetBSD Virtualization API
+.Sh LIBRARY
+.Lb libnvmm
+.Sh SYNOPSIS
+.In nvmm.h
+.Ft int
+.Fn nvmm_capability "struct nvmm_capability *cap"
+.Ft int
+.Fn nvmm_machine_create "struct nvmm_machine *mach"
+.Ft int
+.Fn nvmm_machine_destroy "struct nvmm_machine *mach"
+.Ft int
+.Fn nvmm_machine_configure "struct nvmm_machine *mach" "uint64_t op" \
+    "void *conf"
+.Ft int
+.Fn nvmm_vcpu_create "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid"
+.Ft int
+.Fn nvmm_vcpu_destroy "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid"
+.Ft int
+.Fn nvmm_vcpu_getstate "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \
+    "void *state" "uint64_t flags"
+.Ft int
+.Fn nvmm_vcpu_setstate "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \
+    "void *state" "uint64_t flags"
+.Ft int
+.Fn nvmm_vcpu_inject "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \
+    "struct nvmm_event *event"
+.Ft int
+.Fn nvmm_vcpu_run "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \
+    "struct nvmm_exit *exit"
+.Ft int
+.Fn nvmm_gpa_map "struct nvmm_machine *mach" "uintptr_t hva" "gpaddr_t gpa" \
+    "size_t size" "int flags"
+.Ft int
+.Fn nvmm_gpa_unmap "struct nvmm_machine *mach" "uintptr_t hva" "gpaddr_t gpa" \
+    "size_t size"
+.Ft int
+.Fn nvmm_gva_to_gpa "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \
+    "gvaddr_t gva" "gpaddr_t *gpa" "nvmm_prot_t *prot"
+.Ft int
+.Fn nvmm_gpa_to_hva "struct nvmm_machine *mach" "gpaddr_t gpa" \
+    "uintptr_t *hva"
+.Ft int
+.Fn nvmm_assist_io "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \
+    "struct nvmm_exit *exit" "void (*cb)(struct nvmm_io *)"
+.Ft int
+.Fn nvmm_assist_mem "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \
+    "struct nvmm_exit *exit" "void (*cb)(struct nvmm_mem *)"
+.Sh DESCRIPTION
+.Nm
+provides a library for VMM software to handle hardware-accelerated virtual
+machines in
+.Nx .
+A virtual machine is described by an opaque structure,
+.Cd nvmm_machine .
+VMM software should not attempt to modify this structure directly, and should
+use the API provided by
+.Nm
+to handle virtual machines.
+.Pp
+.Fn nvmm_capability
+gets the capabilities of NVMM.
+.Pp
+.Fn nvmm_machine_create
+creates a virtual machine in the kernel.
+The
+.Fa mach
+structure is initialized, and describes the machine.
+.Pp
+.Fn nvmm_machine_destroy
+destroys the virtual machine described in
+.Fa mach .
+.Pp
+.Fn nvmm_machine_configure
+configures, on the machine
+.Fa mach ,
+the parameter indicated in
+.Fa op .
+.Fa conf
+describes the value of the parameter.
+.Pp
+.Fn nvmm_vcpu_create
+creates a virtual CPU in the machine
+.Fa mach ,
+giving it the CPU id
+.Fa cpuid .
+.Pp
+.Fn nvmm_vcpu_destroy
+destroys the virtual CPU identified by
+.Fa cpuid
+in the machine
+.Fa mach .
+.Pp
+.Fn nvmm_vcpu_getstate
+gets the state of the virtual CPU identified by
+.Fa cpuid
+in the machine
+.Fa mach .
+The
+.Fa state
+argument is the address of a state area, and
+.Fa flags
+is the bitmap of the components that are to be retrieved.
+See
+.Sx VCPU State Area
+below for details.
+.Pp
+.Fn nvmm_vcpu_setstate
+sets the state of the virtual CPU identified by
+.Fa cpuid
+in the machine
+.Fa mach .
+The
+.Fa state
+argument is the address of a state area, and
+.Fa flags
+is the bitmap of the components that are to be set.
+See
+.Sx VCPU State Area
+below for details.
+.Pp
+.Fn nvmm_vcpu_run
+runs the CPU identified by
+.Fa cpuid
+in the machine
+.Fa mach ,
+until a VM exit is triggered.
+The
+.Fa exit
+structure is filled to indicate the exit reason, and the associated parameters
+if any.
+.Pp
+.Fn nvmm_gpa_map
+makes the guest physical memory area beginning at address
+.Fa gpa
+and of size
+.Fa size
+available in the machine
+.Fa mach .
+The area is mapped in the calling process' virtual address space, at address
+.Fa hva .
+.Pp
+.Fn nvmm_gpa_unmap
+removes the guest physical memory area beginning at address
+.Fa gpa
+and of size
+.Fa size
+from the machine
+.Fa mach .
+It also unmaps the area beginning at
+.Fa hva
+from the calling process' virtual address space.
+.Pp
+.Fn nvmm_gva_to_gpa
+translates, on the CPU
+.Fa cpuid
+from the machine
+.Fa mach ,
+the guest virtual address given in
+.Fa gva
+into a guest physical address returned in
+.Fa gpa .
+The associated page permissions are returned in
+.Fa prot .
+.Fa gva
+must be page-aligned.
+.Pp
+.Fn nvmm_gpa_to_hva
+translates, on the machine
+.Fa mach ,
+the guest physical address indicated in
+.Fa gpa
+into a host virtual address returned in
+.Fa hva .
+.Fa gpa
+must be page-aligned.
+.Pp
+.Fn nvmm_assist_io
+emulates the I/O operation described in
+.Fa exit
+on CPU
+.Fa cpuid
+from machine
+.Fa mach .
+.Fa cb
+will be called to handle the transaction.
+See
+.Sx I/O Assist
+below for details.
+.Pp
+.Fn nvmm_assist_mem
+emulates the Mem operation described in
+.Fa exit
+on CPU
+.Fa cpuid
+from machine
+.Fa mach .
+.Fa cb
+will be called to handle the transaction.
+See
+.Sx Mem Assist
+below for details.
+.Ss NVMM Capability
+The
+.Cd nvmm_capability
+structure helps VMM software identify the capabilities offered by NVMM on the
+host:
+.Bd -literal
+struct nvmm_capability {
+	uint64_t version;
+	uint64_t state_size;
+	uint64_t max_machines;
+	uint64_t max_vcpus;
+	uint64_t max_ram;
+	union {
+		struct {
+			...
+		} x86;
+		uint64_t rsvd[8];
+	} u;
+};
+.Ed
+.Pp
+For example, the
+.Cd max_machines
+field indicates the maximum number of virtual machines supported, while
+.Cd max_vcpus
+indicates the maximum number of VCPUs supported per virtual machine.
+.Ss VCPU State Area
+A VCPU state area is a structure that entirely defines the content of the
+registers of a VCPU.
+Only one such structure exists, for x86:
+.Bd -literal
+struct nvmm_x64_state {
+	...
+};
+.Ed
+.Pp
+Refer to functional examples to see precisely how to use this structure.
+.Ss Exit Reasons
+The
+.Cd nvmm_exit
+structure is used to handle VM exits:
+.Bd -literal
+enum nvmm_exit_reason {
+	NVMM_EXIT_NONE		= 0x0000000000000000,
+
+	/* General. */
+	NVMM_EXIT_MEMORY	= 0x0000000000000001,
+	NVMM_EXIT_IO		= 0x0000000000000002,
+	NVMM_EXIT_MSR		= 0x0000000000000003,
+	NVMM_EXIT_INT_READY	= 0x0000000000000004,
+	NVMM_EXIT_NMI_READY	= 0x0000000000000005,
+	NVMM_EXIT_SHUTDOWN	= 0x0000000000000006,
+
+	/* Instructions (x86). */
+	...
+
+	NVMM_EXIT_INVALID	= 0xFFFFFFFFFFFFFFFF
+};
+
+struct nvmm_exit {
+	enum nvmm_exit_reason reason;
+	union {
+		...
+	} u;
+	uint64_t exitstate[8];
+};
+.Ed
+.Pp
+The
+.Va reason
+field indicates the reason of the VM exit.
+Additional parameters describing the exit can be present in
+.Va u .
+.Va exitstate
+contains a partial, implementation-specific VCPU state, usable as a fast-path
+to retrieve certain state values.
+.Pp
+It is possible that a VM exit was caused by a reason internal to the host
+kernel, one that VMM software need not be concerned with.
+In this case, the exit reason is set to
+.Cd NVMM_EXIT_NONE .
+This gives a chance for VMM software to halt the VM in its tracks.
+.Pp
+Refer to functional examples to see precisely how to handle VM exits.
+.Ss Event Injection
+It is possible to inject an event into a VCPU.
+An event can be a hardware interrupt, a software interrupt, or a software
+exception, defined by:
+.Bd -literal
+enum nvmm_event_type {
+	NVMM_EVENT_INTERRUPT_HW,
+	NVMM_EVENT_INTERRUPT_SW,
+	NVMM_EVENT_EXCEPTION
+};
+
+struct nvmm_event {
+	enum nvmm_event_type type;
+	uint64_t vector;
+	union {
+		uint64_t error;
+		uint64_t prio;
+	} u;
+};
+.Ed
+.Pp
+This describes an event of type
+.Va type ,
+to be sent to vector number
+.Va vector ,
+with a possible additional
+.Va error
+or
+.Va prio
+code that is implementation-specific.
+.Pp
+It is possible that the VCPU is in a state where it cannot receive this
+event, if:
+.Pp
+.Bl -bullet -offset indent -compact
+.It
+the event is a hardware interrupt, and the VCPU runs with interrupts disabled,
+or
+.It
+the event is a non-maskable interrupt (NMI), and the VCPU is already in an
+in-NMI context.
+.El
+.Pp
+In this case,
+.Fn nvmm_vcpu_inject
+will return
+.Er EAGAIN ,
+and NVMM will cause a VM exit with reason
+.Cd NVMM_EXIT_INT_READY
+or
+.Cd NVMM_EXIT_NMI_READY
+to indicate that VMM software can now reinject the desired event.
+.Ss I/O Assist
+When a VM exit occurs with reason
+.Cd NVMM_EXIT_IO ,
+it is necessary for VMM software to emulate the associated I/O operation.
+.Nm
+provides an easy way for VMM software to perform that.
+.Pp
+.Fn nvmm_assist_io
+will call the
+.Fa cb
+callback function and give it a
+.Cd nvmm_io
+structure as argument.
+This structure describes an I/O transaction:
+.Bd -literal
+struct nvmm_io {
+	uint64_t port;
+	bool in;
+	size_t size;
+	uint8_t data[8];
+};
+.Ed
+.Pp
+The callback can emulate the operation using this descriptor, distinguishing
+two cases:
+.Pp
+.Bl -bullet -offset indent -compact
+.It
+The operation is an input.
+In this case, the callback should fill
+.Va data
+with the desired value.
+.It
+The operation is an output.
+In this case, the callback should read
+.Va data
+to retrieve the desired value.
+.El
+.Pp
+In either case,
+.Va port
+will indicate the I/O port,
+.Va in
+will indicate if the operation is an input, and
+.Va size
+will indicate the size of the access.
+.Ss Mem Assist
+When a VM exit occurs with reason
+.Cd NVMM_EXIT_MEMORY ,
+it is necessary for VMM software to emulate the associated memory operation.
+.Nm
+provides an easy way for VMM software to perform that, similar to the I/O
+Assist.
+.Pp
+.Fn nvmm_assist_mem
+will call the
+.Fa cb
+callback function and give it a
+.Cd nvmm_mem
+structure as argument.
+This structure describes a Mem transaction:
+.Bd -literal
+struct nvmm_mem {
+	gvaddr_t gva;
+	gpaddr_t gpa;
+	bool write;
+	size_t size;
+	uint8_t data[8];
+};
+.Ed
+.Pp
+The callback can emulate the operation using this descriptor, distinguishing
+two cases:
+.Pp
+.Bl -bullet -offset indent -compact
+.It
+The operation is a read.
+In this case, the callback should fill
+.Va data
+with the desired value.
+.It
+The operation is a write.
+In this case, the callback should read
+.Va data
+to retrieve the desired value.
+.El
+.Pp
+In either case,
+.Va gva
+will indicate the guest virtual address,
+.Va gpa
+will indicate the guest physical address,
+.Va write
+will indicate if the access is a write, and
+.Va size
+will indicate the size of the access.
+.Sh RETURN VALUES
+Upon successful completion, each of these functions returns zero.
+Otherwise, a value of \-1 is returned and the global
+variable
+.Va errno
+is set to indicate the error.
+.Sh FILES
+Functional examples:
+.Pp
+.Bl -tag -width XXXX -compact
+.It Pa src/share/examples/nvmm/toyvirt/
+Example of virtualizer.
+Launches the binary given as argument in a virtual machine.
+.It Pa src/share/examples/nvmm/smallkern/
+Example of a kernel that can be executed by toyvirt.
+.El
+.Sh ERRORS
+These functions will fail if:
+.Bl -tag -width [ENOBUFS]
+.It Bq Er EEXIST
+An attempt was made to create a machine or a VCPU that already exists.
+.It Bq Er EFAULT
+An attempt was made to emulate a memory-based operation in a guest, and the
+guest page tables did not have the permissions necessary for the operation
+to complete successfully.
+.It Bq Er EINVAL
+An inappropriate parameter was used.
+.It Bq Er ENOBUFS
+The maximum number of machines or VCPUs was reached.
+.It Bq Er ENOENT
+A query was made on a machine or a VCPU that does not exist.
+.It Bq Er EPERM
+An attempt was made to access a machine that does not belong to the process.
+.El
+.Pp
+In addition,
+.Fn nvmm_vcpu_inject
+uses the following error codes:
+.Bl -tag -width [ENOBUFS]
+.It Bq Er EAGAIN
+The VCPU cannot receive the event immediately.
+.El
+.Sh AUTHORS
+NVMM was designed and implemented by
+.An Maxime Villard .
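
The I/O Assist contract in the man page above maps onto a callback of this
shape (a sketch only; the port number and the device behavior are
hypothetical):

	#include <stdio.h>
	#include <string.h>
	#include <nvmm.h>

	/* Hypothetical device on port 0x3F8: dump outputs, input zeroes. */
	static void
	io_callback(struct nvmm_io *io)
	{
		if (io->port != 0x3F8)
			return;
		if (io->in) {
			/* Input: fill data with the value given to the guest. */
			memset(io->data, 0, io->size);
		} else {
			/* Output: consume the data written by the guest. */
			fwrite(io->data, 1, io->size, stdout);
		}
	}
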
Index: src/lib/libnvmm/libnvmm.c
diff -u /dev/null src/lib/libnvmm/libnvmm.c:1.1
--- /dev/null	Sat Nov 10 09:28:56 2018
+++ src/lib/libnvmm/libnvmm.c	Sat Nov 10 09:28:56 2018
@@ -0,0 +1,433 @@
+/*	$NetBSD: libnvmm.c,v 1.1 2018/11/10 09:28:56 maxv Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include "nvmm.h"
+
+static int nvmm_fd = -1;
+static size_t nvmm_page_size = 0;
+
+/* -------------------------------------------------------------------------- */
+
+static int
+_nvmm_area_add(struct nvmm_machine *mach, gpaddr_t gpa, uintptr_t hva,
+    size_t size)
+{
+	struct nvmm_area *area;
+	void *ptr;
+	size_t i;
+
+	for (i = 0; i < mach->nareas; i++) {
+		if (gpa >= mach->areas[i].gpa &&
+		    gpa < mach->areas[i].gpa + mach->areas[i].size) {
+			goto error;
+		}
+		if (gpa + size > mach->areas[i].gpa &&
+		    gpa + size < mach->areas[i].gpa + mach->areas[i].size) {
+			goto error;
+		}
+		if (gpa < mach->areas[i].gpa &&
+		    gpa + size >= mach->areas[i].gpa + mach->areas[i].size) {
+			goto error;
+		}
+	}
+
+	mach->nareas++;
+	ptr = realloc(mach->areas, mach->nareas * sizeof(struct nvmm_area));
+	if (ptr == NULL)
+		return -1;
+	mach->areas = ptr;
+
+	area = &mach->areas[mach->nareas-1];
+	area->gpa = gpa;
+	area->hva = hva;
+	area->size = size;
+
+	return 0;
+
+error:
+	errno = EEXIST;
+	return -1;
+}
+
+static int
+_nvmm_area_delete(struct nvmm_machine *mach, gpaddr_t gpa, uintptr_t hva,
+    size_t size)
+{
+	size_t i;
+
+	for (i = 0; i < mach->nareas; i++) {
+		if (gpa == mach->areas[i].gpa &&
+		    hva == mach->areas[i].hva &&
+		    size == mach->areas[i].size) {
+			break;
+		}
+	}
+	if (i == mach->nareas) {
+		errno = ENOENT;
+		return -1;
+	}
+
+	/* The source and destination overlap, so use memmove. */
+	memmove(&mach->areas[i], &mach->areas[i+1],
+	    (mach->nareas - i - 1) * sizeof(struct nvmm_area));
+	mach->nareas--;
+
+	return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+static int
+nvmm_init(void)
+{
+	if (nvmm_fd != -1)
+		return 0;
+	nvmm_fd = open("/dev/nvmm", O_RDWR);
+	if (nvmm_fd == -1)
+		return -1;
+	nvmm_page_size = sysconf(_SC_PAGESIZE);
+	return 0;
+}
+
+int
+nvmm_capability(struct nvmm_capability *cap)
+{
+	struct nvmm_ioc_capability args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_CAPABILITY, &args);
+	if (ret == -1)
+		return -1;
+
+	memcpy(cap, &args.cap, sizeof(args.cap));
+
+	return 0;
+}
+
+int
+nvmm_machine_create(struct nvmm_machine *mach)
+{
+	struct nvmm_ioc_machine_create args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_MACHINE_CREATE, &args);
+	if (ret == -1)
+		return -1;
+
+	memset(mach, 0, sizeof(*mach));
+	mach->machid = args.machid;
+
+	return 0;
+}
+
+int
+nvmm_machine_destroy(struct nvmm_machine *mach)
+{
+	struct nvmm_ioc_machine_destroy args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	args.machid = mach->machid;
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_MACHINE_DESTROY, &args);
+	if (ret == -1)
+		return -1;
+
+	free(mach->areas);
+
+	return 0;
+}
+
+int
+nvmm_machine_configure(struct nvmm_machine *mach, uint64_t op, void *conf)
+{
+	struct nvmm_ioc_machine_configure args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	args.machid = mach->machid;
+	args.op = op;
+	args.conf = conf;
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_MACHINE_CONFIGURE, &args);
+	if (ret == -1)
+		return -1;
+
+	return 0;
+}
+
+int
+nvmm_vcpu_create(struct nvmm_machine *mach, nvmm_cpuid_t cpuid)
+{
+	struct nvmm_ioc_vcpu_create args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	args.machid = mach->machid;
+	args.cpuid = cpuid;
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_CREATE, &args);
+	if (ret == -1)
+		return -1;
+
+	return 0;
+}
+
+int
+nvmm_vcpu_destroy(struct nvmm_machine *mach, nvmm_cpuid_t cpuid)
+{
+	struct nvmm_ioc_vcpu_destroy args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	args.machid = mach->machid;
+	args.cpuid = cpuid;
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_DESTROY, &args);
+	if (ret == -1)
+		return -1;
+
+	return 0;
+}
+
+int
+nvmm_vcpu_setstate(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
+    void *state, uint64_t flags)
+{
+	struct nvmm_ioc_vcpu_setstate args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	args.machid = mach->machid;
+	args.cpuid = cpuid;
+	args.state = state;
+	args.flags = flags;
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_SETSTATE, &args);
+	if (ret == -1)
+		return -1;
+
+	return 0;
+}
+
+int
+nvmm_vcpu_getstate(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
+    void *state, uint64_t flags)
+{
+	struct nvmm_ioc_vcpu_getstate args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	args.machid = mach->machid;
+	args.cpuid = cpuid;
+	args.state = state;
+	args.flags = flags;
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_GETSTATE, &args);
+	if (ret == -1)
+		return -1;
+
+	return 0;
+}
+
+int
+nvmm_vcpu_inject(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
+    struct nvmm_event *event)
+{
+	struct nvmm_ioc_vcpu_inject args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	args.machid = mach->machid;
+	args.cpuid = cpuid;
+	memcpy(&args.event, event, sizeof(args.event));
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_INJECT, &args);
+	if (ret == -1)
+		return -1;
+
+	return 0;
+}
+
+int
+nvmm_vcpu_run(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
+    struct nvmm_exit *exit)
+{
+	struct nvmm_ioc_vcpu_run args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	args.machid = mach->machid;
+	args.cpuid = cpuid;
+	memset(&args.exit, 0, sizeof(args.exit));
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_RUN, &args);
+	if (ret == -1)
+		return -1;
+
+	memcpy(exit, &args.exit, sizeof(args.exit));
+
+	return 0;
+}
+
+int
+nvmm_gpa_map(struct nvmm_machine *mach, uintptr_t hva, gpaddr_t gpa,
+    size_t size, int flags)
+{
+	struct nvmm_ioc_gpa_map args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	args.machid = mach->machid;
+	args.hva = hva;
+	args.gpa = gpa;
+	args.size = size;
+	args.flags = flags;
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_GPA_MAP, &args);
+	if (ret == -1)
+		return -1;
+
+	ret = _nvmm_area_add(mach, gpa, hva, size);
+	if (ret == -1) {
+		nvmm_gpa_unmap(mach, hva, gpa, size);
+		return -1;
+	}
+
+	return 0;
+}
+
+int
+nvmm_gpa_unmap(struct nvmm_machine *mach, uintptr_t hva, gpaddr_t gpa,
+    size_t size)
+{
+	struct nvmm_ioc_gpa_unmap args;
+	int ret;
+
+	if (nvmm_init() == -1) {
+		return -1;
+	}
+
+	ret = _nvmm_area_delete(mach, gpa, hva, size);
+	if (ret == -1)
+		return -1;
+
+	args.machid = mach->machid;
+	args.gpa = gpa;
+	args.size = size;
+
+	ret = ioctl(nvmm_fd, NVMM_IOC_GPA_UNMAP, &args);
+	if (ret == -1)
+		return -1;
+
+	ret = munmap((void *)hva, size);
+
+	return ret;
+}
+
+/*
+ * nvmm_gva_to_gpa(): architecture-specific.
+ */
+
+int
+nvmm_gpa_to_hva(struct nvmm_machine *mach, gpaddr_t gpa, uintptr_t *hva)
+{
+	size_t i;
+
+	if (gpa % nvmm_page_size != 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	for (i = 0; i < mach->nareas; i++) {
+		if (gpa < mach->areas[i].gpa) {
+			continue;
+		}
+		if (gpa >= mach->areas[i].gpa + mach->areas[i].size) {
+			continue;
+		}
+
+		*hva = mach->areas[i].hva + (gpa - mach->areas[i].gpa);
+		return 0;
+	}
+
+	errno = ENOENT;
+	return -1;
+}
+
+/*
+ * nvmm_assist_io(): architecture-specific.
+ */
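
The area bookkeeping in libnvmm.c above is what backs host-side access to
guest RAM. A sketch of reading guest memory through nvmm_gpa_to_hva,
assuming the range fits in one mapped area and gpa is page-aligned as the
function requires:

	/* Read len bytes of guest RAM starting at page-aligned gpa. */
	static int
	read_guest(struct nvmm_machine *mach, gpaddr_t gpa, void *buf,
	    size_t len)
	{
		uintptr_t hva;

		if (nvmm_gpa_to_hva(mach, gpa, &hva) == -1)
			return -1;
		memcpy(buf, (void *)hva, len);
		return 0;
	}
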
Index: src/lib/libnvmm/libnvmm_x86.c
diff -u /dev/null src/lib/libnvmm/libnvmm_x86.c:1.1
--- /dev/null	Sat Nov 10 09:28:56 2018
+++ src/lib/libnvmm/libnvmm_x86.c	Sat Nov 10 09:28:56 2018
@@ -0,0 +1,592 @@
+/*	$NetBSD: libnvmm_x86.c,v 1.1 2018/11/10 09:28:56 maxv Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <machine/vmparam.h>
+#include <machine/pte.h>
+#include <machine/psl.h>
+
+#include "nvmm.h"
+
+#include <x86/specialreg.h>
+
+/* -------------------------------------------------------------------------- */
+
+#define PTE32_L1_SHIFT	12
+#define PTE32_L2_SHIFT	22
+
+#define PTE32_L2_MASK	0xffc00000
+#define PTE32_L1_MASK	0x003ff000
+
+#define PTE32_L2_FRAME	(PTE32_L2_MASK)
+#define PTE32_L1_FRAME	(PTE32_L2_FRAME|PTE32_L1_MASK)
+
+#define pte32_l1idx(va)	(((va) & PTE32_L1_MASK) >> PTE32_L1_SHIFT)
+#define pte32_l2idx(va)	(((va) & PTE32_L2_MASK) >> PTE32_L2_SHIFT)
+
+typedef uint32_t pte_32bit_t;
+
+static int
+x86_gva_to_gpa_32bit(struct nvmm_machine *mach, uint64_t cr3,
+    gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot)
+{
+	gpaddr_t L2gpa, L1gpa;
+	uintptr_t L2hva, L1hva;
+	pte_32bit_t *pdir, pte;
+
+	/* We begin with an RWXU access. */
+	*prot = NVMM_PROT_ALL;
+
+	/* Parse L2. */
+	L2gpa = (cr3 & PG_FRAME);
+	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1)
+		return -1;
+	pdir = (pte_32bit_t *)L2hva;
+	pte = pdir[pte32_l2idx(gva)];
+	if ((pte & PG_V) == 0)
+		return -1;
+	if ((pte & PG_u) == 0)
+		*prot &= ~NVMM_PROT_USER;
+	if ((pte & PG_KW) == 0)
+		*prot &= ~NVMM_PROT_WRITE;
+	if ((pte & PG_PS) && !has_pse)
+		return -1;
+	if (pte & PG_PS) {
+		*gpa = (pte & PTE32_L2_FRAME);
+		return 0;
+	}
+
+	/* Parse L1. */
+	L1gpa = (pte & PG_FRAME);
+	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1)
+		return -1;
+	pdir = (pte_32bit_t *)L1hva;
+	pte = pdir[pte32_l1idx(gva)];
+	if ((pte & PG_V) == 0)
+		return -1;
+	if ((pte & PG_u) == 0)
+		*prot &= ~NVMM_PROT_USER;
+	if ((pte & PG_KW) == 0)
+		*prot &= ~NVMM_PROT_WRITE;
+	if (pte & PG_PS)
+		return -1;
+
+	*gpa = (pte & PG_FRAME);
+	return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+#define	PTE32_PAE_L1_SHIFT	12
+#define	PTE32_PAE_L2_SHIFT	21
+#define	PTE32_PAE_L3_SHIFT	30
+
+#define	PTE32_PAE_L3_MASK	0xc0000000
+#define	PTE32_PAE_L2_MASK	0x3fe00000
+#define	PTE32_PAE_L1_MASK	0x001ff000
+
+#define	PTE32_PAE_L3_FRAME	(PTE32_PAE_L3_MASK)
+#define	PTE32_PAE_L2_FRAME	(PTE32_PAE_L3_FRAME|PTE32_PAE_L2_MASK)
+#define	PTE32_PAE_L1_FRAME	(PTE32_PAE_L2_FRAME|PTE32_PAE_L1_MASK)
+
+#define pte32_pae_l1idx(va)	(((va) & PTE32_PAE_L1_MASK) >> PTE32_PAE_L1_SHIFT)
+#define pte32_pae_l2idx(va)	(((va) & PTE32_PAE_L2_MASK) >> PTE32_PAE_L2_SHIFT)
+#define pte32_pae_l3idx(va)	(((va) & PTE32_PAE_L3_MASK) >> PTE32_PAE_L3_SHIFT)
+
+typedef uint64_t pte_32bit_pae_t;
+
+static int
+x86_gva_to_gpa_32bit_pae(struct nvmm_machine *mach, uint64_t cr3,
+    gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot)
+{
+	gpaddr_t L3gpa, L2gpa, L1gpa;
+	uintptr_t L3hva, L2hva, L1hva;
+	pte_32bit_pae_t *pdir, pte;
+
+	/* We begin with an RWXU access. */
+	*prot = NVMM_PROT_ALL;
+
+	/* Parse L3. */
+	L3gpa = (cr3 & PG_FRAME);
+	if (nvmm_gpa_to_hva(mach, L3gpa, &L3hva) == -1)
+		return -1;
+	pdir = (pte_32bit_pae_t *)L3hva;
+	pte = pdir[pte32_pae_l3idx(gva)];
+	if ((pte & PG_V) == 0)
+		return -1;
+	if (pte & PG_NX)
+		*prot &= ~NVMM_PROT_EXEC;
+	if (pte & PG_PS)
+		return -1;
+
+	/* Parse L2. */
+	L2gpa = (pte & PG_FRAME);
+	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1)
+		return -1;
+	pdir = (pte_32bit_pae_t *)L2hva;
+	pte = pdir[pte32_pae_l2idx(gva)];
+	if ((pte & PG_V) == 0)
+		return -1;
+	if ((pte & PG_u) == 0)
+		*prot &= ~NVMM_PROT_USER;
+	if ((pte & PG_KW) == 0)
+		*prot &= ~NVMM_PROT_WRITE;
+	if (pte & PG_NX)
+		*prot &= ~NVMM_PROT_EXEC;
+	if ((pte & PG_PS) && !has_pse)
+		return -1;
+	if (pte & PG_PS) {
+		*gpa = (pte & PTE32_PAE_L2_FRAME);
+		return 0;
+	}
+
+	/* Parse L1. */
+	L1gpa = (pte & PG_FRAME);
+	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1)
+		return -1;
+	pdir = (pte_32bit_pae_t *)L1hva;
+	pte = pdir[pte32_pae_l1idx(gva)];
+	if ((pte & PG_V) == 0)
+		return -1;
+	if ((pte & PG_u) == 0)
+		*prot &= ~NVMM_PROT_USER;
+	if ((pte & PG_KW) == 0)
+		*prot &= ~NVMM_PROT_WRITE;
+	if (pte & PG_NX)
+		*prot &= ~NVMM_PROT_EXEC;
+	if (pte & PG_PS)
+		return -1;
+
+	*gpa = (pte & PG_FRAME);
+	return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+#define PTE64_L1_SHIFT	12
+#define PTE64_L2_SHIFT	21
+#define PTE64_L3_SHIFT	30
+#define PTE64_L4_SHIFT	39
+
+#define PTE64_L4_MASK	0x0000ff8000000000
+#define PTE64_L3_MASK	0x0000007fc0000000
+#define PTE64_L2_MASK	0x000000003fe00000
+#define PTE64_L1_MASK	0x00000000001ff000
+
+#define PTE64_L4_FRAME	PTE64_L4_MASK
+#define PTE64_L3_FRAME	(PTE64_L4_FRAME|PTE64_L3_MASK)
+#define PTE64_L2_FRAME	(PTE64_L3_FRAME|PTE64_L2_MASK)
+#define PTE64_L1_FRAME	(PTE64_L2_FRAME|PTE64_L1_MASK)
+
+#define pte64_l1idx(va)	(((va) & PTE64_L1_MASK) >> PTE64_L1_SHIFT)
+#define pte64_l2idx(va)	(((va) & PTE64_L2_MASK) >> PTE64_L2_SHIFT)
+#define pte64_l3idx(va)	(((va) & PTE64_L3_MASK) >> PTE64_L3_SHIFT)
+#define pte64_l4idx(va)	(((va) & PTE64_L4_MASK) >> PTE64_L4_SHIFT)
+
+typedef uint64_t pte_64bit_t;
+
+static inline bool
+x86_gva_64bit_canonical(gvaddr_t gva)
+{
+	/* Bits 63:47 must have the same value. */
+#define SIGN_EXTEND	0xffff800000000000ULL
+	return (gva & SIGN_EXTEND) == 0 || (gva & SIGN_EXTEND) == SIGN_EXTEND;
+}
+
+static int
+x86_gva_to_gpa_64bit(struct nvmm_machine *mach, uint64_t cr3,
+    gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot)
+{
+	gpaddr_t L4gpa, L3gpa, L2gpa, L1gpa;
+	uintptr_t L4hva, L3hva, L2hva, L1hva;
+	pte_64bit_t *pdir, pte;
+
+	/* We begin with an RWXU access. */
+	*prot = NVMM_PROT_ALL;
+
+	if (!x86_gva_64bit_canonical(gva))
+		return -1;
+
+	/* Parse L4. */
+	L4gpa = (cr3 & PG_FRAME);
+	if (nvmm_gpa_to_hva(mach, L4gpa, &L4hva) == -1)
+		return -1;
+	pdir = (pte_64bit_t *)L4hva;
+	pte = pdir[pte64_l4idx(gva)];
+	if ((pte & PG_V) == 0)
+		return -1;
+	if ((pte & PG_u) == 0)
+		*prot &= ~NVMM_PROT_USER;
+	if ((pte & PG_KW) == 0)
+		*prot &= ~NVMM_PROT_WRITE;
+	if (pte & PG_NX)
+		*prot &= ~NVMM_PROT_EXEC;
+	if (pte & PG_PS)
+		return -1;
+
+	/* Parse L3. */
+	L3gpa = (pte & PG_FRAME);
+	if (nvmm_gpa_to_hva(mach, L3gpa, &L3hva) == -1)
+		return -1;
+	pdir = (pte_64bit_t *)L3hva;
+	pte = pdir[pte64_l3idx(gva)];
+	if ((pte & PG_V) == 0)
+		return -1;
+	if ((pte & PG_u) == 0)
+		*prot &= ~NVMM_PROT_USER;
+	if ((pte & PG_KW) == 0)
+		*prot &= ~NVMM_PROT_WRITE;
+	if (pte & PG_NX)
+		*prot &= ~NVMM_PROT_EXEC;
+	if ((pte & PG_PS) && !has_pse)
+		return -1;
+	if (pte & PG_PS) {
+		*gpa = (pte & PTE64_L3_FRAME);
+		return 0;
+	}
+
+	/* Parse L2. */
+	L2gpa = (pte & PG_FRAME);
+	if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1)
+		return -1;
+	pdir = (pte_64bit_t *)L2hva;
+	pte = pdir[pte64_l2idx(gva)];
+	if ((pte & PG_V) == 0)
+		return -1;
+	if ((pte & PG_u) == 0)
+		*prot &= ~NVMM_PROT_USER;
+	if ((pte & PG_KW) == 0)
+		*prot &= ~NVMM_PROT_WRITE;
+	if (pte & PG_NX)
+		*prot &= ~NVMM_PROT_EXEC;
+	if ((pte & PG_PS) && !has_pse)
+		return -1;
+	if (pte & PG_PS) {
+		*gpa = (pte & PTE64_L2_FRAME);
+		return 0;
+	}
+
+	/* Parse L1. */
+	L1gpa = (pte & PG_FRAME);
+	if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1)
+		return -1;
+	pdir = (pte_64bit_t *)L1hva;
+	pte = pdir[pte64_l1idx(gva)];
+	if ((pte & PG_V) == 0)
+		return -1;
+	if ((pte & PG_u) == 0)
+		*prot &= ~NVMM_PROT_USER;
+	if ((pte & PG_KW) == 0)
+		*prot &= ~NVMM_PROT_WRITE;
+	if (pte & PG_NX)
+		*prot &= ~NVMM_PROT_EXEC;
+	if (pte & PG_PS)
+		return -1;
+
+	*gpa = (pte & PG_FRAME);
+	return 0;
+}
+
+static inline int
+x86_gva_to_gpa(struct nvmm_machine *mach, struct nvmm_x64_state *state,
+    gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot)
+{
+	bool is_pae, is_lng, has_pse;
+	uint64_t cr3;
+	int ret;
+
+	if ((state->crs[NVMM_X64_CR_CR0] & CR0_PG) == 0) {
+		/* No paging. */
+		*gpa = gva;
+		return 0;
+	}
+
+	is_pae = (state->crs[NVMM_X64_CR_CR4] & CR4_PAE) != 0;
+	is_lng = (state->msrs[NVMM_X64_MSR_EFER] & EFER_LME) != 0;
+	has_pse = (state->crs[NVMM_X64_CR_CR4] & CR4_PSE) != 0;
+	cr3 = state->crs[NVMM_X64_CR_CR3];
+
+	if (is_pae && is_lng) {
+		/* 64bit */
+		ret = x86_gva_to_gpa_64bit(mach, cr3, gva, gpa, has_pse, prot);
+	} else if (is_pae && !is_lng) {
+		/* 32bit PAE */
+		ret = x86_gva_to_gpa_32bit_pae(mach, cr3, gva, gpa, has_pse,
+		    prot);
+	} else if (!is_pae && !is_lng) {
+		/* 32bit */
+		ret = x86_gva_to_gpa_32bit(mach, cr3, gva, gpa, has_pse, prot);
+	} else {
+		ret = -1;
+	}
+
+	if (ret == -1) {
+		errno = EFAULT;
+	}
+
+	return ret;
+}
+
+int
+nvmm_gva_to_gpa(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
+    gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot)
+{
+	struct nvmm_x64_state state;
+	int ret;
+
+	if (gva & PAGE_MASK) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	ret = nvmm_vcpu_getstate(mach, cpuid, &state,
+	    NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
+	if (ret == -1)
+		return -1;
+
+	return x86_gva_to_gpa(mach, &state, gva, gpa, prot);
+}
+
+/* -------------------------------------------------------------------------- */
+
+static inline bool
+is_long_mode(struct nvmm_x64_state *state)
+{
+	return (state->msrs[NVMM_X64_MSR_EFER] & EFER_LME) != 0;
+}
+
+static inline bool
+is_illegal(struct nvmm_io *io, nvmm_prot_t prot)
+{
+	return (io->in && !(prot & NVMM_PROT_WRITE));
+}
+
+static int
+segment_apply(struct nvmm_x64_state_seg *seg, gvaddr_t *gva, size_t size)
+{
+	uint64_t limit;
+
+	/*
+	 * This is incomplete. We should check topdown, etc, really that's
+	 * tiring.
+	 */
+	if (__predict_false(!seg->attrib.p)) {
+		goto error;
+	}
+
+	limit = (seg->limit + 1);
+	if (__predict_true(seg->attrib.gran)) {
+		limit *= PAGE_SIZE;
+	}
+
+	/* The limit check applies to the offset, before adding the base. */
+	if (__predict_false(*gva + size > limit)) {
+		goto error;
+	}
+
+	*gva += seg->base;
+	return 0;
+
+error:
+	errno = EFAULT;
+	return -1;
+}
+
+int
+nvmm_assist_io(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
+    struct nvmm_exit *exit, void (*cb)(struct nvmm_io *))
+{
+	struct nvmm_x64_state state;
+	struct nvmm_io io;
+	nvmm_prot_t prot;
+	size_t remain, done;
+	uintptr_t hva;
+	gvaddr_t gva, off;
+	gpaddr_t gpa;
+	uint64_t rsi;
+	uint8_t tmp[8];
+	uint8_t *ptr, *ptr2;
+	bool cross;
+	int ret;
+
+	if (__predict_false(exit->reason != NVMM_EXIT_IO)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	io.port = exit->u.io.port;
+	io.in = (exit->u.io.type == NVMM_EXIT_IO_IN);
+	io.size = exit->u.io.operand_size;
+
+	ret = nvmm_vcpu_getstate(mach, cpuid, &state,
+	    NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
+	    NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
+	if (ret == -1)
+		return -1;
+
+	cross = false;
+
+	if (!exit->u.io.str) {
+		ptr = (uint8_t *)&state.gprs[NVMM_X64_GPR_RAX];
+	} else {
+		rsi = state.gprs[NVMM_X64_GPR_RSI];
+
+		switch (exit->u.io.address_size) {
+		case 8:
+			gva = rsi;
+			break;
+		case 4:
+			gva = (rsi & 0x00000000FFFFFFFF);
+			break;
+		case 2:
+		default: /* impossible */
+			gva = (rsi & 0x000000000000FFFF);
+			break;
+		}
+
+		if (!is_long_mode(&state)) {
+			ret = segment_apply(&state.segs[exit->u.io.seg], &gva,
+			    io.size);
+			if (ret == -1)
+				return -1;
+		}
+
+		off = (gva & PAGE_MASK);
+		gva &= ~PAGE_MASK;
+
+		ret = x86_gva_to_gpa(mach, &state, gva, &gpa, &prot);
+		if (ret == -1)
+			return -1;
+		if (__predict_false(is_illegal(&io, prot))) {
+			errno = EFAULT;
+			return -1;
+		}
+		ret = nvmm_gpa_to_hva(mach, gpa, &hva);
+		if (ret == -1)
+			return -1;
+
+		ptr = (uint8_t *)hva + off;
+
+		/*
+		 * Special case. If the buffer is in between two pages, we
+		 * need to retrieve data from the next page.
+		 */
+		if (__predict_false(off + io.size > PAGE_SIZE)) {
+			cross = true;
+			remain = off + io.size - PAGE_SIZE;
+			done = PAGE_SIZE - off;
+
+			memcpy(tmp, ptr, done);
+
+			ret = x86_gva_to_gpa(mach, &state, gva + PAGE_SIZE,
+			    &gpa, &prot);
+			if (ret == -1)
+				return -1;
+			if (__predict_false(is_illegal(&io, prot))) {
+				errno = EFAULT;
+				return -1;
+			}
+			ret = nvmm_gpa_to_hva(mach, gpa, &hva);
+			if (ret == -1)
+				return -1;
+
+			memcpy(&tmp[done], (uint8_t *)hva, remain);
+			/* Writes must target the second page itself. */
+			ptr2 = (uint8_t *)hva;
+		}
+	}
+
+	if (io.in) {
+		/* nothing to do */
+	} else if (__predict_false(cross)) {
+		/* tmp holds the bytes gathered from both pages. */
+		memcpy(io.data, tmp, io.size);
+	} else {
+		memcpy(io.data, ptr, io.size);
+	}
+
+	(*cb)(&io);
+
+	if (io.in) {
+		if (!exit->u.io.str)
+			state.gprs[NVMM_X64_GPR_RAX] = 0;
+		if (__predict_false(cross)) {
+			memcpy(ptr, io.data, done);
+			memcpy(ptr2, &io.data[done], remain);
+		} else {
+			memcpy(ptr, io.data, io.size);
+		}
+	} else {
+		/* nothing to do */
+	}
+
+	if (exit->u.io.rep) {
+		state.gprs[NVMM_X64_GPR_RCX] -= 1;
+		if (state.gprs[NVMM_X64_GPR_RCX] == 0) {
+			state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
+		}
+		if (exit->u.io.str) {
+			if (state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) {
+				state.gprs[NVMM_X64_GPR_RSI] -= io.size;
+			} else {
+				state.gprs[NVMM_X64_GPR_RSI] += io.size;
+			}
+		}
+	} else {
+		state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc;
+	}
+
+	ret = nvmm_vcpu_setstate(mach, cpuid, &state, NVMM_X64_STATE_GPRS);
+	if (ret == -1)
+		return -1;
+
+	return 0;
+}
+
+/* -------------------------------------------------------------------------- */
+
+int
+nvmm_assist_mem(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
+    struct nvmm_exit *exit, void (*cb)(struct nvmm_mem *))
+{
+	if (__predict_false(exit->reason != NVMM_EXIT_MEMORY)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	// TODO
+	errno = ENOSYS;
+	return -1;
+}
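
Once nvmm_assist_mem is filled in, its callback should follow the same
pattern as the I/O one, per the Mem Assist section of the man page. A
sketch, with a hypothetical read-as-zero MMIO device:

	static void
	mem_callback(struct nvmm_mem *mem)
	{
		if (mem->write) {
			/* Write: consume mem->data, update device state. */
		} else {
			/* Read: fill mem->data with the register value. */
			memset(mem->data, 0, mem->size);
		}
	}
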
Index: src/lib/libnvmm/nvmm.h
diff -u /dev/null src/lib/libnvmm/nvmm.h:1.1
--- /dev/null	Sat Nov 10 09:28:56 2018
+++ src/lib/libnvmm/nvmm.h	Sat Nov 10 09:28:56 2018
@@ -0,0 +1,103 @@
+/*	$NetBSD: nvmm.h,v 1.1 2018/11/10 09:28:56 maxv Exp $	*/
+
+/*
+ * Copyright (c) 2018 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Maxime Villard.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _LIBNVMM_H_
+#define _LIBNVMM_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <dev/nvmm/nvmm.h>
+#include <dev/nvmm/nvmm_ioctl.h>
+#ifdef __x86_64__
+#include <dev/nvmm/x86/nvmm_x86.h>
+#endif
+
+struct nvmm_area {
+	gpaddr_t gpa;
+	uintptr_t hva;
+	size_t size;
+};
+
+struct nvmm_machine {
+	nvmm_machid_t machid;
+	struct nvmm_area *areas;
+	size_t nareas;
+};
+
+struct nvmm_io {
+	uint64_t port;
+	bool in;
+	size_t size;
+	uint8_t data[8];
+};
+
+struct nvmm_mem {
+	gvaddr_t gva;
+	gpaddr_t gpa;
+	bool write;
+	size_t size;
+	uint8_t data[8];
+};
+
+#define NVMM_PROT_READ		0x01
+#define NVMM_PROT_WRITE		0x02
+#define NVMM_PROT_EXEC		0x04
+#define NVMM_PROT_USER		0x08
+#define NVMM_PROT_ALL		0x0F
+typedef uint64_t nvmm_prot_t;
+
+int nvmm_capability(struct nvmm_capability *);
+
+int nvmm_machine_create(struct nvmm_machine *);
+int nvmm_machine_destroy(struct nvmm_machine *);
+int nvmm_machine_configure(struct nvmm_machine *, uint64_t, void *);
+
+int nvmm_vcpu_create(struct nvmm_machine *, nvmm_cpuid_t);
+int nvmm_vcpu_destroy(struct nvmm_machine *, nvmm_cpuid_t);
+int nvmm_vcpu_setstate(struct nvmm_machine *, nvmm_cpuid_t, void *, uint64_t);
+int nvmm_vcpu_getstate(struct nvmm_machine *, nvmm_cpuid_t, void *, uint64_t);
+int nvmm_vcpu_inject(struct nvmm_machine *, nvmm_cpuid_t, struct nvmm_event *);
+int nvmm_vcpu_run(struct nvmm_machine *, nvmm_cpuid_t, struct nvmm_exit *);
+
+int nvmm_gpa_map(struct nvmm_machine *, uintptr_t, gpaddr_t, size_t, int);
+int nvmm_gpa_unmap(struct nvmm_machine *, uintptr_t, gpaddr_t, size_t);
+
+int nvmm_gva_to_gpa(struct nvmm_machine *, nvmm_cpuid_t, gvaddr_t, gpaddr_t *,
+    nvmm_prot_t *);
+int nvmm_gpa_to_hva(struct nvmm_machine *, gpaddr_t, uintptr_t *);
+
+int nvmm_assist_io(struct nvmm_machine *, nvmm_cpuid_t, struct nvmm_exit *,
+    void (*)(struct nvmm_io *));
+int nvmm_assist_mem(struct nvmm_machine *, nvmm_cpuid_t, struct nvmm_exit *,
+    void (*)(struct nvmm_mem *));
+
+#endif /* _LIBNVMM_H_ */
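
The header above also declares nvmm_vcpu_inject. A sketch of the retry
protocol described in the man page, with an arbitrary hardware-interrupt
vector; the pending-event bookkeeping is a hypothetical VMM-side detail:

	#include <errno.h>
	#include <string.h>
	#include <nvmm.h>

	static bool pending;
	static struct nvmm_event pending_ev;

	static void
	inject_irq(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
	    uint64_t vector)
	{
		struct nvmm_event ev;

		memset(&ev, 0, sizeof(ev));
		ev.type = NVMM_EVENT_INTERRUPT_HW;
		ev.vector = vector;

		/* EAGAIN: the VCPU cannot take the event right now. */
		if (nvmm_vcpu_inject(mach, cpuid, &ev) == -1 &&
		    errno == EAGAIN) {
			pending = true;
			pending_ev = ev;
		}
	}

	/*
	 * In the VM-exit loop, on NVMM_EXIT_INT_READY, reinject:
	 * if (pending && nvmm_vcpu_inject(mach, cpuid, &pending_ev) == 0)
	 *	pending = false;
	 */
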
Index: src/lib/libnvmm/shlib_version
diff -u /dev/null src/lib/libnvmm/shlib_version:1.1
--- /dev/null	Sat Nov 10 09:28:56 2018
+++ src/lib/libnvmm/shlib_version	Sat Nov 10 09:28:56 2018
@@ -0,0 +1,5 @@
+# $NetBSD: shlib_version,v 1.1 2018/11/10 09:28:56 maxv Exp $
+# Remember to update distrib/sets/lists/base/shl.* when changing
+#
+major=0
+minor=1
