Module Name: src Committed By: maxv Date: Sat Nov 10 09:28:56 UTC 2018
Modified Files: src/distrib/sets/lists/comp: md.amd64 src/lib: Makefile Added Files: src/lib/libnvmm: Makefile libnvmm.3 libnvmm.c libnvmm_x86.c nvmm.h shlib_version Log Message: Add libnvmm, NetBSD's new virtualization API. It provides a way for VMM software to effortlessly create and manage virtual machines via NVMM. It is mostly complete, only nvmm_assist_mem needs to be filled -- I have a draft for that, but it needs some more care. This Mem Assist should not be needed when emulating a system in x2apic mode, so theoretically the current form of libnvmm is sufficient to emulate a whole class of systems. Generally speaking, there are so many modes in x86 that it is difficult to handle each corner case without introducing a ton of checks that just slow down the common-case execution. Currently we check a limited number of things; we may add more checks in the future if they turn out to be needed, but that's rather low priority. Libnvmm is compiled and installed only on amd64. A man page (reviewed by wiz@) is provided. To generate a diff of this commit: cvs rdiff -u -r1.260 -r1.261 src/distrib/sets/lists/comp/md.amd64 cvs rdiff -u -r1.261 -r1.262 src/lib/Makefile cvs rdiff -u -r0 -r1.1 src/lib/libnvmm/Makefile src/lib/libnvmm/libnvmm.3 \ src/lib/libnvmm/libnvmm.c src/lib/libnvmm/libnvmm_x86.c \ src/lib/libnvmm/nvmm.h src/lib/libnvmm/shlib_version Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/distrib/sets/lists/comp/md.amd64 diff -u src/distrib/sets/lists/comp/md.amd64:1.260 src/distrib/sets/lists/comp/md.amd64:1.261 --- src/distrib/sets/lists/comp/md.amd64:1.260 Wed Nov 7 07:43:07 2018 +++ src/distrib/sets/lists/comp/md.amd64 Sat Nov 10 09:28:56 2018 @@ -1,4 +1,4 @@ -# $NetBSD: md.amd64,v 1.260 2018/11/07 07:43:07 maxv Exp $ +# $NetBSD: md.amd64,v 1.261 2018/11/10 09:28:56 maxv Exp $ ./usr/include/amd64 comp-c-include ./usr/include/amd64/ansi.h comp-c-include @@ -682,6 +682,7 @@ ./usr/include/dev/nvmm/nvmm_ioctl.h comp-c-include ./usr/include/dev/nvmm/x86 comp-c-include ./usr/include/dev/nvmm/x86/nvmm_x86.h comp-c-include +./usr/include/nvmm.h comp-c-include ./usr/include/pmmintrin.h comp-obsolete obsolete ./usr/include/x64_64 comp-obsolete obsolete ./usr/include/x64_64/ansi.h comp-obsolete obsolete @@ -783,6 +784,12 @@ ./usr/lib/i386/libi386.so comp-sys-shlib compat,pic ./usr/lib/i386/libi386_p.a comp-c-proflib compat,profile ./usr/lib/i386/libi386_pic.a comp-c-piclib compat,pic,picinstall +./usr/lib/libnvmm.a comp-c-lib compatfile +./usr/lib/libnvmm.so comp-sys-shlib compat,pic +./usr/lib/libnvmm.so.0 comp-sys-shlib compat,pic +./usr/lib/libnvmm.so.0.1 comp-sys-shlib compat,pic +./usr/lib/libnvmm_p.a comp-c-proflib compatfile,profile +./usr/lib/libnvmm_pic.a comp-c-piclib compat,pic,picinstall ./usr/lib/libx86_64.a comp-c-lib ./usr/lib/libx86_64_p.a comp-c-proflib profile ./usr/lib/libx86_64_pic.a comp-c-piclib pic,picinstall @@ -888,3 +895,6 @@ ./usr/share/ldscripts/i386nbsd.xn comp-obsolete obsolete ./usr/share/ldscripts/i386nbsd.xr comp-obsolete obsolete ./usr/share/ldscripts/i386nbsd.xu comp-obsolete obsolete +./usr/share/man/cat3/libnvmm.0 comp-c-catman .cat +./usr/share/man/html3/libnvmm.html comp-c-htmlman html +./usr/share/man/man3/libnvmm.3 comp-c-man .man Index: src/lib/Makefile diff -u src/lib/Makefile:1.261 src/lib/Makefile:1.262 --- src/lib/Makefile:1.261 Sat Sep 8 14:11:41 2018 +++ src/lib/Makefile Sat Nov 10 
09:28:56 2018 @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.261 2018/09/08 14:11:41 christos Exp $ +# $NetBSD: Makefile,v 1.262 2018/11/10 09:28:56 maxv Exp $ # from: @(#)Makefile 5.25.1.1 (Berkeley) 5/7/91 .include <bsd.own.mk> @@ -50,6 +50,10 @@ SUBDIR+= librumpclient SUBDIR+= libskey .endif +.if ${MACHINE_ARCH} == "x86_64" +SUBDIR+= libnvmm +.endif + .if (${MKMDNS} != "no") SUBDIR+= ../external/apache2/mDNSResponder/lib .endif Added files: Index: src/lib/libnvmm/Makefile diff -u /dev/null src/lib/libnvmm/Makefile:1.1 --- /dev/null Sat Nov 10 09:28:56 2018 +++ src/lib/libnvmm/Makefile Sat Nov 10 09:28:56 2018 @@ -0,0 +1,17 @@ +# $NetBSD: Makefile,v 1.1 2018/11/10 09:28:56 maxv Exp $ + +USE_SHLIBDIR= yes + +.include <bsd.own.mk> + +LIB= nvmm +MAN= libnvmm.3 + +SRCS= libnvmm.c libnvmm_x86.c + +INCS= nvmm.h +INCSDIR= /usr/include + +WARNS= 5 + +.include <bsd.lib.mk> Index: src/lib/libnvmm/libnvmm.3 diff -u /dev/null src/lib/libnvmm/libnvmm.3:1.1 --- /dev/null Sat Nov 10 09:28:56 2018 +++ src/lib/libnvmm/libnvmm.3 Sat Nov 10 09:28:56 2018 @@ -0,0 +1,484 @@ +.Dd September 12, 2018 +.Dt LIBNVMM 3 +.Os +.Sh NAME +.Nm libnvmm +.Nd NetBSD Virtualization API +.Sh LIBRARY +.Lb libnvmm +.Sh SYNOPSIS +.In nvmm.h +.Ft int +.Fn nvmm_capability "struct nvmm_capability *cap" +.Ft int +.Fn nvmm_machine_create "struct nvmm_machine *mach" +.Ft int +.Fn nvmm_machine_destroy "struct nvmm_machine *mach" +.Ft int +.Fn nvmm_machine_configure "struct nvmm_machine *mach" "uint64_t op" \ + "void *conf" +.Ft int +.Fn nvmm_vcpu_create "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" +.Ft int +.Fn nvmm_vcpu_destroy "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" +.Ft int +.Fn nvmm_vcpu_getstate "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \ + "void *state" "uint64_t flags" +.Ft int +.Fn nvmm_vcpu_setstate "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \ + "void *state" "uint64_t flags" +.Ft int +.Fn nvmm_vcpu_inject "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \ + "struct nvmm_event 
*event" +.Ft int +.Fn nvmm_vcpu_run "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \ + "struct nvmm_exit *exit" +.Ft int +.Fn nvmm_gpa_map "struct nvmm_machine *mach" "uintptr_t hva" "gpaddr_t gpa" \ + "size_t size" "int flags" +.Ft int +.Fn nvmm_gpa_unmap "struct nvmm_machine *mach" "uintptr_t hva" "gpaddr_t gpa" \ + "size_t size" +.Ft int +.Fn nvmm_gva_to_gpa "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \ + "gvaddr_t gva" "gpaddr_t *gpa" "nvmm_prot_t *prot" +.Ft int +.Fn nvmm_gpa_to_hva "struct nvmm_machine *mach" "gpaddr_t gpa" \ + "uintptr_t *hva" +.Ft int +.Fn nvmm_assist_io "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \ + "struct nvmm_exit *exit" "void (*cb)(struct nvmm_io *)" +.Ft int +.Fn nvmm_assist_mem "struct nvmm_machine *mach" "nvmm_cpuid_t cpuid" \ + "struct nvmm_exit *exit" "void (*cb)(struct nvmm_mem *)" +.Sh DESCRIPTION +.Nm +provides a library for VMM software to handle hardware-accelerated virtual +machines in +.Nx . +A virtual machine is described by an opaque structure, +.Cd nvmm_machine . +VMM software should not attempt to modify this structure directly, and should +use the API provided by +.Nm +to handle virtual machines. +.Pp +.Fn nvmm_capability +gets the capabilities of NVMM. +.Pp +.Fn nvmm_machine_create +creates a virtual machine in the kernel. +The +.Fa mach +structure is initialized, and describes the machine. +.Pp +.Fn nvmm_machine_destroy +destroys the virtual machine described in +.Fa mach . +.Pp +.Fn nvmm_machine_configure +configures, on the machine +.Fa mach , +the parameter indicated in +.Fa op . +.Fa conf +describes the value of the parameter. +.Pp +.Fn nvmm_vcpu_create +creates a virtual CPU in the machine +.Fa mach , +giving it the CPU id +.Fa cpuid . +.Pp +.Fn nvmm_vcpu_destroy +destroys the virtual CPU identified by +.Fa cpuid +in the machine +.Fa mach . +.Pp +.Fn nvmm_vcpu_getstate +gets the state of the virtual CPU identified by +.Fa cpuid +in the machine +.Fa mach . 
+The +.Fa state +argument is the address of a state area, and +.Fa flags +is the bitmap of the components that are to be retrieved. +See +.Sx VCPU State Area +below for details. +.Pp +.Fn nvmm_vcpu_setstate +sets the state of the virtual CPU identified by +.Fa cpuid +in the machine +.Fa mach . +The +.Fa state +argument is the address of a state area, and +.Fa flags +is the bitmap of the components that are to be set. +See +.Sx VCPU State Area +below for details. +.Pp +.Fn nvmm_vcpu_run +runs the CPU identified by +.Fa cpuid +in the machine +.Fa mach , +until a VM exit is triggered. +The +.Fa exit +structure is filled to indicate the exit reason, and the associated parameters +if any. +.Pp +.Fn nvmm_gpa_map +makes the guest physical memory area beginning on address +.Fa gpa +and of size +.Fa size +available in the machine +.Fa mach . +The area is mapped in the calling process' virtual address space, at address +.Fa hva . +.Pp +.Fn nvmm_gpa_unmap +removes the guest physical memory area beginning on address +.Fa gpa +and of size +.Fa size +from the machine +.Fa mach . +It also unmaps the area beginning on +.Fa hva +from the calling process' virtual address space. +.Pp +.Fn nvmm_gva_to_gpa +translates, on the CPU +.Fa cpuid +from the machine +.Fa mach , +the guest virtual address given in +.Fa gva +into a guest physical address returned in +.Fa gpa . +The associated page permissions are returned in +.Fa prot . +.Fa gva +must be page-aligned. +.Pp +.Fn nvmm_gpa_to_hva +translates, on the machine +.Fa mach , +the guest physical address indicated in +.Fa gpa +into a host virtual address returned in +.Fa hva . +.Fa gpa +must be page-aligned. +.Pp +.Fn nvmm_assist_io +emulates the I/O operation described in +.Fa exit +on CPU +.Fa cpuid +from machine +.Fa mach . +.Fa cb +will be called to handle the transaction. +See +.Sx I/O Assist +below for details. +.Pp +.Fn nvmm_assist_mem +emulates the Mem operation described in +.Fa exit +on CPU +.Fa cpuid +from machine +.Fa mach . 
+.Fa cb +will be called to handle the transaction. +See +.Sx Mem Assist +below for details. +.Ss NVMM Capability +The +.Cd nvmm_capability +structure helps VMM software identify the capabilities offered by NVMM on the +host: +.Bd -literal +struct nvmm_capability { + uint64_t version; + uint64_t state_size; + uint64_t max_machines; + uint64_t max_vcpus; + uint64_t max_ram; + union { + struct { + ... + } x86; + uint64_t rsvd[8]; + } u; +}; +.Ed +.Pp +For example, the +.Cd max_machines +field indicates the maximum number of virtual machines supported, while +.Cd max_vcpus +indicates the maximum number of VCPUs supported per virtual machine. +.Ss VCPU State Area +A VCPU state area is a structure that entirely defines the content of the +registers of a VCPU. +Only one such structure exists, for x86: +.Bd -literal +struct nvmm_x64_state { + ... +}; +.Ed +.Pp +Refer to functional examples to see precisely how to use this structure. +.Ss Exit Reasons +The +.Cd nvmm_exit +structure is used to handle VM exits: +.Bd -literal +enum nvmm_exit_reason { + NVMM_EXIT_NONE = 0x0000000000000000, + + /* General. */ + NVMM_EXIT_MEMORY = 0x0000000000000001, + NVMM_EXIT_IO = 0x0000000000000002, + NVMM_EXIT_MSR = 0x0000000000000003, + NVMM_EXIT_INT_READY = 0x0000000000000004, + NVMM_EXIT_NMI_READY = 0x0000000000000005, + NVMM_EXIT_SHUTDOWN = 0x0000000000000006, + + /* Instructions (x86). */ + ... + + NVMM_EXIT_INVALID = 0xFFFFFFFFFFFFFFFF +}; + +struct nvmm_exit { + enum nvmm_exit_reason reason; + union { + ... + } u; + uint64_t exitstate[8]; +}; +.Ed +.Pp +The +.Va reason +field indicates the reason of the VM exit. +Additional parameters describing the exit can be present in +.Va u . +.Va exitstate +contains a partial, implementation-specific VCPU state, usable as a fast-path +to retrieve certain state values. +.Pp +It is possible that a VM exit was caused by a reason internal to the host +kernel, and that VMM software should not be concerned with. 
+In this case, the exit reason is set to +.Cd NVMM_EXIT_NONE . +This gives a chance for VMM software to halt the VM in its tracks. +.Pp +Refer to functional examples to see precisely how to handle VM exits. +.Ss Event Injection +It is possible to inject an event into a VCPU. +An event can be a hardware interrupt, a software interrupt, or a software +exception, defined by: +.Bd -literal +enum nvmm_event_type { + NVMM_EVENT_INTERRUPT_HW, + NVMM_EVENT_INTERRUPT_SW, + NVMM_EVENT_EXCEPTION +}; + +struct nvmm_event { + enum nvmm_event_type type; + uint64_t vector; + union { + uint64_t error; + uint64_t prio; + } u; +}; +.Ed +.Pp +This describes an event of type +.Va type , +to be sent to vector number +.Va vector , +with a possible additional +.Va error +or +.Va prio +code that is implementation-specific. +.Pp +It is possible that the VCPU is in a state where it cannot receive this +event, if: +.Pp +.Bl -bullet -offset indent -compact +.It +the event is a hardware interrupt, and the VCPU runs with interrupts disabled, +or +.It +the event is a non-maskable interrupt (NMI), and the VCPU is already in an +in-NMI context. +.El +.Pp +In this case, +.Fn nvmm_vcpu_inject +will return +.Er EAGAIN , +and NVMM will cause a VM exit with reason +.Cd NVMM_EXIT_INT_READY +or +.Cd NVMM_EXIT_NMI_READY +to indicate that VMM software can now reinject the desired event. +.Ss I/O Assist +When a VM exit occurs with reason +.Cd NVMM_EXIT_IO , +it is necessary for VMM software to emulate the associated I/O operation. +.Nm +provides an easy way for VMM software to perform that. +.Pp +.Fn nvmm_assist_io +will call the +.Fa cb +callback function and give it a +.Cd nvmm_io +structure as argument. 
+This structure describes an I/O transaction: +.Bd -literal +struct nvmm_io { + uint64_t port; + bool in; + size_t size; + uint8_t data[8]; +}; +.Ed +.Pp +The callback can emulate the operation using this descriptor, following two +unique cases: +.Pp +.Bl -bullet -offset indent -compact +.It +The operation is an input. +In this case, the callback should fill +.Va data +with the desired value. +.It +The operation is an output. +In this case, the callback should read +.Va data +to retrieve the desired value. +.El +.Pp +In either case, +.Va port +will indicate the I/O port, +.Va in +will indicate if the operation is an input, and +.Va size +will indicate the size of the access. +.Ss Mem Assist +When a VM exit occurs with reason +.Cd NVMM_EXIT_MEMORY , +it is necessary for VMM software to emulate the associated memory operation. +.Nm +provides an easy way for VMM software to perform that, similar to the I/O +Assist. +.Pp +.Fn nvmm_assist_mem +will call the +.Fa cb +callback function and give it a +.Cd nvmm_mem +structure as argument. +This structure describes a Mem transaction: +.Bd -literal +struct nvmm_mem { + gvaddr_t gva; + gpaddr_t gpa; + bool write; + size_t size; + uint8_t data[8]; +}; +.Ed +.Pp +The callback can emulate the operation using this descriptor, following two +unique cases: +.Pp +.Bl -bullet -offset indent -compact +.It +The operation is a read. +In this case, the callback should fill +.Va data +with the desired value. +.It +The operation is a write. +In this case, the callback should read +.Va data +to retrieve the desired value. +.El +.Pp +In either case, +.Va gva +will indicate the guest virtual address, +.Va gpa +will indicate the guest physical address, +.Va write +will indicate if the access is a write, and +.Va size +will indicate the size of the access. +.Sh RETURN VALUES +Upon successful completion, each of these functions returns zero. +Otherwise, a value of \-1 is returned and the global +variable +.Va errno +is set to indicate the error. 
+.Sh FILES +Functional examples: +.Pp +.Bl -tag -width XXXX -compact +.It Pa src/share/examples/nvmm/toyvirt/ +Example of virtualizer. +Launches the binary given as argument in a virtual machine. +.It Pa src/share/examples/nvmm/smallkern/ +Example of a kernel that can be executed by toyvirt. +.El +.Sh ERRORS +These functions will fail if: +.Bl -tag -width [ENOBUFS] +.It Bq Er EEXIST +An attempt was made to create a machine or a VCPU that already exists. +.It Bq Er EFAULT +An attempt was made to emulate a memory-based operation in a guest, and the +guest page tables did not have the permissions necessary for the operation +to complete successfully. +.It Bq Er EINVAL +An inappropriate parameter was used. +.It Bq Er ENOBUFS +The maximum number of machines or VCPUs was reached. +.It Bq Er ENOENT +A query was made on a machine or a VCPU that does not exist. +.It Bq Er EPERM +An attempt was made to access a machine that does not belong to the process. +.El +.Pp +In addition, +.Fn nvmm_vcpu_inject +uses the following error codes: +.Bl -tag -width [ENOBUFS] +.It Bq Er EAGAIN +The VCPU cannot receive the event immediately. +.El +.Sh AUTHORS +NVMM was designed and implemented by +.An Maxime Villard . Index: src/lib/libnvmm/libnvmm.c diff -u /dev/null src/lib/libnvmm/libnvmm.c:1.1 --- /dev/null Sat Nov 10 09:28:56 2018 +++ src/lib/libnvmm/libnvmm.c Sat Nov 10 09:28:56 2018 @@ -0,0 +1,433 @@ +/* $NetBSD: libnvmm.c,v 1.1 2018/11/10 09:28:56 maxv Exp $ */ + +/* + * Copyright (c) 2018 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Maxime Villard. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include "nvmm.h" + +static int nvmm_fd = -1; +static size_t nvmm_page_size = 0; + +/* -------------------------------------------------------------------------- */ + +static int +_nvmm_area_add(struct nvmm_machine *mach, gpaddr_t gpa, uintptr_t hva, + size_t size) +{ + struct nvmm_area *area; + void *ptr; + size_t i; + + for (i = 0; i < mach->nareas; i++) { + if (gpa >= mach->areas[i].gpa && + gpa < mach->areas[i].gpa + mach->areas[i].size) { + goto error; + } + if (gpa + size >= mach->areas[i].gpa && + gpa + size < mach->areas[i].gpa + mach->areas[i].size) { + goto error; + } + if (gpa < mach->areas[i].gpa && + gpa + size >= mach->areas[i].gpa + mach->areas[i].size) { + goto error; + } + } + + mach->nareas++; + ptr = realloc(mach->areas, mach->nareas * sizeof(struct nvmm_area)); + if (ptr == NULL) + return -1; + mach->areas = ptr; + + area = &mach->areas[mach->nareas-1]; + area->gpa = gpa; + area->hva = hva; + area->size = size; + + return 0; + +error: + errno = EEXIST; + return -1; +} + +static int +_nvmm_area_delete(struct nvmm_machine *mach, gpaddr_t gpa, uintptr_t hva, + size_t size) +{ + size_t i; + + for (i = 0; i < mach->nareas; i++) { + if (gpa == mach->areas[i].gpa && + hva == mach->areas[i].hva && + size == mach->areas[i].size) { + break; + } + } + if (i == mach->nareas) { + errno = ENOENT; + return -1; + } + + memcpy(&mach->areas[i], &mach->areas[i+1], + (mach->nareas - i - 1) * sizeof(struct nvmm_area)); + mach->nareas--; + + return 0; +} + +/* -------------------------------------------------------------------------- */ + +static int +nvmm_init(void) +{ + if (nvmm_fd != -1) + return 0; + nvmm_fd = open("/dev/nvmm", O_RDWR); + if (nvmm_fd == -1) + return -1; + nvmm_page_size = sysconf(_SC_PAGESIZE); + return 0; +} + +int +nvmm_capability(struct 
nvmm_capability *cap) +{ + struct nvmm_ioc_capability args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + ret = ioctl(nvmm_fd, NVMM_IOC_CAPABILITY, &args); + if (ret == -1) + return -1; + + memcpy(cap, &args.cap, sizeof(args.cap)); + + return 0; +} + +int +nvmm_machine_create(struct nvmm_machine *mach) +{ + struct nvmm_ioc_machine_create args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + ret = ioctl(nvmm_fd, NVMM_IOC_MACHINE_CREATE, &args); + if (ret == -1) + return -1; + + memset(mach, 0, sizeof(*mach)); + mach->machid = args.machid; + + return 0; +} + +int +nvmm_machine_destroy(struct nvmm_machine *mach) +{ + struct nvmm_ioc_machine_destroy args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + args.machid = mach->machid; + + ret = ioctl(nvmm_fd, NVMM_IOC_MACHINE_DESTROY, &args); + if (ret == -1) + return -1; + + free(mach->areas); + + return 0; +} + +int +nvmm_machine_configure(struct nvmm_machine *mach, uint64_t op, void *conf) +{ + struct nvmm_ioc_machine_configure args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + args.machid = mach->machid; + args.op = op; + args.conf = conf; + + ret = ioctl(nvmm_fd, NVMM_IOC_MACHINE_CONFIGURE, &args); + if (ret == -1) + return -1; + + return 0; +} + +int +nvmm_vcpu_create(struct nvmm_machine *mach, nvmm_cpuid_t cpuid) +{ + struct nvmm_ioc_vcpu_create args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + args.machid = mach->machid; + args.cpuid = cpuid; + + ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_CREATE, &args); + if (ret == -1) + return -1; + + return 0; +} + +int +nvmm_vcpu_destroy(struct nvmm_machine *mach, nvmm_cpuid_t cpuid) +{ + struct nvmm_ioc_vcpu_destroy args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + args.machid = mach->machid; + args.cpuid = cpuid; + + ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_DESTROY, &args); + if (ret == -1) + return -1; + + return 0; +} + +int +nvmm_vcpu_setstate(struct nvmm_machine *mach, nvmm_cpuid_t cpuid, + void 
*state, uint64_t flags) +{ + struct nvmm_ioc_vcpu_setstate args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + args.machid = mach->machid; + args.cpuid = cpuid; + args.state = state; + args.flags = flags; + + ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_SETSTATE, &args); + if (ret == -1) + return -1; + + return 0; +} + +int +nvmm_vcpu_getstate(struct nvmm_machine *mach, nvmm_cpuid_t cpuid, + void *state, uint64_t flags) +{ + struct nvmm_ioc_vcpu_getstate args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + args.machid = mach->machid; + args.cpuid = cpuid; + args.state = state; + args.flags = flags; + + ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_GETSTATE, &args); + if (ret == -1) + return -1; + + return 0; +} + +int +nvmm_vcpu_inject(struct nvmm_machine *mach, nvmm_cpuid_t cpuid, + struct nvmm_event *event) +{ + struct nvmm_ioc_vcpu_inject args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + args.machid = mach->machid; + args.cpuid = cpuid; + memcpy(&args.event, event, sizeof(args.event)); + + ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_INJECT, &args); + if (ret == -1) + return -1; + + return 0; +} + +int +nvmm_vcpu_run(struct nvmm_machine *mach, nvmm_cpuid_t cpuid, + struct nvmm_exit *exit) +{ + struct nvmm_ioc_vcpu_run args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + args.machid = mach->machid; + args.cpuid = cpuid; + memset(&args.exit, 0, sizeof(args.exit)); + + ret = ioctl(nvmm_fd, NVMM_IOC_VCPU_RUN, &args); + if (ret == -1) + return -1; + + memcpy(exit, &args.exit, sizeof(args.exit)); + + return 0; +} + +int +nvmm_gpa_map(struct nvmm_machine *mach, uintptr_t hva, gpaddr_t gpa, + size_t size, int flags) +{ + struct nvmm_ioc_gpa_map args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + args.machid = mach->machid; + args.hva = hva; + args.gpa = gpa; + args.size = size; + args.flags = flags; + + ret = ioctl(nvmm_fd, NVMM_IOC_GPA_MAP, &args); + if (ret == -1) + return -1; + + ret = _nvmm_area_add(mach, gpa, hva, size); 
+ if (ret == -1) { + nvmm_gpa_unmap(mach, hva, gpa, size); + return -1; + } + + return 0; +} + +int +nvmm_gpa_unmap(struct nvmm_machine *mach, uintptr_t hva, gpaddr_t gpa, + size_t size) +{ + struct nvmm_ioc_gpa_unmap args; + int ret; + + if (nvmm_init() == -1) { + return -1; + } + + ret = _nvmm_area_delete(mach, gpa, hva, size); + if (ret == -1) + return -1; + + args.machid = mach->machid; + args.gpa = gpa; + args.size = size; + + ret = ioctl(nvmm_fd, NVMM_IOC_GPA_UNMAP, &args); + if (ret == -1) + return -1; + + ret = munmap((void *)hva, size); + + return ret; +} + +/* + * nvmm_gva_to_gpa(): architecture-specific. + */ + +int +nvmm_gpa_to_hva(struct nvmm_machine *mach, gpaddr_t gpa, uintptr_t *hva) +{ + size_t i; + + if (gpa % nvmm_page_size != 0) { + errno = EINVAL; + return -1; + } + + for (i = 0; i < mach->nareas; i++) { + if (gpa < mach->areas[i].gpa) { + continue; + } + if (gpa >= mach->areas[i].gpa + mach->areas[i].size) { + continue; + } + + *hva = mach->areas[i].hva + (gpa - mach->areas[i].gpa); + return 0; + } + + errno = ENOENT; + return -1; +} + +/* + * nvmm_assist_io(): architecture-specific. + */ Index: src/lib/libnvmm/libnvmm_x86.c diff -u /dev/null src/lib/libnvmm/libnvmm_x86.c:1.1 --- /dev/null Sat Nov 10 09:28:56 2018 +++ src/lib/libnvmm/libnvmm_x86.c Sat Nov 10 09:28:56 2018 @@ -0,0 +1,592 @@ +/* $NetBSD: libnvmm_x86.c,v 1.1 2018/11/10 09:28:56 maxv Exp $ */ + +/* + * Copyright (c) 2018 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Maxime Villard. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <machine/vmparam.h> +#include <machine/pte.h> +#include <machine/psl.h> + +#include "nvmm.h" + +#include <x86/specialreg.h> + +/* -------------------------------------------------------------------------- */ + +#define PTE32_L1_SHIFT 12 +#define PTE32_L2_SHIFT 22 + +#define PTE32_L2_MASK 0xffc00000 +#define PTE32_L1_MASK 0x003ff000 + +#define PTE32_L2_FRAME (PTE32_L2_MASK) +#define PTE32_L1_FRAME (PTE32_L2_FRAME|PTE32_L1_MASK) + +#define pte32_l1idx(va) (((va) & PTE32_L1_MASK) >> PTE32_L1_SHIFT) +#define pte32_l2idx(va) (((va) & PTE32_L2_MASK) >> PTE32_L2_SHIFT) + +typedef uint32_t pte_32bit_t; + +static int +x86_gva_to_gpa_32bit(struct nvmm_machine *mach, uint64_t cr3, + gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot) +{ + gpaddr_t L2gpa, L1gpa; + uintptr_t L2hva, L1hva; + pte_32bit_t *pdir, pte; + + /* We begin with an RWXU access. */ + *prot = NVMM_PROT_ALL; + + /* Parse L2. */ + L2gpa = (cr3 & PG_FRAME); + if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1) + return -1; + pdir = (pte_32bit_t *)L2hva; + pte = pdir[pte32_l2idx(gva)]; + if ((pte & PG_V) == 0) + return -1; + if ((pte & PG_u) == 0) + *prot &= ~NVMM_PROT_USER; + if ((pte & PG_KW) == 0) + *prot &= ~NVMM_PROT_WRITE; + if ((pte & PG_PS) && !has_pse) + return -1; + if (pte & PG_PS) { + *gpa = (pte & PTE32_L2_FRAME); + return 0; + } + + /* Parse L1. 
*/ + L1gpa = (pte & PG_FRAME); + if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1) + return -1; + pdir = (pte_32bit_t *)L1hva; + pte = pdir[pte32_l1idx(gva)]; + if ((pte & PG_V) == 0) + return -1; + if ((pte & PG_u) == 0) + *prot &= ~NVMM_PROT_USER; + if ((pte & PG_KW) == 0) + *prot &= ~NVMM_PROT_WRITE; + if (pte & PG_PS) + return -1; + + *gpa = (pte & PG_FRAME); + return 0; +} + +/* -------------------------------------------------------------------------- */ + +#define PTE32_PAE_L1_SHIFT 12 +#define PTE32_PAE_L2_SHIFT 21 +#define PTE32_PAE_L3_SHIFT 30 + +#define PTE32_PAE_L3_MASK 0xc0000000 +#define PTE32_PAE_L2_MASK 0x3fe00000 +#define PTE32_PAE_L1_MASK 0x001ff000 + +#define PTE32_PAE_L3_FRAME (PTE32_PAE_L3_MASK) +#define PTE32_PAE_L2_FRAME (PTE32_PAE_L3_FRAME|PTE32_PAE_L2_MASK) +#define PTE32_PAE_L1_FRAME (PTE32_PAE_L2_FRAME|PTE32_PAE_L1_MASK) + +#define pte32_pae_l1idx(va) (((va) & PTE32_PAE_L1_MASK) >> PTE32_PAE_L1_SHIFT) +#define pte32_pae_l2idx(va) (((va) & PTE32_PAE_L2_MASK) >> PTE32_PAE_L2_SHIFT) +#define pte32_pae_l3idx(va) (((va) & PTE32_PAE_L3_MASK) >> PTE32_PAE_L3_SHIFT) + +typedef uint64_t pte_32bit_pae_t; + +static int +x86_gva_to_gpa_32bit_pae(struct nvmm_machine *mach, uint64_t cr3, + gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot) +{ + gpaddr_t L3gpa, L2gpa, L1gpa; + uintptr_t L3hva, L2hva, L1hva; + pte_32bit_pae_t *pdir, pte; + + /* We begin with an RWXU access. */ + *prot = NVMM_PROT_ALL; + + /* Parse L3. */ + L3gpa = (cr3 & PG_FRAME); + if (nvmm_gpa_to_hva(mach, L3gpa, &L3hva) == -1) + return -1; + pdir = (pte_32bit_pae_t *)L3hva; + pte = pdir[pte32_pae_l3idx(gva)]; + if ((pte & PG_V) == 0) + return -1; + if (pte & PG_NX) + *prot &= ~NVMM_PROT_EXEC; + if (pte & PG_PS) + return -1; + + /* Parse L2. 
*/ + L2gpa = (pte & PG_FRAME); + if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1) + return -1; + pdir = (pte_32bit_pae_t *)L2hva; + pte = pdir[pte32_pae_l2idx(gva)]; + if ((pte & PG_V) == 0) + return -1; + if ((pte & PG_u) == 0) + *prot &= ~NVMM_PROT_USER; + if ((pte & PG_KW) == 0) + *prot &= ~NVMM_PROT_WRITE; + if (pte & PG_NX) + *prot &= ~NVMM_PROT_EXEC; + if ((pte & PG_PS) && !has_pse) + return -1; + if (pte & PG_PS) { + *gpa = (pte & PTE32_PAE_L2_FRAME); + return 0; + } + + /* Parse L1. */ + L1gpa = (pte & PG_FRAME); + if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1) + return -1; + pdir = (pte_32bit_pae_t *)L1hva; + pte = pdir[pte32_pae_l1idx(gva)]; + if ((pte & PG_V) == 0) + return -1; + if ((pte & PG_u) == 0) + *prot &= ~NVMM_PROT_USER; + if ((pte & PG_KW) == 0) + *prot &= ~NVMM_PROT_WRITE; + if (pte & PG_NX) + *prot &= ~NVMM_PROT_EXEC; + if (pte & PG_PS) + return -1; + + *gpa = (pte & PG_FRAME); + return 0; +} + +/* -------------------------------------------------------------------------- */ + +#define PTE64_L1_SHIFT 12 +#define PTE64_L2_SHIFT 21 +#define PTE64_L3_SHIFT 30 +#define PTE64_L4_SHIFT 39 + +#define PTE64_L4_MASK 0x0000ff8000000000 +#define PTE64_L3_MASK 0x0000007fc0000000 +#define PTE64_L2_MASK 0x000000003fe00000 +#define PTE64_L1_MASK 0x00000000001ff000 + +#define PTE64_L4_FRAME PTE64_L4_MASK +#define PTE64_L3_FRAME (PTE64_L4_FRAME|PTE64_L3_MASK) +#define PTE64_L2_FRAME (PTE64_L3_FRAME|PTE64_L2_MASK) +#define PTE64_L1_FRAME (PTE64_L2_FRAME|PTE64_L1_MASK) + +#define pte64_l1idx(va) (((va) & PTE64_L1_MASK) >> PTE64_L1_SHIFT) +#define pte64_l2idx(va) (((va) & PTE64_L2_MASK) >> PTE64_L2_SHIFT) +#define pte64_l3idx(va) (((va) & PTE64_L3_MASK) >> PTE64_L3_SHIFT) +#define pte64_l4idx(va) (((va) & PTE64_L4_MASK) >> PTE64_L4_SHIFT) + +typedef uint64_t pte_64bit_t; + +static inline bool +x86_gva_64bit_canonical(gvaddr_t gva) +{ + /* Bits 63:47 must have the same value. 
*/ +#define SIGN_EXTEND 0xffff800000000000ULL + return (gva & SIGN_EXTEND) == 0 || (gva & SIGN_EXTEND) == SIGN_EXTEND; +} + +static int +x86_gva_to_gpa_64bit(struct nvmm_machine *mach, uint64_t cr3, + gvaddr_t gva, gpaddr_t *gpa, bool has_pse, nvmm_prot_t *prot) +{ + gpaddr_t L4gpa, L3gpa, L2gpa, L1gpa; + uintptr_t L4hva, L3hva, L2hva, L1hva; + pte_64bit_t *pdir, pte; + + /* We begin with an RWXU access. */ + *prot = NVMM_PROT_ALL; + + if (!x86_gva_64bit_canonical(gva)) + return -1; + + /* Parse L4. */ + L4gpa = (cr3 & PG_FRAME); + if (nvmm_gpa_to_hva(mach, L4gpa, &L4hva) == -1) + return -1; + pdir = (pte_64bit_t *)L4hva; + pte = pdir[pte64_l4idx(gva)]; + if ((pte & PG_V) == 0) + return -1; + if ((pte & PG_u) == 0) + *prot &= ~NVMM_PROT_USER; + if ((pte & PG_KW) == 0) + *prot &= ~NVMM_PROT_WRITE; + if (pte & PG_NX) + *prot &= ~NVMM_PROT_EXEC; + if (pte & PG_PS) + return -1; + + /* Parse L3. */ + L3gpa = (pte & PG_FRAME); + if (nvmm_gpa_to_hva(mach, L3gpa, &L3hva) == -1) + return -1; + pdir = (pte_64bit_t *)L3hva; + pte = pdir[pte64_l3idx(gva)]; + if ((pte & PG_V) == 0) + return -1; + if ((pte & PG_u) == 0) + *prot &= ~NVMM_PROT_USER; + if ((pte & PG_KW) == 0) + *prot &= ~NVMM_PROT_WRITE; + if (pte & PG_NX) + *prot &= ~NVMM_PROT_EXEC; + if ((pte & PG_PS) && !has_pse) + return -1; + if (pte & PG_PS) { + *gpa = (pte & PTE64_L3_FRAME); + return 0; + } + + /* Parse L2. */ + L2gpa = (pte & PG_FRAME); + if (nvmm_gpa_to_hva(mach, L2gpa, &L2hva) == -1) + return -1; + pdir = (pte_64bit_t *)L2hva; + pte = pdir[pte64_l2idx(gva)]; + if ((pte & PG_V) == 0) + return -1; + if ((pte & PG_u) == 0) + *prot &= ~NVMM_PROT_USER; + if ((pte & PG_KW) == 0) + *prot &= ~NVMM_PROT_WRITE; + if (pte & PG_NX) + *prot &= ~NVMM_PROT_EXEC; + if ((pte & PG_PS) && !has_pse) + return -1; + if (pte & PG_PS) { + *gpa = (pte & PTE64_L2_FRAME); + return 0; + } + + /* Parse L1. 
*/ + L1gpa = (pte & PG_FRAME); + if (nvmm_gpa_to_hva(mach, L1gpa, &L1hva) == -1) + return -1; + pdir = (pte_64bit_t *)L1hva; + pte = pdir[pte64_l1idx(gva)]; + if ((pte & PG_V) == 0) + return -1; + if ((pte & PG_u) == 0) + *prot &= ~NVMM_PROT_USER; + if ((pte & PG_KW) == 0) + *prot &= ~NVMM_PROT_WRITE; + if (pte & PG_NX) + *prot &= ~NVMM_PROT_EXEC; + if (pte & PG_PS) + return -1; + + *gpa = (pte & PG_FRAME); + return 0; +} + +static inline int +x86_gva_to_gpa(struct nvmm_machine *mach, struct nvmm_x64_state *state, + gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot) +{ + bool is_pae, is_lng, has_pse; + uint64_t cr3; + int ret; + + if ((state->crs[NVMM_X64_CR_CR0] & CR0_PG) == 0) { + /* No paging. */ + *gpa = gva; + return 0; + } + + is_pae = (state->crs[NVMM_X64_CR_CR4] & CR4_PAE) != 0; + is_lng = (state->msrs[NVMM_X64_MSR_EFER] & EFER_LME) != 0; + has_pse = (state->crs[NVMM_X64_CR_CR4] & CR4_PSE) != 0; + cr3 = state->crs[NVMM_X64_CR_CR3]; + + if (is_pae && is_lng) { + /* 64bit */ + ret = x86_gva_to_gpa_64bit(mach, cr3, gva, gpa, has_pse, prot); + } else if (is_pae && !is_lng) { + /* 32bit PAE */ + ret = x86_gva_to_gpa_32bit_pae(mach, cr3, gva, gpa, has_pse, + prot); + } else if (!is_pae && !is_lng) { + /* 32bit */ + ret = x86_gva_to_gpa_32bit(mach, cr3, gva, gpa, has_pse, prot); + } else { + ret = -1; + } + + if (ret == -1) { + errno = EFAULT; + } + + return ret; +} + +int +nvmm_gva_to_gpa(struct nvmm_machine *mach, nvmm_cpuid_t cpuid, + gvaddr_t gva, gpaddr_t *gpa, nvmm_prot_t *prot) +{ + struct nvmm_x64_state state; + int ret; + + if (gva & PAGE_MASK) { + errno = EINVAL; + return -1; + } + + ret = nvmm_vcpu_getstate(mach, cpuid, &state, + NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS); + if (ret == -1) + return -1; + + return x86_gva_to_gpa(mach, &state, gva, gpa, prot); +} + +/* -------------------------------------------------------------------------- */ + +static inline bool +is_long_mode(struct nvmm_x64_state *state) +{ + return (state->msrs[NVMM_X64_MSR_EFER] & 
EFER_LME) != 0; +} + +static inline bool +is_illegal(struct nvmm_io *io, nvmm_prot_t prot) +{ + return (io->in && !(prot & NVMM_PROT_WRITE)); +} + +static int +segment_apply(struct nvmm_x64_state_seg *seg, gvaddr_t *gva, size_t size) +{ + uint64_t limit; + + /* + * This is incomplete. We should check topdown, etc, really that's + * tiring. + */ + if (__predict_false(!seg->attrib.p)) { + goto error; + } + + limit = (seg->limit + 1); + if (__predict_true(seg->attrib.gran)) { + limit *= PAGE_SIZE; + } + + if (__predict_false(*gva + seg->base + size > limit)) { + goto error; + } + + *gva += seg->base; + return 0; + +error: + errno = EFAULT; + return -1; +} + +int +nvmm_assist_io(struct nvmm_machine *mach, nvmm_cpuid_t cpuid, + struct nvmm_exit *exit, void (*cb)(struct nvmm_io *)) +{ + struct nvmm_x64_state state; + struct nvmm_io io; + nvmm_prot_t prot; + size_t remain, done; + uintptr_t hva; + gvaddr_t gva, off; + gpaddr_t gpa; + uint64_t rsi; + uint8_t tmp[8]; + uint8_t *ptr, *ptr2; + bool cross; + int ret; + + if (__predict_false(exit->reason != NVMM_EXIT_IO)) { + errno = EINVAL; + return -1; + } + + io.port = exit->u.io.port; + io.in = (exit->u.io.type == NVMM_EXIT_IO_IN); + io.size = exit->u.io.operand_size; + + ret = nvmm_vcpu_getstate(mach, cpuid, &state, + NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS | + NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS); + if (ret == -1) + return -1; + + cross = false; + + if (!exit->u.io.str) { + ptr = (uint8_t *)&state.gprs[NVMM_X64_GPR_RAX]; + } else { + rsi = state.gprs[NVMM_X64_GPR_RSI]; + + switch (exit->u.io.address_size) { + case 8: + gva = rsi; + break; + case 4: + gva = (rsi & 0x00000000FFFFFFFF); + break; + case 2: + default: /* impossible */ + gva = (rsi & 0x000000000000FFFF); + break; + } + + if (!is_long_mode(&state)) { + ret = segment_apply(&state.segs[exit->u.io.seg], &gva, + io.size); + if (ret == -1) + return -1; + } + + off = (gva & PAGE_MASK); + gva &= ~PAGE_MASK; + + ret = x86_gva_to_gpa(mach, &state, gva, &gpa, 
&prot); + if (ret == -1) + return -1; + if (__predict_false(is_illegal(&io, prot))) { + errno = EFAULT; + return -1; + } + ret = nvmm_gpa_to_hva(mach, gpa, &hva); + if (ret == -1) + return -1; + + ptr = (uint8_t *)hva + off; + + /* + * Special case. If the buffer is in between two pages, we + * need to retrieve data from the next page. + */ + if (__predict_false(off + io.size > PAGE_SIZE)) { + cross = true; + remain = off + io.size - PAGE_SIZE; + done = PAGE_SIZE - off; + + memcpy(tmp, ptr, done); + + ret = x86_gva_to_gpa(mach, &state, gva + PAGE_SIZE, + &gpa, &prot); + if (ret == -1) + return -1; + if (__predict_false(is_illegal(&io, prot))) { + errno = EFAULT; + return -1; + } + ret = nvmm_gpa_to_hva(mach, gpa, &hva); + if (ret == -1) + return -1; + + memcpy(&tmp[done], (uint8_t *)hva, remain); + ptr2 = &tmp[done]; + } + } + + if (io.in) { + /* nothing to do */ + } else { + memcpy(io.data, ptr, io.size); + } + + (*cb)(&io); + + if (io.in) { + if (!exit->u.io.str) + state.gprs[NVMM_X64_GPR_RAX] = 0; + if (__predict_false(cross)) { + memcpy(ptr, io.data, done); + memcpy(ptr2, &io.data[done], remain); + } else { + memcpy(ptr, io.data, io.size); + } + } else { + /* nothing to do */ + } + + if (exit->u.io.rep) { + state.gprs[NVMM_X64_GPR_RCX] -= 1; + if (state.gprs[NVMM_X64_GPR_RCX] == 0) { + state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc; + } + if (exit->u.io.str) { + if (state.gprs[NVMM_X64_GPR_RFLAGS] & PSL_D) { + state.gprs[NVMM_X64_GPR_RSI] -= io.size; + } else { + state.gprs[NVMM_X64_GPR_RSI] += io.size; + } + } + } else { + state.gprs[NVMM_X64_GPR_RIP] = exit->u.io.npc; + } + + ret = nvmm_vcpu_setstate(mach, cpuid, &state, NVMM_X64_STATE_GPRS); + if (ret == -1) + return -1; + + return 0; +} + +/* -------------------------------------------------------------------------- */ + +int +nvmm_assist_mem(struct nvmm_machine *mach, nvmm_cpuid_t cpuid, + struct nvmm_exit *exit, void (*cb)(struct nvmm_mem *)) +{ + if (__predict_false(exit->reason != NVMM_EXIT_MEMORY)) { + 
errno = EINVAL; + return -1; + } + + // TODO + errno = ENOSYS; + return -1; +} Index: src/lib/libnvmm/nvmm.h diff -u /dev/null src/lib/libnvmm/nvmm.h:1.1 --- /dev/null Sat Nov 10 09:28:56 2018 +++ src/lib/libnvmm/nvmm.h Sat Nov 10 09:28:56 2018 @@ -0,0 +1,103 @@ +/* $NetBSD: nvmm.h,v 1.1 2018/11/10 09:28:56 maxv Exp $ */ + +/* + * Copyright (c) 2018 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Maxime Villard. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _LIBNVMM_H_ +#define _LIBNVMM_H_ + +#include <stdint.h> +#include <stdbool.h> + +#include <dev/nvmm/nvmm.h> +#include <dev/nvmm/nvmm_ioctl.h> +#ifdef __x86_64__ +#include <dev/nvmm/x86/nvmm_x86.h> +#endif + +struct nvmm_area { + gpaddr_t gpa; + uintptr_t hva; + size_t size; +}; + +struct nvmm_machine { + nvmm_machid_t machid; + struct nvmm_area *areas; + size_t nareas; +}; + +struct nvmm_io { + uint64_t port; + bool in; + size_t size; + uint8_t data[8]; +}; + +struct nvmm_mem { + gvaddr_t gva; + gpaddr_t gpa; + bool write; + size_t size; + uint8_t data[8]; +}; + +#define NVMM_PROT_READ 0x01 +#define NVMM_PROT_WRITE 0x02 +#define NVMM_PROT_EXEC 0x04 +#define NVMM_PROT_USER 0x08 +#define NVMM_PROT_ALL 0x0F +typedef uint64_t nvmm_prot_t; + +int nvmm_capability(struct nvmm_capability *); + +int nvmm_machine_create(struct nvmm_machine *); +int nvmm_machine_destroy(struct nvmm_machine *); +int nvmm_machine_configure(struct nvmm_machine *, uint64_t, void *); + +int nvmm_vcpu_create(struct nvmm_machine *, nvmm_cpuid_t); +int nvmm_vcpu_destroy(struct nvmm_machine *, nvmm_cpuid_t); +int nvmm_vcpu_setstate(struct nvmm_machine *, nvmm_cpuid_t, void *, uint64_t); +int nvmm_vcpu_getstate(struct nvmm_machine *, nvmm_cpuid_t, void *, uint64_t); +int nvmm_vcpu_inject(struct nvmm_machine *, nvmm_cpuid_t, struct nvmm_event *); +int nvmm_vcpu_run(struct nvmm_machine *, nvmm_cpuid_t, struct nvmm_exit *); + +int nvmm_gpa_map(struct nvmm_machine *, uintptr_t, gpaddr_t, size_t, int); +int nvmm_gpa_unmap(struct nvmm_machine *, uintptr_t, gpaddr_t, size_t); + +int nvmm_gva_to_gpa(struct nvmm_machine *, nvmm_cpuid_t, gvaddr_t, gpaddr_t *, + nvmm_prot_t *); +int nvmm_gpa_to_hva(struct nvmm_machine *, gpaddr_t, uintptr_t *); + +int nvmm_assist_io(struct nvmm_machine *, nvmm_cpuid_t, struct nvmm_exit *, + void (*)(struct nvmm_io *)); +int nvmm_assist_mem(struct nvmm_machine *, nvmm_cpuid_t, struct nvmm_exit *, + void (*)(struct nvmm_mem *)); + +#endif /* _LIBNVMM_H_ */ Index: 
src/lib/libnvmm/shlib_version diff -u /dev/null src/lib/libnvmm/shlib_version:1.1 --- /dev/null Sat Nov 10 09:28:56 2018 +++ src/lib/libnvmm/shlib_version Sat Nov 10 09:28:56 2018 @@ -0,0 +1,5 @@ +# $NetBSD: shlib_version,v 1.1 2018/11/10 09:28:56 maxv Exp $ +# Remember to update distrib/sets/lists/base/shl.* when changing +# +major=0 +minor=1