tech@,

The diff below adds a new ioctl to vmm(4) that allows an emulated device process to request that vmm(4) enter a shared mapping into the process's vmspace, so it can access guest memory without relying on a shared mapping backed by a named file.
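In practice, a device process reserves some page-aligned virtual address space, releases it, and then asks vmm(4) to enter shared mappings of guest memory at those addresses. A sketch of that sequence, condensed from the regress change further down (share_guest_mem() is only an illustrative helper, not part of the diff; error handling is trimmed):

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/ioctl.h>

#include <machine/vmmvar.h>
#include <dev/vmm/vmm.h>

#include <errno.h>
#include <string.h>

/*
 * Map an existing vm's guest memory into the calling process. "ranges"
 * must describe the same memranges the vm was created with.
 */
static int
share_guest_mem(int vmm_fd, uint32_t vm_id, struct vm_mem_range *ranges,
    size_t nranges)
{
	struct vm_sharemem_params vsp;
	struct vm_mem_range *vmr;
	size_t i;
	void *p;

	if (nranges > VMM_MAX_MEM_RANGES)
		return (EINVAL);

	memset(&vsp, 0, sizeof(vsp));
	vsp.vsp_vm_id = vm_id;
	vsp.vsp_nmemranges = nranges;
	memcpy(vsp.vsp_memranges, ranges, nranges * sizeof(*ranges));

	/* Reserve non-overlapping, page-aligned va ranges... */
	for (i = 0; i < vsp.vsp_nmemranges; i++) {
		vmr = &vsp.vsp_memranges[i];
		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANON, -1, 0);
		if (p == MAP_FAILED)
			return (errno);
		vmr->vmr_va = (vaddr_t)p;
	}

	/* ...then release them so vmm(4) can re-enter them as shared. */
	for (i = 0; i < vsp.vsp_nmemranges; i++) {
		vmr = &vsp.vsp_memranges[i];
		munmap((void *)vmr->vmr_va, vmr->vmr_size);
	}

	/* On success, guest memory is accessible at each vmr_va. */
	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
		return (errno);
	return (0);
}

vmd's remap_guest_mem() does essentially this, skipping MMIO ranges.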
As with the vm creation ioctl (VMM_IOC_CREATE), the caller requires the "vmm" and "proc" pledge(2) promises. This allows the emulated devices to do this setup early and drop those promises back down to just "stdio" before any device emulation occurs. (A condensed sketch of that setup-then-drop lifecycle follows the diff.)

Feel free to skip to the diff (the regress change shows how it works in a simplified case) or continue reading for the reasoning behind this change. I'm sharing this primarily for testers and for feedback from other devs while mlarkin@ reviews.

To test:

1. apply the diff
2. build and install the new kernel
3. copy or symlink the new vmm.h into /usr/include/dev/vmm/
4. build and reinstall vmd (no vmctl changes needed)

You should see no change in vm behavior; however, you should no longer see /tmp consumption grow while unmounting things like NFS mounts or USB disks. Read on for details.

...

vmd(8) began emulating virtio network and block devices in subprocesses with a commit I made at the recent hackathon in Morocco. It relies on creating shared memory mappings using shm_mkstemp(3) and passing file descriptors to the fork/exec'd child processes.

I've since received reports that using named mappings for shared memory has two negative impacts:

1. increased overhead during vm teardown, often making systems unresponsive (my conclusion based on only minimal evidence)

2. unmounting any device on the host while a vm is running causes some guest memory to be flushed to disk (the backing file is already unlinked, so it is not visible to other processes)

(2) can cause /tmp to fill up or introduce failure conditions I'm not sure we can recover from in vmd. It also has implications for other services on the host.

I don't own a fireproof suit that fits... so I'm not about to wade into the VFS & UVM layers to figure out if (1) or (2) can be mitigated or fixed on their own. One idea was to implement what FreeBSD borrowed from Linux in their forever quest to become LinuxBSD: memfd_create(2) [1]. I took one look and decided this was not the time for me to be trying to land a new syscall primarily for vmd (and some ports), and went another route.

[1] https://man7.org/linux/man-pages/man2/memfd_create.2.html

-dv

diff refs/heads/master refs/heads/vmm-mapshare
commit - cec1ace2d4d21c85f4c8bacc2dd971721bf6b694
commit + 8f533c371094c044b0127d468be5feaaf775811b
blob - f221b58f75c4eb01a3a04ae45c7cdb066b11361a
blob + 0e6f5ff858c51bd9707c657b154b0df1f8944c3b
--- regress/sys/arch/amd64/vmm/vcpu.c
+++ regress/sys/arch/amd64/vmm/vcpu.c
@@ -83,6 +83,7 @@ main(int argc, char **argv)
 	struct vm_resetcpu_params vresetp;
 	struct vm_run_params vrunp;
 	struct vm_terminate_params vtp;
+	struct vm_sharemem_params vsp;
 	struct vm_mem_range *vmr;
 	int fd, ret = 1;
 
@@ -127,8 +128,9 @@ main(int argc, char **argv)
 			((uint8_t*)p)[j + 1] = PCKBC_AUX;
 		}
 		vmr->vmr_va = (vaddr_t)p;
-		printf("mapped region %zu: { gpa: 0x%08lx, size: %lu }\n",
-		    i, vmr->vmr_gpa, vmr->vmr_size);
+		printf("created mapped region %zu: { gpa: 0x%08lx, size: %lu,"
+		    " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size,
+		    vmr->vmr_va);
 	}
 
 	if (ioctl(fd, VMM_IOC_CREATE, &vcp) == -1)
@@ -136,8 +138,55 @@ main(int argc, char **argv)
 	printf("created vm %d named \"%s\"\n", vcp.vcp_id, vcp.vcp_name);
 
 	/*
-	 * 2. Check that our VM exists.
+	 * 2. Check we can create shared memory mappings.
 	 */
+	memset(&vsp, 0, sizeof(vsp));
+	vsp.vsp_nmemranges = vcp.vcp_nmemranges;
+	memcpy(&vsp.vsp_memranges, &vcp.vcp_memranges,
+	    sizeof(vsp.vsp_memranges));
+	vsp.vsp_vm_id = vcp.vcp_id;
+
+	/* Find some new va ranges... 
*/ + for (i = 0; i < vsp.vsp_nmemranges; i++) { + vmr = &vsp.vsp_memranges[i]; + p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (p == MAP_FAILED) + err(1, "mmap"); + vmr->vmr_va = (vaddr_t)p; + } + + /* Release our mappings so vmm can replace them. */ + for (i = 0; i < vsp.vsp_nmemranges; i++) { + vmr = &vsp.vsp_memranges[i]; + munmap((void*)vmr->vmr_va, vmr->vmr_size); + } + + /* Perform the shared mapping. */ + if (ioctl(fd, VMM_IOC_SHAREMEM, &vsp) == -1) + err(1, "VMM_IOC_SHAREMEM"); + printf("created shared memory mappings\n"); + + /* We should see our reset vector instructions in the new mappings. */ + for (i = 0; i < vsp.vsp_nmemranges; i++) { + vmr = &vsp.vsp_memranges[i]; + p = (void*)vmr->vmr_va; + + for (j = 0; j < vmr->vmr_size; j += 2) { + if (((uint8_t*)p)[j + 0] != 0xE4) + errx(1, "bad byte"); + if (((uint8_t*)p)[j + 1] != PCKBC_AUX) + errx(1, "bad byte"); + } + printf("checked shared region %zu: { gpa: 0x%08lx, size: %lu," + " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size, + vmr->vmr_va); + } + printf("validated shared memory mappings\n"); + + /* + * 3. Check that our VM exists. + */ memset(&vip, 0, sizeof(vip)); vip.vip_size = 0; info = NULL; @@ -189,7 +238,7 @@ main(int argc, char **argv) ours = NULL; /* - * 3. Reset our VCPU and initialize register state. + * 4. Reset our VCPU and initialize register state. */ memset(&vresetp, 0, sizeof(vresetp)); vresetp.vrp_vm_id = vcp.vcp_id; @@ -205,7 +254,7 @@ main(int argc, char **argv) vresetp.vrp_vm_id); /* - * 4. Run the vcpu, expecting an immediate exit for IO assist. + * 5. Run the vcpu, expecting an immediate exit for IO assist. */ exit = malloc(sizeof(*exit)); if (exit == NULL) { @@ -258,7 +307,7 @@ out: out: /* - * 5. Terminate our VM and clean up. + * 6. Terminate our VM and clean up. */ memset(&vtp, 0, sizeof(vtp)); vtp.vtp_vm_id = vcp.vcp_id; @@ -277,13 +326,23 @@ out: vmr = &vcp.vcp_memranges[i]; if (vmr->vmr_va) { if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) { - warn("failed to unmap region %zu at 0x%08lx", - i, vmr->vmr_va); + warn("failed to unmap orginal region %zu @ hva " + "0x%lx", i, vmr->vmr_va); ret = 1; } else - printf("unmapped region %zu @ gpa 0x%08lx\n", - i, vmr->vmr_gpa); + printf("unmapped origin region %zu @ hva " + "0x%lx\n", i, vmr->vmr_va); } + vmr = &vsp.vsp_memranges[i]; + if (vmr->vmr_va) { + if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) { + warn("failed to unmap shared region %zu @ hva " + "0x%lx", i, vmr->vmr_va); + ret = 1; + } else + printf("unmapped shared region %zu @ hva " + "0x%lx\n", i, vmr->vmr_va); + } } return (ret); blob - d46b3431081b6d2e7e1adab884ec21b0aaa9761a blob + 3daee7dad431ed200cbd734bc0f8b35bebd54216 --- sys/dev/vmm/vmm.c +++ sys/dev/vmm/vmm.c @@ -262,6 +262,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag case VMM_IOC_WRITEVMPARAMS: ret = vm_rwvmparams((struct vm_rwvmparams_params *)data, 1); break; + case VMM_IOC_SHAREMEM: + ret = vm_share_mem((struct vm_sharemem_params *)data, p); + break; default: ret = vmmioctl_machdep(dev, cmd, data, flag, p); break; @@ -286,6 +289,7 @@ pledge_ioctl_vmm(struct proc *p, long com) switch (com) { case VMM_IOC_CREATE: case VMM_IOC_INFO: + case VMM_IOC_SHAREMEM: /* The "parent" process in vmd forks and manages VMs */ if (p->p_p->ps_pledge & PLEDGE_PROC) return (0); @@ -780,3 +784,82 @@ vcpu_must_stop(struct vcpu *vcpu) return (1); return (0); } + +/* + * vm_share_mem + * + * Share a uvm mapping for the vm guest memory ranges into the calling process. 
+ * + * Return values: + * 0: if successful + * ENOENT: if the vm cannot be found by vm_find + * EPERM: if the vm cannot be accessed by the current process + * EINVAL: if the provide memory ranges fail checks + * ENOMEM: if uvm_share fails to find available memory in the destination map + */ +int +vm_share_mem(struct vm_sharemem_params *vsp, struct proc *p) +{ + int ret = EINVAL; + size_t i, n; + struct vm *vm; + struct vm_mem_range *src, *dst; + + ret = vm_find(vsp->vsp_vm_id, &vm); + if (ret) + return (ret); + + /* Check we have the expected number of ranges. */ + if (vm->vm_nmemranges != vsp->vsp_nmemranges) + goto out; + n = vm->vm_nmemranges; + + /* Check their types, sizes, and gpa's (implying page alignment). */ + for (i = 0; i < n; i++) { + src = &vm->vm_memranges[i]; + dst = &vsp->vsp_memranges[i]; + + /* + * The vm memranges were already checked during creation, so + * compare to them to confirm validity of mapping request. + */ + if (src->vmr_type != dst->vmr_type) + goto out; + if (src->vmr_gpa != dst->vmr_gpa) + goto out; + if (src->vmr_size != dst->vmr_size) + goto out; + + /* Check our intended destination is page-aligned. */ + if (dst->vmr_va & PAGE_MASK) + goto out; + } + + /* + * Share each range individually with the calling process. We do + * not need PROC_EXEC as the emulated devices do not need to execute + * instructions from guest memory. + */ + for (i = 0; i < n; i++) { + src = &vm->vm_memranges[i]; + dst = &vsp->vsp_memranges[i]; + + /* Skip MMIO range. */ + if (src->vmr_type == VM_MEM_MMIO) + continue; + + DPRINTF("sharing gpa=0x%lx for pid %d @ va=0x%lx\n", + src->vmr_gpa, p->p_p->ps_pid, dst->vmr_va); + ret = uvm_share(&p->p_vmspace->vm_map, dst->vmr_va, + PROT_READ | PROT_WRITE, vm->vm_map, src->vmr_gpa, + src->vmr_size); + if (ret) { + printf("%s: uvm_share failed (%d)\n", __func__, ret); + break; + } + } + ret = 0; +out: + refcnt_rele_wake(&vm->vm_refcnt); + return (ret); +} blob - d2355d42b44b51044901de9d0adc0239586f37b8 blob + 7b3b0d77ad550165b6e53f4de66723366d689b23 --- sys/dev/vmm/vmm.h +++ sys/dev/vmm/vmm.h @@ -76,6 +76,13 @@ struct vm_resetcpu_params { struct vcpu_reg_state vrp_init_state; }; +struct vm_sharemem_params { + /* Input parameters to VMM_IOC_SHAREMEM */ + uint32_t vsp_vm_id; + size_t vsp_nmemranges; + struct vm_mem_range vsp_memranges[VMM_MAX_MEM_RANGES]; +}; + /* IOCTL definitions */ #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */ #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */ @@ -88,8 +95,8 @@ struct vm_resetcpu_params { #define VMM_IOC_READVMPARAMS _IOWR('V', 9, struct vm_rwvmparams_params) /* Set VM params */ #define VMM_IOC_WRITEVMPARAMS _IOW('V', 10, struct vm_rwvmparams_params) +#define VMM_IOC_SHAREMEM _IOW('V', 11, struct vm_sharemem_params) - #ifdef _KERNEL /* #define VMM_DEBUG */ @@ -194,6 +201,7 @@ int vcpu_must_stop(struct vcpu *); int vm_terminate(struct vm_terminate_params *); int vm_resetcpu(struct vm_resetcpu_params *); int vcpu_must_stop(struct vcpu *); +int vm_share_mem(struct vm_sharemem_params *, struct proc *); #endif /* _KERNEL */ #endif /* DEV_VMM_H */ blob - 9373a135aa87755f0e34b821af4ab8c0f4970421 blob + dd0efc2fd71bb0ac91d2e389a30e779d8d6c6c0d --- usr.sbin/vmd/vioblk.c +++ usr.sbin/vmd/vioblk.c @@ -58,7 +58,7 @@ vioblk_main(int fd) } __dead void -vioblk_main(int fd) +vioblk_main(int fd, int fd_vmm) { struct virtio_dev dev; struct vioblk_dev *vioblk; @@ -71,8 +71,11 @@ vioblk_main(int fd) log_procinit("vioblk"); - /* stdio - needed for read/write to disk fds and 
channels to the vm. */ - if (pledge("stdio", NULL) == -1) + /* + * stdio - needed for read/write to disk fds and channels to the vm. + * vmm + proc - needed to create shared vm mappings. + */ + if (pledge("stdio vmm proc", NULL) == -1) fatal("pledge"); /* Receive our virtio_dev, mostly preconfigured. */ @@ -92,8 +95,9 @@ vioblk_main(int fd) vioblk = &dev.vioblk; log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, " - "async fd = %d, sz = %lld maxfer = %d", __func__, vioblk->ndisk_fd, - dev.sync_fd, dev.async_fd, vioblk->sz, vioblk->max_xfer); + "async fd = %d, sz = %lld maxfer = %d, vmm fd = %d", __func__, + vioblk->ndisk_fd, dev.sync_fd, dev.async_fd, vioblk->sz, + vioblk->max_xfer, fd_vmm); /* Receive our vm information from the vm process. */ memset(&vm, 0, sizeof(vm)); @@ -108,12 +112,19 @@ vioblk_main(int fd) setproctitle("%s/vioblk[%d]", vcp->vcp_name, vioblk->idx); /* Now that we have our vm information, we can remap memory. */ - ret = remap_guest_mem(&vm); + ret = remap_guest_mem(&vm, fd_vmm); if (ret) { log_warnx("failed to remap guest memory"); goto fail; } + /* + * We no longer need /dev/vmm access. + */ + close_fd(fd_vmm); + if (pledge("stdio", NULL) == -1) + fatal("pledge2"); + /* Initialize the virtio block abstractions. */ type = vm.vm_params.vmc_disktypes[vioblk->idx]; switch (type) { blob - 6ce905fdccfa7befb49353c23628227fdc74c486 blob + d75dc06b9bc0133ace3e74de85abba3b62f539dc --- usr.sbin/vmd/vionet.c +++ usr.sbin/vmd/vionet.c @@ -61,7 +61,7 @@ vionet_main(int fd) static void handle_sync_io(int, short, void *); __dead void -vionet_main(int fd) +vionet_main(int fd, int fd_vmm) { struct virtio_dev dev; struct vionet_dev *vionet = NULL; @@ -73,8 +73,11 @@ vionet_main(int fd) log_procinit("vionet"); - /* stdio - needed for read/write to tap fd and channels to the vm. */ - if (pledge("stdio", NULL) == -1) + /* + * stdio - needed for read/write to disk fds and channels to the vm. + * vmm + proc - needed to create shared vm mappings. + */ + if (pledge("stdio vmm proc", NULL) == -1) fatal("pledge"); /* Receive our vionet_dev, mostly preconfigured. */ @@ -92,8 +95,9 @@ vionet_main(int fd) dev.sync_fd = fd; vionet = &dev.vionet; - log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d", - __func__, vionet->data_fd, dev.sync_fd, dev.async_fd); + log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d" + ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd, + dev.async_fd, fd_vmm); /* Receive our vm information from the vm process. */ memset(&vm, 0, sizeof(vm)); @@ -108,10 +112,19 @@ vionet_main(int fd) setproctitle("%s/vionet[%d]", vcp->vcp_name, vionet->idx); /* Now that we have our vm information, we can remap memory. */ - ret = remap_guest_mem(&vm); - if (ret) + ret = remap_guest_mem(&vm, fd_vmm); + if (ret) { + fatal("%s: failed to remap", __func__); goto fail; + } + /* + * We no longer need /dev/vmm access. + */ + close_fd(fd_vmm); + if (pledge("stdio", NULL) == -1) + fatal("pledge2"); + /* If we're restoring hardware, re-initialize virtqueue hva's. 
*/ if (vm.vm_state & VM_STATE_RECEIVED) { struct virtio_vq_info *vq_info; blob - 92e77b8f83431c2ff52824e35149477909612653 blob + e3f6d1371ab7795df6e9d4b370ffd1a7c5afbde6 --- usr.sbin/vmd/virtio.c +++ usr.sbin/vmd/virtio.c @@ -1297,7 +1297,7 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev static int virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev) { - char *nargv[8], num[32], t[2]; + char *nargv[10], num[32], vmm_fd[32], t[2]; pid_t dev_pid; int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0; size_t i, j, data_fds_sz, sz = 0; @@ -1483,6 +1483,8 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev memset(&nargv, 0, sizeof(nargv)); memset(num, 0, sizeof(num)); snprintf(num, sizeof(num), "%d", sync_fds[1]); + memset(vmm_fd, 0, sizeof(vmm_fd)); + snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd); t[0] = dev->dev_type; t[1] = '\0'; @@ -1492,13 +1494,15 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev nargv[2] = num; nargv[3] = "-t"; nargv[4] = t; - nargv[5] = "-n"; + nargv[5] = "-i"; + nargv[6] = vmm_fd; + nargv[7] = "-n"; if (env->vmd_verbose) { - nargv[6] = "-v"; - nargv[7] = NULL; + nargv[8] = "-v"; + nargv[9] = NULL; } else - nargv[6] = NULL; + nargv[8] = NULL; /* Control resumes in vmd.c:main(). */ execvp(nargv[0], nargv); blob - d42abb5a834cb5f6d9777d8b23f92a6a1c5930f2 blob + 48ec88c37db24290e069bf0ab9249df764b2c424 --- usr.sbin/vmd/vm.c +++ usr.sbin/vmd/vm.c @@ -218,9 +218,10 @@ static const struct vcpu_reg_state vcpu_init_flat16 = * Primary entrypoint for launching a vm. Does not return. * * fd: file descriptor for communicating with vmm process. + * fd_vmm: file descriptor for communicating with vmm(4) device */ void -vm_main(int fd) +vm_main(int fd, int vmm_fd) { struct vm_create_params *vcp = NULL; struct vmd_vm vm; @@ -241,9 +242,8 @@ vm_main(int fd) * vmm - for the vmm ioctls and operations. * proc exec - fork/exec for launching devices. * recvfd - for vm send/recv and sending fd to devices. - * tmppath/rpath - for shm_mkstemp, ftruncate, unlink */ - if (pledge("stdio vmm proc exec recvfd tmppath rpath", NULL) == -1) + if (pledge("stdio vmm proc exec recvfd", NULL) == -1) fatal("pledge"); /* Receive our vm configuration. */ @@ -254,13 +254,6 @@ vm_main(int fd) _exit(EIO); } - /* Receive the /dev/vmm fd number. */ - sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd)); - if (sz != sizeof(env->vmd_fd)) { - log_warnx("failed to receive /dev/vmm fd"); - _exit(EIO); - } - /* Update process with the vm name. 
*/ vcp = &vm.vm_params.vmc_params; setproctitle("%s", vcp->vcp_name); @@ -1099,63 +1092,34 @@ alloc_guest_mem(struct vmd_vm *vm) alloc_guest_mem(struct vmd_vm *vm) { void *p; - char *tmp; - int fd, ret = 0; + int ret = 0; size_t i, j; struct vm_create_params *vcp = &vm->vm_params.vmc_params; struct vm_mem_range *vmr; - tmp = calloc(32, sizeof(char)); - if (tmp == NULL) { - ret = errno; - log_warn("%s: calloc", __func__); - return (ret); - } - strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); - - vm->vm_nmemfds = vcp->vcp_nmemranges; - for (i = 0; i < vcp->vcp_nmemranges; i++) { vmr = &vcp->vcp_memranges[i]; - fd = shm_mkstemp(tmp); - if (fd < 0) { - ret = errno; - log_warn("%s: shm_mkstemp", __func__); - return (ret); - } - if (ftruncate(fd, vmr->vmr_size) == -1) { - ret = errno; - log_warn("%s: ftruncate", __func__); - goto out; - } - if (fcntl(fd, F_SETFD, 0) == -1) { - ret = errno; - log_warn("%s: fcntl", __func__); - goto out; - } - if (shm_unlink(tmp) == -1) { - ret = errno; - log_warn("%s: shm_unlink", __func__); - goto out; - } - strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); - + /* + * We only need R/W as userland. vmm(4) will use R/W/X in its + * mapping. + * + * We must use MAP_SHARED so emulated devices will be able + * to generate shared mappings. + */ p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_CONCEAL, fd, 0); + MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0); if (p == MAP_FAILED) { ret = errno; for (j = 0; j < i; j++) { vmr = &vcp->vcp_memranges[j]; munmap((void *)vmr->vmr_va, vmr->vmr_size); } - goto out; + return (ret); } - vm->vm_memfds[i] = fd; vmr->vmr_va = (vaddr_t)p; } -out: - free(tmp); + return (ret); } @@ -2552,10 +2516,11 @@ remap_guest_mem(struct vmd_vm *vm) * Returns 0 on success, non-zero in event of failure. */ int -remap_guest_mem(struct vmd_vm *vm) +remap_guest_mem(struct vmd_vm *vm, int vmm_fd) { struct vm_create_params *vcp; struct vm_mem_range *vmr; + struct vm_sharemem_params vsp; size_t i, j; void *p = NULL; int ret; @@ -2566,23 +2531,32 @@ remap_guest_mem(struct vmd_vm *vm) vcp = &vm->vm_params.vmc_params; /* - * We've execve'd, so we need to re-map the guest VM memory. Iterate - * over all possible vm_mem_range entries so we can initialize all - * file descriptors to a value. + * Initialize our VM shared memory request using our original + * creation parameters. We'll overwrite the va's after mmap(2). */ + memset(&vsp, 0, sizeof(vsp)); + vsp.vsp_nmemranges = vcp->vcp_nmemranges; + vsp.vsp_vm_id = vcp->vcp_id; + memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges, + sizeof(vsp.vsp_memranges)); + + /* + * Use mmap(2) to identify virtual address space for our mappings. + */ for (i = 0; i < VMM_MAX_MEM_RANGES; i++) { - if (i < vcp->vcp_nmemranges) { - vmr = &vcp->vcp_memranges[i]; - /* Skip ranges we know we don't need right now. */ + if (i < vsp.vsp_nmemranges) { + vmr = &vsp.vsp_memranges[i]; + + /* Ignore any MMIO ranges. */ if (vmr->vmr_type == VM_MEM_MMIO) { - log_debug("%s: skipping range i=%ld, type=%d", - __func__, i, vmr->vmr_type); - vm->vm_memfds[i] = -1; + vmr->vmr_va = 0; + vcp->vcp_memranges[i].vmr_va = 0; continue; } - /* Re-mmap the memrange. */ - p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0); + + /* Make initial mappings for the memrange. 
*/ + p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1, + 0); if (p == MAP_FAILED) { ret = errno; log_warn("%s: mmap", __func__); @@ -2594,11 +2568,29 @@ remap_guest_mem(struct vmd_vm *vm) return (ret); } vmr->vmr_va = (vaddr_t)p; - } else { - /* Initialize with an invalid fd. */ - vm->vm_memfds[i] = -1; + vcp->vcp_memranges[i].vmr_va = vmr->vmr_va; } } + /* + * munmap(2) now that we have va's and ranges that don't overlap. vmm + * will use the va's and sizes to recreate the mappings for us. + */ + for (i = 0; i < vsp.vsp_nmemranges; i++) { + vmr = &vsp.vsp_memranges[i]; + if (vmr->vmr_type == VM_MEM_MMIO) + continue; + if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1) + fatal("%s: munmap", __func__); + } + + /* + * Ask vmm to enter the shared mappings for us. They'll point + * to the same host physical memory, but will have a randomized + * virtual address for the calling process. + */ + if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1) + return (errno); + return (0); } blob - c06fe974877c17710a81cd69bed541f908e76ef4 blob + 0d0a66533b05a80aafba31d415e97c0fb3a6be88 --- usr.sbin/vmd/vmd.c +++ usr.sbin/vmd/vmd.c @@ -774,7 +774,8 @@ main(int argc, char **argv) struct privsep *ps; int ch; enum privsep_procid proc_id = PROC_PARENT; - int proc_instance = 0, vm_launch = 0, vm_fd = -1; + int proc_instance = 0, vm_launch = 0; + int vmm_fd = -1, vm_fd = -1; const char *errp, *title = NULL; int argc0 = argc; char dev_type = '\0'; @@ -784,7 +785,7 @@ main(int argc, char **argv) if ((env = calloc(1, sizeof(*env))) == NULL) fatal("calloc: env"); - while ((ch = getopt(argc, argv, "D:P:I:V:X:df:nt:v")) != -1) { + while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:v")) != -1) { switch (ch) { case 'D': if (cmdline_symset(optarg) < 0) @@ -838,6 +839,11 @@ main(int argc, char **argv) default: fatalx("invalid device type"); } break; + case 'i': + vmm_fd = strtonum(optarg, 0, 128, &errp); + if (errp) + fatalx("invalid vmm fd"); + break; default: usage(); } @@ -866,7 +872,7 @@ main(int argc, char **argv) ps = &env->vmd_ps; ps->ps_env = env; - env->vmd_fd = -1; + env->vmd_fd = vmm_fd; if (config_init(env) == -1) fatal("failed to initialize configuration"); @@ -882,14 +888,14 @@ main(int argc, char **argv) * If we're launching a new vm or its device, we short out here. 
*/ if (vm_launch == VMD_LAUNCH_VM) { - vm_main(vm_fd); + vm_main(vm_fd, vmm_fd); /* NOTREACHED */ } else if (vm_launch == VMD_LAUNCH_DEV) { if (dev_type == VMD_DEVTYPE_NET) { - vionet_main(vm_fd); + vionet_main(vm_fd, vmm_fd); /* NOTREACHED */ } else if (dev_type == VMD_DEVTYPE_DISK) { - vioblk_main(vm_fd); + vioblk_main(vm_fd, vmm_fd); /* NOTREACHED */ } fatalx("unsupported device type '%c'", dev_type); blob - 68de0544706a5864aec480590191b33904864053 blob + 61b0cff0c62c9ed752a2128bea8b12bc34f918d7 --- usr.sbin/vmd/vmd.h +++ usr.sbin/vmd/vmd.h @@ -329,9 +329,6 @@ struct vmd_vm { struct timeval vm_start_tv; int vm_start_limit; - int vm_memfds[VMM_MAX_MEM_RANGES]; - size_t vm_nmemfds; - TAILQ_ENTRY(vmd_vm) vm_entry; }; TAILQ_HEAD(vmlist, vmd_vm); @@ -488,7 +485,7 @@ void vm_main(int); int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *)); /* vm.c */ -void vm_main(int); +void vm_main(int, int); void mutex_lock(pthread_mutex_t *); void mutex_unlock(pthread_mutex_t *); int read_mem(paddr_t, void *buf, size_t); @@ -499,7 +496,7 @@ int remap_guest_mem(struct vmd_vm *); enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *); int write_mem(paddr_t, const void *buf, size_t); void* hvaddr_mem(paddr_t, size_t); -int remap_guest_mem(struct vmd_vm *); +int remap_guest_mem(struct vmd_vm *, int); /* config.c */ int config_init(struct vmd *); @@ -527,9 +524,9 @@ __dead void vionet_main(int); int virtio_get_base(int, char *, size_t, int, const char *); /* vionet.c */ -__dead void vionet_main(int); +__dead void vionet_main(int, int); /* vioblk.c */ -__dead void vioblk_main(int); +__dead void vioblk_main(int, int); #endif /* VMD_H */ blob - 35119673dc31b82aec55e6dd8ef12eff3ef2beef blob + 7f6fe9040ad8b0c6774255a7d02f96322ea5e421 --- usr.sbin/vmd/vmm.c +++ usr.sbin/vmd/vmm.c @@ -627,7 +627,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p { struct vm_create_params *vcp; struct vmd_vm *vm; - char *nargv[6], num[32]; + char *nargv[8], num[32], vmm_fd[32]; int fd, ret = EINVAL; int fds[2]; pid_t vm_pid; @@ -701,16 +701,6 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p if (ret == EIO) goto err; - /* Send the fd number for /dev/vmm. */ - sz = atomicio(vwrite, fds[0], &env->vmd_fd, - sizeof(env->vmd_fd)); - if (sz != sizeof(env->vmd_fd)) { - log_warnx("%s: failed to send /dev/vmm fd for vm '%s'", - __func__, vcp->vcp_name); - ret = EIO; - goto err; - } - /* Read back the kernel-generated vm id from the child */ sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)); if (sz != sizeof(vcp->vcp_id)) { @@ -773,17 +763,21 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p memset(&nargv, 0, sizeof(nargv)); memset(num, 0, sizeof(num)); snprintf(num, sizeof(num), "%d", fds[1]); + memset(vmm_fd, 0, sizeof(vmm_fd)); + snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd); nargv[0] = env->argv0; nargv[1] = "-V"; nargv[2] = num; nargv[3] = "-n"; + nargv[4] = "-i"; + nargv[5] = vmm_fd; if (env->vmd_verbose) { - nargv[4] = "-v"; - nargv[5] = NULL; + nargv[6] = "-v"; + nargv[7] = NULL; } else - nargv[4] = NULL; + nargv[6] = NULL; /* Control resumes in vmd main(). */ execvp(nargv[0], nargv);
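For anyone poking at the vmd side: the device processes now only hold the extra pledge promises long enough to establish the mappings. A rough standalone illustration of that lifecycle (this is not vmd code; setup_shared_memory() and device_loop() are placeholders for remap_guest_mem() and the actual emulation loop):

#include <sys/types.h>

#include <fcntl.h>
#include <unistd.h>
#include <err.h>

static void
setup_shared_memory(int fd_vmm)
{
	/* Reserve va space and issue VMM_IOC_SHAREMEM, as sketched earlier. */
}

static void
device_loop(void)
{
	/* Emulate the virtio device using the shared guest memory. */
}

int
main(void)
{
	/*
	 * vmd passes an already-open /dev/vmm fd to the device process via
	 * the new -i flag; opening it here is only for illustration.
	 */
	int fd_vmm = open("/dev/vmm", O_RDWR);

	if (fd_vmm == -1)
		err(1, "open");

	/* "vmm" allows the ioctl, "proc" satisfies pledge_ioctl_vmm(). */
	if (pledge("stdio vmm proc", NULL) == -1)
		err(1, "pledge");

	setup_shared_memory(fd_vmm);

	/* The fd and the extra promises are only needed during setup. */
	close(fd_vmm);
	if (pledge("stdio", NULL) == -1)
		err(1, "pledge");

	device_loop();
	return (0);
}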