On Tue, May 09, 2023 at 06:12:55AM -0400, Dave Voutila wrote: > tech@, > > The diff below adds a new ioctl for vmm(4) that allows an emulated > device process to request that vmm(4) enter a shared mapping in its vmspace so > it can access guest memory without using a shared mapping backed by a > named file. > > Similar to the vm creation ioctl (VMM_IOC_CREATE), the caller requires > the "vmm" and "proc" pledge(2) promises. This allows the emulated > devices to do this setup early and drop these promises back down to just > "stdio" before any device emulation occurs. > > Feel free to skip to the diff (the regress change shows how it works in > a simplified case) or continue reading for reasoning behind this > change. I share this primarily for testers and feedback from other devs > while mlarkin@ reviews. > > To test: > > 1. apply diff > 2. build and install new kernel > 3. copy or symlink new vmm.h into /usr/include/dev/vmm/ > 4. build and reinstall vmd (no changes for vmctl needed) > > You should see no change during vm usage; however, you should now see no > change in /tmp consumption while unmounting things like NFS mounts or > usb disks. Read on for details. > > ... > > vmd(8) began emulating virtio network and block devices in subprocesses > with a commit I made at the recent hackathon in Morocco. It relies on > creating shared memory mappings using shm_mkstemp(3) and passing file > descriptors to the fork/exec'd child processes. > > I've since received reports that using named mappings for shared memory > is having 2 negative impacts: > > 1. increased overhead during vm teardown, often making systems > unresponsive (this is my conclusion based on only minimal evidence) > > 2. unmounting any device on the host while a vm is running causes some > guest memory to be flushed to disk (the backing file is already > unlinked, so not visible to other processes). > > (2) can cause /tmp to fill up or introduce failure conditions I'm not > sure we can recover from in vmd. 
It also has implications for other > services on the host. > > I don't own a fireproof suit that fits...so I'm not about to wade into > the VFS & UVM layers to figure out if (1) or (2) can be mitigated or > fixed on their own. > > One idea was to implement what FreeBSD borrowed from Linux in their > forever quest to become LinuxBSD: memfd_create(2) [1]. > > I took one look and decided this was not the time for me to be trying to > land a new syscall primarily for vmd (and some ports) and went another > route. > > [1] https://man7.org/linux/man-pages/man2/memfd_create.2.html > > -dv >
This does fix the unexpected shm issues. Thanks! Diff reads ok to me, go for it when you are happy with the test results. -ml > > diff refs/heads/master refs/heads/vmm-mapshare > commit - cec1ace2d4d21c85f4c8bacc2dd971721bf6b694 > commit + 8f533c371094c044b0127d468be5feaaf775811b > blob - f221b58f75c4eb01a3a04ae45c7cdb066b11361a > blob + 0e6f5ff858c51bd9707c657b154b0df1f8944c3b > --- regress/sys/arch/amd64/vmm/vcpu.c > +++ regress/sys/arch/amd64/vmm/vcpu.c > @@ -83,6 +83,7 @@ main(int argc, char **argv) > struct vm_resetcpu_params vresetp; > struct vm_run_params vrunp; > struct vm_terminate_params vtp; > + struct vm_sharemem_params vsp; > > struct vm_mem_range *vmr; > int fd, ret = 1; > @@ -127,8 +128,9 @@ main(int argc, char **argv) > ((uint8_t*)p)[j + 1] = PCKBC_AUX; > } > vmr->vmr_va = (vaddr_t)p; > - printf("mapped region %zu: { gpa: 0x%08lx, size: %lu }\n", > - i, vmr->vmr_gpa, vmr->vmr_size); > + printf("created mapped region %zu: { gpa: 0x%08lx, size: %lu," > + " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size, > + vmr->vmr_va); > } > > if (ioctl(fd, VMM_IOC_CREATE, &vcp) == -1) > @@ -136,8 +138,55 @@ main(int argc, char **argv) > printf("created vm %d named \"%s\"\n", vcp.vcp_id, vcp.vcp_name); > > /* > - * 2. Check that our VM exists. > + * 2. Check we can create shared memory mappings. > */ > + memset(&vsp, 0, sizeof(vsp)); > + vsp.vsp_nmemranges = vcp.vcp_nmemranges; > + memcpy(&vsp.vsp_memranges, &vcp.vcp_memranges, > + sizeof(vsp.vsp_memranges)); > + vsp.vsp_vm_id = vcp.vcp_id; > + > + /* Find some new va ranges... */ > + for (i = 0; i < vsp.vsp_nmemranges; i++) { > + vmr = &vsp.vsp_memranges[i]; > + p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_ANON, -1, 0); > + if (p == MAP_FAILED) > + err(1, "mmap"); > + vmr->vmr_va = (vaddr_t)p; > + } > + > + /* Release our mappings so vmm can replace them. 
*/ > + for (i = 0; i < vsp.vsp_nmemranges; i++) { > + vmr = &vsp.vsp_memranges[i]; > + munmap((void*)vmr->vmr_va, vmr->vmr_size); > + } > + > + /* Perform the shared mapping. */ > + if (ioctl(fd, VMM_IOC_SHAREMEM, &vsp) == -1) > + err(1, "VMM_IOC_SHAREMEM"); > + printf("created shared memory mappings\n"); > + > + /* We should see our reset vector instructions in the new mappings. */ > + for (i = 0; i < vsp.vsp_nmemranges; i++) { > + vmr = &vsp.vsp_memranges[i]; > + p = (void*)vmr->vmr_va; > + > + for (j = 0; j < vmr->vmr_size; j += 2) { > + if (((uint8_t*)p)[j + 0] != 0xE4) > + errx(1, "bad byte"); > + if (((uint8_t*)p)[j + 1] != PCKBC_AUX) > + errx(1, "bad byte"); > + } > + printf("checked shared region %zu: { gpa: 0x%08lx, size: %lu," > + " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size, > + vmr->vmr_va); > + } > + printf("validated shared memory mappings\n"); > + > + /* > + * 3. Check that our VM exists. > + */ > memset(&vip, 0, sizeof(vip)); > vip.vip_size = 0; > info = NULL; > @@ -189,7 +238,7 @@ main(int argc, char **argv) > ours = NULL; > > /* > - * 3. Reset our VCPU and initialize register state. > + * 4. Reset our VCPU and initialize register state. > */ > memset(&vresetp, 0, sizeof(vresetp)); > vresetp.vrp_vm_id = vcp.vcp_id; > @@ -205,7 +254,7 @@ main(int argc, char **argv) > vresetp.vrp_vm_id); > > /* > - * 4. Run the vcpu, expecting an immediate exit for IO assist. > + * 5. Run the vcpu, expecting an immediate exit for IO assist. > */ > exit = malloc(sizeof(*exit)); > if (exit == NULL) { > @@ -258,7 +307,7 @@ out: > > out: > /* > - * 5. Terminate our VM and clean up. > + * 6. Terminate our VM and clean up. 
> */ > memset(&vtp, 0, sizeof(vtp)); > vtp.vtp_vm_id = vcp.vcp_id; > @@ -277,13 +326,23 @@ out: > vmr = &vcp.vcp_memranges[i]; > if (vmr->vmr_va) { > if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) { > - warn("failed to unmap region %zu at 0x%08lx", > - i, vmr->vmr_va); > + warn("failed to unmap orginal region %zu @ hva " > + "0x%lx", i, vmr->vmr_va); > ret = 1; > } else > - printf("unmapped region %zu @ gpa 0x%08lx\n", > - i, vmr->vmr_gpa); > + printf("unmapped origin region %zu @ hva " > + "0x%lx\n", i, vmr->vmr_va); > } > + vmr = &vsp.vsp_memranges[i]; > + if (vmr->vmr_va) { > + if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) { > + warn("failed to unmap shared region %zu @ hva " > + "0x%lx", i, vmr->vmr_va); > + ret = 1; > + } else > + printf("unmapped shared region %zu @ hva " > + "0x%lx\n", i, vmr->vmr_va); > + } > } > > return (ret); > blob - d46b3431081b6d2e7e1adab884ec21b0aaa9761a > blob + 3daee7dad431ed200cbd734bc0f8b35bebd54216 > --- sys/dev/vmm/vmm.c > +++ sys/dev/vmm/vmm.c > @@ -262,6 +262,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag > case VMM_IOC_WRITEVMPARAMS: > ret = vm_rwvmparams((struct vm_rwvmparams_params *)data, 1); > break; > + case VMM_IOC_SHAREMEM: > + ret = vm_share_mem((struct vm_sharemem_params *)data, p); > + break; > default: > ret = vmmioctl_machdep(dev, cmd, data, flag, p); > break; > @@ -286,6 +289,7 @@ pledge_ioctl_vmm(struct proc *p, long com) > switch (com) { > case VMM_IOC_CREATE: > case VMM_IOC_INFO: > + case VMM_IOC_SHAREMEM: > /* The "parent" process in vmd forks and manages VMs */ > if (p->p_p->ps_pledge & PLEDGE_PROC) > return (0); > @@ -780,3 +784,82 @@ vcpu_must_stop(struct vcpu *vcpu) > return (1); > return (0); > } > + > +/* > + * vm_share_mem > + * > + * Share a uvm mapping for the vm guest memory ranges into the calling > process. 
> + * > + * Return values: > + * 0: if successful > + * ENOENT: if the vm cannot be found by vm_find > + * EPERM: if the vm cannot be accessed by the current process > + * EINVAL: if the provide memory ranges fail checks > + * ENOMEM: if uvm_share fails to find available memory in the destination > map > + */ > +int > +vm_share_mem(struct vm_sharemem_params *vsp, struct proc *p) > +{ > + int ret = EINVAL; > + size_t i, n; > + struct vm *vm; > + struct vm_mem_range *src, *dst; > + > + ret = vm_find(vsp->vsp_vm_id, &vm); > + if (ret) > + return (ret); > + > + /* Check we have the expected number of ranges. */ > + if (vm->vm_nmemranges != vsp->vsp_nmemranges) > + goto out; > + n = vm->vm_nmemranges; > + > + /* Check their types, sizes, and gpa's (implying page alignment). */ > + for (i = 0; i < n; i++) { > + src = &vm->vm_memranges[i]; > + dst = &vsp->vsp_memranges[i]; > + > + /* > + * The vm memranges were already checked during creation, so > + * compare to them to confirm validity of mapping request. > + */ > + if (src->vmr_type != dst->vmr_type) > + goto out; > + if (src->vmr_gpa != dst->vmr_gpa) > + goto out; > + if (src->vmr_size != dst->vmr_size) > + goto out; > + > + /* Check our intended destination is page-aligned. */ > + if (dst->vmr_va & PAGE_MASK) > + goto out; > + } > + > + /* > + * Share each range individually with the calling process. We do > + * not need PROC_EXEC as the emulated devices do not need to execute > + * instructions from guest memory. > + */ > + for (i = 0; i < n; i++) { > + src = &vm->vm_memranges[i]; > + dst = &vsp->vsp_memranges[i]; > + > + /* Skip MMIO range. 
*/ > + if (src->vmr_type == VM_MEM_MMIO) > + continue; > + > + DPRINTF("sharing gpa=0x%lx for pid %d @ va=0x%lx\n", > + src->vmr_gpa, p->p_p->ps_pid, dst->vmr_va); > + ret = uvm_share(&p->p_vmspace->vm_map, dst->vmr_va, > + PROT_READ | PROT_WRITE, vm->vm_map, src->vmr_gpa, > + src->vmr_size); > + if (ret) { > + printf("%s: uvm_share failed (%d)\n", __func__, ret); > + break; > + } > + } > + ret = 0; > +out: > + refcnt_rele_wake(&vm->vm_refcnt); > + return (ret); > +} > blob - d2355d42b44b51044901de9d0adc0239586f37b8 > blob + 7b3b0d77ad550165b6e53f4de66723366d689b23 > --- sys/dev/vmm/vmm.h > +++ sys/dev/vmm/vmm.h > @@ -76,6 +76,13 @@ struct vm_resetcpu_params { > struct vcpu_reg_state vrp_init_state; > }; > > +struct vm_sharemem_params { > + /* Input parameters to VMM_IOC_SHAREMEM */ > + uint32_t vsp_vm_id; > + size_t vsp_nmemranges; > + struct vm_mem_range vsp_memranges[VMM_MAX_MEM_RANGES]; > +}; > + > /* IOCTL definitions */ > #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */ > #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */ > @@ -88,8 +95,8 @@ struct vm_resetcpu_params { > #define VMM_IOC_READVMPARAMS _IOWR('V', 9, struct vm_rwvmparams_params) > /* Set VM params */ > #define VMM_IOC_WRITEVMPARAMS _IOW('V', 10, struct vm_rwvmparams_params) > +#define VMM_IOC_SHAREMEM _IOW('V', 11, struct vm_sharemem_params) > > - > #ifdef _KERNEL > > /* #define VMM_DEBUG */ > @@ -194,6 +201,7 @@ int vcpu_must_stop(struct vcpu *); > int vm_terminate(struct vm_terminate_params *); > int vm_resetcpu(struct vm_resetcpu_params *); > int vcpu_must_stop(struct vcpu *); > +int vm_share_mem(struct vm_sharemem_params *, struct proc *); > > #endif /* _KERNEL */ > #endif /* DEV_VMM_H */ > blob - 9373a135aa87755f0e34b821af4ab8c0f4970421 > blob + dd0efc2fd71bb0ac91d2e389a30e779d8d6c6c0d > --- usr.sbin/vmd/vioblk.c > +++ usr.sbin/vmd/vioblk.c > @@ -58,7 +58,7 @@ vioblk_main(int fd) > } > > __dead void > -vioblk_main(int fd) > +vioblk_main(int 
fd, int fd_vmm) > { > struct virtio_dev dev; > struct vioblk_dev *vioblk; > @@ -71,8 +71,11 @@ vioblk_main(int fd) > > log_procinit("vioblk"); > > - /* stdio - needed for read/write to disk fds and channels to the vm. */ > - if (pledge("stdio", NULL) == -1) > + /* > + * stdio - needed for read/write to disk fds and channels to the vm. > + * vmm + proc - needed to create shared vm mappings. > + */ > + if (pledge("stdio vmm proc", NULL) == -1) > fatal("pledge"); > > /* Receive our virtio_dev, mostly preconfigured. */ > @@ -92,8 +95,9 @@ vioblk_main(int fd) > vioblk = &dev.vioblk; > > log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, " > - "async fd = %d, sz = %lld maxfer = %d", __func__, vioblk->ndisk_fd, > - dev.sync_fd, dev.async_fd, vioblk->sz, vioblk->max_xfer); > + "async fd = %d, sz = %lld maxfer = %d, vmm fd = %d", __func__, > + vioblk->ndisk_fd, dev.sync_fd, dev.async_fd, vioblk->sz, > + vioblk->max_xfer, fd_vmm); > > /* Receive our vm information from the vm process. */ > memset(&vm, 0, sizeof(vm)); > @@ -108,12 +112,19 @@ vioblk_main(int fd) > setproctitle("%s/vioblk[%d]", vcp->vcp_name, vioblk->idx); > > /* Now that we have our vm information, we can remap memory. */ > - ret = remap_guest_mem(&vm); > + ret = remap_guest_mem(&vm, fd_vmm); > if (ret) { > log_warnx("failed to remap guest memory"); > goto fail; > } > > + /* > + * We no longer need /dev/vmm access. > + */ > + close_fd(fd_vmm); > + if (pledge("stdio", NULL) == -1) > + fatal("pledge2"); > + > /* Initialize the virtio block abstractions. 
*/ > type = vm.vm_params.vmc_disktypes[vioblk->idx]; > switch (type) { > blob - 6ce905fdccfa7befb49353c23628227fdc74c486 > blob + d75dc06b9bc0133ace3e74de85abba3b62f539dc > --- usr.sbin/vmd/vionet.c > +++ usr.sbin/vmd/vionet.c > @@ -61,7 +61,7 @@ vionet_main(int fd) > static void handle_sync_io(int, short, void *); > > __dead void > -vionet_main(int fd) > +vionet_main(int fd, int fd_vmm) > { > struct virtio_dev dev; > struct vionet_dev *vionet = NULL; > @@ -73,8 +73,11 @@ vionet_main(int fd) > > log_procinit("vionet"); > > - /* stdio - needed for read/write to tap fd and channels to the vm. */ > - if (pledge("stdio", NULL) == -1) > + /* > + * stdio - needed for read/write to disk fds and channels to the vm. > + * vmm + proc - needed to create shared vm mappings. > + */ > + if (pledge("stdio vmm proc", NULL) == -1) > fatal("pledge"); > > /* Receive our vionet_dev, mostly preconfigured. */ > @@ -92,8 +95,9 @@ vionet_main(int fd) > dev.sync_fd = fd; > vionet = &dev.vionet; > > - log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d", > - __func__, vionet->data_fd, dev.sync_fd, dev.async_fd); > + log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d" > + ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd, > + dev.async_fd, fd_vmm); > > /* Receive our vm information from the vm process. */ > memset(&vm, 0, sizeof(vm)); > @@ -108,10 +112,19 @@ vionet_main(int fd) > setproctitle("%s/vionet[%d]", vcp->vcp_name, vionet->idx); > > /* Now that we have our vm information, we can remap memory. */ > - ret = remap_guest_mem(&vm); > - if (ret) > + ret = remap_guest_mem(&vm, fd_vmm); > + if (ret) { > + fatal("%s: failed to remap", __func__); > goto fail; > + } > > + /* > + * We no longer need /dev/vmm access. > + */ > + close_fd(fd_vmm); > + if (pledge("stdio", NULL) == -1) > + fatal("pledge2"); > + > /* If we're restoring hardware, re-initialize virtqueue hva's. 
*/ > if (vm.vm_state & VM_STATE_RECEIVED) { > struct virtio_vq_info *vq_info; > blob - 92e77b8f83431c2ff52824e35149477909612653 > blob + e3f6d1371ab7795df6e9d4b370ffd1a7c5afbde6 > --- usr.sbin/vmd/virtio.c > +++ usr.sbin/vmd/virtio.c > @@ -1297,7 +1297,7 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev > static int > virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev) > { > - char *nargv[8], num[32], t[2]; > + char *nargv[10], num[32], vmm_fd[32], t[2]; > pid_t dev_pid; > int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0; > size_t i, j, data_fds_sz, sz = 0; > @@ -1483,6 +1483,8 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev > memset(&nargv, 0, sizeof(nargv)); > memset(num, 0, sizeof(num)); > snprintf(num, sizeof(num), "%d", sync_fds[1]); > + memset(vmm_fd, 0, sizeof(vmm_fd)); > + snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd); > > t[0] = dev->dev_type; > t[1] = '\0'; > @@ -1492,13 +1494,15 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev > nargv[2] = num; > nargv[3] = "-t"; > nargv[4] = t; > - nargv[5] = "-n"; > + nargv[5] = "-i"; > + nargv[6] = vmm_fd; > + nargv[7] = "-n"; > > if (env->vmd_verbose) { > - nargv[6] = "-v"; > - nargv[7] = NULL; > + nargv[8] = "-v"; > + nargv[9] = NULL; > } else > - nargv[6] = NULL; > + nargv[8] = NULL; > > /* Control resumes in vmd.c:main(). */ > execvp(nargv[0], nargv); > blob - d42abb5a834cb5f6d9777d8b23f92a6a1c5930f2 > blob + 48ec88c37db24290e069bf0ab9249df764b2c424 > --- usr.sbin/vmd/vm.c > +++ usr.sbin/vmd/vm.c > @@ -218,9 +218,10 @@ static const struct vcpu_reg_state vcpu_init_flat16 = > * Primary entrypoint for launching a vm. Does not return. > * > * fd: file descriptor for communicating with vmm process. 
> + * fd_vmm: file descriptor for communicating with vmm(4) device > */ > void > -vm_main(int fd) > +vm_main(int fd, int vmm_fd) > { > struct vm_create_params *vcp = NULL; > struct vmd_vm vm; > @@ -241,9 +242,8 @@ vm_main(int fd) > * vmm - for the vmm ioctls and operations. > * proc exec - fork/exec for launching devices. > * recvfd - for vm send/recv and sending fd to devices. > - * tmppath/rpath - for shm_mkstemp, ftruncate, unlink > */ > - if (pledge("stdio vmm proc exec recvfd tmppath rpath", NULL) == -1) > + if (pledge("stdio vmm proc exec recvfd", NULL) == -1) > fatal("pledge"); > > /* Receive our vm configuration. */ > @@ -254,13 +254,6 @@ vm_main(int fd) > _exit(EIO); > } > > - /* Receive the /dev/vmm fd number. */ > - sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd)); > - if (sz != sizeof(env->vmd_fd)) { > - log_warnx("failed to receive /dev/vmm fd"); > - _exit(EIO); > - } > - > /* Update process with the vm name. */ > vcp = &vm.vm_params.vmc_params; > setproctitle("%s", vcp->vcp_name); > @@ -1099,63 +1092,34 @@ alloc_guest_mem(struct vmd_vm *vm) > alloc_guest_mem(struct vmd_vm *vm) > { > void *p; > - char *tmp; > - int fd, ret = 0; > + int ret = 0; > size_t i, j; > struct vm_create_params *vcp = &vm->vm_params.vmc_params; > struct vm_mem_range *vmr; > > - tmp = calloc(32, sizeof(char)); > - if (tmp == NULL) { > - ret = errno; > - log_warn("%s: calloc", __func__); > - return (ret); > - } > - strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); > - > - vm->vm_nmemfds = vcp->vcp_nmemranges; > - > for (i = 0; i < vcp->vcp_nmemranges; i++) { > vmr = &vcp->vcp_memranges[i]; > > - fd = shm_mkstemp(tmp); > - if (fd < 0) { > - ret = errno; > - log_warn("%s: shm_mkstemp", __func__); > - return (ret); > - } > - if (ftruncate(fd, vmr->vmr_size) == -1) { > - ret = errno; > - log_warn("%s: ftruncate", __func__); > - goto out; > - } > - if (fcntl(fd, F_SETFD, 0) == -1) { > - ret = errno; > - log_warn("%s: fcntl", __func__); > - goto out; > - } > - if (shm_unlink(tmp) == 
-1) { > - ret = errno; > - log_warn("%s: shm_unlink", __func__); > - goto out; > - } > - strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); > - > + /* > + * We only need R/W as userland. vmm(4) will use R/W/X in its > + * mapping. > + * > + * We must use MAP_SHARED so emulated devices will be able > + * to generate shared mappings. > + */ > p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, > - MAP_SHARED | MAP_CONCEAL, fd, 0); > + MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0); > if (p == MAP_FAILED) { > ret = errno; > for (j = 0; j < i; j++) { > vmr = &vcp->vcp_memranges[j]; > munmap((void *)vmr->vmr_va, vmr->vmr_size); > } > - goto out; > + return (ret); > } > - vm->vm_memfds[i] = fd; > vmr->vmr_va = (vaddr_t)p; > } > -out: > - free(tmp); > + > return (ret); > } > > @@ -2552,10 +2516,11 @@ remap_guest_mem(struct vmd_vm *vm) > * Returns 0 on success, non-zero in event of failure. > */ > int > -remap_guest_mem(struct vmd_vm *vm) > +remap_guest_mem(struct vmd_vm *vm, int vmm_fd) > { > struct vm_create_params *vcp; > struct vm_mem_range *vmr; > + struct vm_sharemem_params vsp; > size_t i, j; > void *p = NULL; > int ret; > @@ -2566,23 +2531,32 @@ remap_guest_mem(struct vmd_vm *vm) > vcp = &vm->vm_params.vmc_params; > > /* > - * We've execve'd, so we need to re-map the guest VM memory. Iterate > - * over all possible vm_mem_range entries so we can initialize all > - * file descriptors to a value. > + * Initialize our VM shared memory request using our original > + * creation parameters. We'll overwrite the va's after mmap(2). > */ > + memset(&vsp, 0, sizeof(vsp)); > + vsp.vsp_nmemranges = vcp->vcp_nmemranges; > + vsp.vsp_vm_id = vcp->vcp_id; > + memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges, > + sizeof(vsp.vsp_memranges)); > + > + /* > + * Use mmap(2) to identify virtual address space for our mappings. > + */ > for (i = 0; i < VMM_MAX_MEM_RANGES; i++) { > - if (i < vcp->vcp_nmemranges) { > - vmr = &vcp->vcp_memranges[i]; > - /* Skip ranges we know we don't need right now. 
*/ > + if (i < vsp.vsp_nmemranges) { > + vmr = &vsp.vsp_memranges[i]; > + > + /* Ignore any MMIO ranges. */ > if (vmr->vmr_type == VM_MEM_MMIO) { > - log_debug("%s: skipping range i=%ld, type=%d", > - __func__, i, vmr->vmr_type); > - vm->vm_memfds[i] = -1; > + vmr->vmr_va = 0; > + vcp->vcp_memranges[i].vmr_va = 0; > continue; > } > - /* Re-mmap the memrange. */ > - p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, > - MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0); > + > + /* Make initial mappings for the memrange. */ > + p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1, > + 0); > if (p == MAP_FAILED) { > ret = errno; > log_warn("%s: mmap", __func__); > @@ -2594,11 +2568,29 @@ remap_guest_mem(struct vmd_vm *vm) > return (ret); > } > vmr->vmr_va = (vaddr_t)p; > - } else { > - /* Initialize with an invalid fd. */ > - vm->vm_memfds[i] = -1; > + vcp->vcp_memranges[i].vmr_va = vmr->vmr_va; > } > } > > + /* > + * munmap(2) now that we have va's and ranges that don't overlap. vmm > + * will use the va's and sizes to recreate the mappings for us. > + */ > + for (i = 0; i < vsp.vsp_nmemranges; i++) { > + vmr = &vsp.vsp_memranges[i]; > + if (vmr->vmr_type == VM_MEM_MMIO) > + continue; > + if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1) > + fatal("%s: munmap", __func__); > + } > + > + /* > + * Ask vmm to enter the shared mappings for us. They'll point > + * to the same host physical memory, but will have a randomized > + * virtual address for the calling process. 
> + */ > + if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1) > + return (errno); > + > return (0); > } > blob - c06fe974877c17710a81cd69bed541f908e76ef4 > blob + 0d0a66533b05a80aafba31d415e97c0fb3a6be88 > --- usr.sbin/vmd/vmd.c > +++ usr.sbin/vmd/vmd.c > @@ -774,7 +774,8 @@ main(int argc, char **argv) > struct privsep *ps; > int ch; > enum privsep_procid proc_id = PROC_PARENT; > - int proc_instance = 0, vm_launch = 0, vm_fd = -1; > + int proc_instance = 0, vm_launch = 0; > + int vmm_fd = -1, vm_fd = -1; > const char *errp, *title = NULL; > int argc0 = argc; > char dev_type = '\0'; > @@ -784,7 +785,7 @@ main(int argc, char **argv) > if ((env = calloc(1, sizeof(*env))) == NULL) > fatal("calloc: env"); > > - while ((ch = getopt(argc, argv, "D:P:I:V:X:df:nt:v")) != -1) { > + while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:v")) != -1) { > switch (ch) { > case 'D': > if (cmdline_symset(optarg) < 0) > @@ -838,6 +839,11 @@ main(int argc, char **argv) > default: fatalx("invalid device type"); > } > break; > + case 'i': > + vmm_fd = strtonum(optarg, 0, 128, &errp); > + if (errp) > + fatalx("invalid vmm fd"); > + break; > default: > usage(); > } > @@ -866,7 +872,7 @@ main(int argc, char **argv) > > ps = &env->vmd_ps; > ps->ps_env = env; > - env->vmd_fd = -1; > + env->vmd_fd = vmm_fd; > > if (config_init(env) == -1) > fatal("failed to initialize configuration"); > @@ -882,14 +888,14 @@ main(int argc, char **argv) > * If we're launching a new vm or its device, we short out here. 
> */ > if (vm_launch == VMD_LAUNCH_VM) { > - vm_main(vm_fd); > + vm_main(vm_fd, vmm_fd); > /* NOTREACHED */ > } else if (vm_launch == VMD_LAUNCH_DEV) { > if (dev_type == VMD_DEVTYPE_NET) { > - vionet_main(vm_fd); > + vionet_main(vm_fd, vmm_fd); > /* NOTREACHED */ > } else if (dev_type == VMD_DEVTYPE_DISK) { > - vioblk_main(vm_fd); > + vioblk_main(vm_fd, vmm_fd); > /* NOTREACHED */ > } > fatalx("unsupported device type '%c'", dev_type); > blob - 68de0544706a5864aec480590191b33904864053 > blob + 61b0cff0c62c9ed752a2128bea8b12bc34f918d7 > --- usr.sbin/vmd/vmd.h > +++ usr.sbin/vmd/vmd.h > @@ -329,9 +329,6 @@ struct vmd_vm { > struct timeval vm_start_tv; > int vm_start_limit; > > - int vm_memfds[VMM_MAX_MEM_RANGES]; > - size_t vm_nmemfds; > - > TAILQ_ENTRY(vmd_vm) vm_entry; > }; > TAILQ_HEAD(vmlist, vmd_vm); > @@ -488,7 +485,7 @@ void vm_main(int); > int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *)); > > /* vm.c */ > -void vm_main(int); > +void vm_main(int, int); > void mutex_lock(pthread_mutex_t *); > void mutex_unlock(pthread_mutex_t *); > int read_mem(paddr_t, void *buf, size_t); > @@ -499,7 +496,7 @@ int remap_guest_mem(struct vmd_vm *); > enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *); > int write_mem(paddr_t, const void *buf, size_t); > void* hvaddr_mem(paddr_t, size_t); > -int remap_guest_mem(struct vmd_vm *); > +int remap_guest_mem(struct vmd_vm *, int); > > /* config.c */ > int config_init(struct vmd *); > @@ -527,9 +524,9 @@ __dead void vionet_main(int); > int virtio_get_base(int, char *, size_t, int, const char *); > > /* vionet.c */ > -__dead void vionet_main(int); > +__dead void vionet_main(int, int); > > /* vioblk.c */ > -__dead void vioblk_main(int); > +__dead void vioblk_main(int, int); > > #endif /* VMD_H */ > blob - 35119673dc31b82aec55e6dd8ef12eff3ef2beef > blob + 7f6fe9040ad8b0c6774255a7d02f96322ea5e421 > --- usr.sbin/vmd/vmm.c > +++ usr.sbin/vmd/vmm.c > @@ -627,7 +627,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p 
> { > struct vm_create_params *vcp; > struct vmd_vm *vm; > - char *nargv[6], num[32]; > + char *nargv[8], num[32], vmm_fd[32]; > int fd, ret = EINVAL; > int fds[2]; > pid_t vm_pid; > @@ -701,16 +701,6 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p > if (ret == EIO) > goto err; > > - /* Send the fd number for /dev/vmm. */ > - sz = atomicio(vwrite, fds[0], &env->vmd_fd, > - sizeof(env->vmd_fd)); > - if (sz != sizeof(env->vmd_fd)) { > - log_warnx("%s: failed to send /dev/vmm fd for vm '%s'", > - __func__, vcp->vcp_name); > - ret = EIO; > - goto err; > - } > - > /* Read back the kernel-generated vm id from the child */ > sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id)); > if (sz != sizeof(vcp->vcp_id)) { > @@ -773,17 +763,21 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p > memset(&nargv, 0, sizeof(nargv)); > memset(num, 0, sizeof(num)); > snprintf(num, sizeof(num), "%d", fds[1]); > + memset(vmm_fd, 0, sizeof(vmm_fd)); > + snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd); > > nargv[0] = env->argv0; > nargv[1] = "-V"; > nargv[2] = num; > nargv[3] = "-n"; > + nargv[4] = "-i"; > + nargv[5] = vmm_fd; > > if (env->vmd_verbose) { > - nargv[4] = "-v"; > - nargv[5] = NULL; > + nargv[6] = "-v"; > + nargv[7] = NULL; > } else > - nargv[4] = NULL; > + nargv[6] = NULL; > > /* Control resumes in vmd main(). */ > execvp(nargv[0], nargv);