On Tue, May 09, 2023 at 06:12:55AM -0400, Dave Voutila wrote:
> tech@,
>
> The diff below adds a new ioctl for vmm(4) that allows an emulated
> device process request vmm(4) enter a shared mapping in its vmspace so
> it can access guest memory without using a shared mapping backed by a
> named file.
>
> Similar to the vm creation ioctl (VMM_IOC_CREATE), the caller requires
> the "vmm" and "proc" pledge(2) promises. This allows the emulated
> devices to do this setup early and drop these promises back down to just
> "stdio" before any device emulation occurs.
>
> Feel free to skip to the diff (the regress change shows how it works in
> a simplified case) or continue reading for reasoning behind this
> change. I share this primarily for testers and feedback from other devs
> while mlarkin@ reviews.
>
> To test:
>
> 1. apply diff
> 2. build and install new kernel
> 3. copy or symlink new vmm.h into /usr/include/dev/vmm/
> 4. build and reinstall vmd (no changes for vmctl needed)
>
> You should see no change during vm usage, however you should now see no
> change in /tmp consumption while unmounting things like NFS mounts or
> usb disks. Read on for details.
>
> ...
>
> vmd(8) began emulating virtio network and block devices in subprocesses
> with a commit I made at the recent hackathon in Morocco. It relies on
> creating shared memory mappings using shm_mkstemp(3) and passing file
> descriptors to the fork/exec'd child processes.
>
> I've since received reports that using named mappings for shared memory
> is having 2 negative impacts:
>
> 1. increased overhead during vm teardown, often making systems
>    unresponsive (this is my conclusion based on only minimal evidence)
>
> 2. unmounting any device on the host while a vm is running causes some
>    guest memory to be flushed to disk (the backing file is already
>    unlinked, so not visible to other processes).
>
> (2) can cause /tmp to fill up or introduce failure conditions I'm not
> sure we can recover from in vmd. It also has implications for other
> services on the host.
>
> I don't own a fireproof suit that fits...so I'm not about to wade into
> the VFS & UVM layers to figure out if (1) or (2) can be mitigated or
> fixed on their own.
>
> One idea was to implement what FreeBSD borrowed from Linux in their
> forever quest to become LinuxBSD: memfd_create(2) [1].
>
> I took one look and decided this was not the time for me to be trying to
> land a new syscall primarily for vmd (and some ports) and went another
> route.
>
> [1] https://man7.org/linux/man-pages/man2/memfd_create.2.html
>
> -dv
>

This does fix the unexpected shm issues. Thanks!

Diff reads ok to me, go for it when you are happy with the test results.

-ml

>
> diff refs/heads/master refs/heads/vmm-mapshare
> commit - cec1ace2d4d21c85f4c8bacc2dd971721bf6b694
> commit + 8f533c371094c044b0127d468be5feaaf775811b
> blob - f221b58f75c4eb01a3a04ae45c7cdb066b11361a
> blob + 0e6f5ff858c51bd9707c657b154b0df1f8944c3b
> --- regress/sys/arch/amd64/vmm/vcpu.c
> +++ regress/sys/arch/amd64/vmm/vcpu.c
> @@ -83,6 +83,7 @@ main(int argc, char **argv)
>       struct vm_resetcpu_params        vresetp;
>       struct vm_run_params             vrunp;
>       struct vm_terminate_params       vtp;
> +     struct vm_sharemem_params        vsp;
>
>       struct vm_mem_range             *vmr;
>       int                              fd, ret = 1;
> @@ -127,8 +128,9 @@ main(int argc, char **argv)
>                       ((uint8_t*)p)[j + 1] = PCKBC_AUX;
>               }
>               vmr->vmr_va = (vaddr_t)p;
> -             printf("mapped region %zu: { gpa: 0x%08lx, size: %lu }\n",
> -                 i, vmr->vmr_gpa, vmr->vmr_size);
> +             printf("created mapped region %zu: { gpa: 0x%08lx, size: %lu,"
> +                 " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size,
> +                 vmr->vmr_va);
>       }
>
>       if (ioctl(fd, VMM_IOC_CREATE, &vcp) == -1)
> @@ -136,8 +138,55 @@ main(int argc, char **argv)
>       printf("created vm %d named \"%s\"\n", vcp.vcp_id, vcp.vcp_name);
>
>       /*
> -      * 2. Check that our VM exists.
> +      * 2. Check we can create shared memory mappings.
>        */
> +     memset(&vsp, 0, sizeof(vsp));
> +     vsp.vsp_nmemranges = vcp.vcp_nmemranges;
> +     memcpy(&vsp.vsp_memranges, &vcp.vcp_memranges,
> +         sizeof(vsp.vsp_memranges));
> +     vsp.vsp_vm_id = vcp.vcp_id;
> +
> +     /* Find some new va ranges... */
> +     for (i = 0; i < vsp.vsp_nmemranges; i++) {
> +             vmr = &vsp.vsp_memranges[i];
> +             p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
> +                 MAP_PRIVATE | MAP_ANON, -1, 0);
> +             if (p == MAP_FAILED)
> +                     err(1, "mmap");
> +             vmr->vmr_va = (vaddr_t)p;
> +     }
> +
> +     /* Release our mappings so vmm can replace them. */
> +     for (i = 0; i < vsp.vsp_nmemranges; i++) {
> +             vmr = &vsp.vsp_memranges[i];
> +             munmap((void*)vmr->vmr_va, vmr->vmr_size);
> +     }
> +
> +     /* Perform the shared mapping. */
> +     if (ioctl(fd, VMM_IOC_SHAREMEM, &vsp) == -1)
> +             err(1, "VMM_IOC_SHAREMEM");
> +     printf("created shared memory mappings\n");
> +
> +     /* We should see our reset vector instructions in the new mappings. */
> +     for (i = 0; i < vsp.vsp_nmemranges; i++) {
> +             vmr = &vsp.vsp_memranges[i];
> +             p = (void*)vmr->vmr_va;
> +
> +             for (j = 0; j < vmr->vmr_size; j += 2) {
> +                     if (((uint8_t*)p)[j + 0] != 0xE4)
> +                             errx(1, "bad byte");
> +                     if (((uint8_t*)p)[j + 1] != PCKBC_AUX)
> +                             errx(1, "bad byte");
> +             }
> +             printf("checked shared region %zu: { gpa: 0x%08lx, size: %lu,"
> +                 " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size,
> +                 vmr->vmr_va);
> +     }
> +     printf("validated shared memory mappings\n");
> +
> +     /*
> +      * 3. Check that our VM exists.
> +      */
>       memset(&vip, 0, sizeof(vip));
>       vip.vip_size = 0;
>       info = NULL;
> @@ -189,7 +238,7 @@ main(int argc, char **argv)
>       ours = NULL;
>
>       /*
> -      * 3. Reset our VCPU and initialize register state.
> +      * 4. Reset our VCPU and initialize register state.
>        */
>       memset(&vresetp, 0, sizeof(vresetp));
>       vresetp.vrp_vm_id = vcp.vcp_id;
> @@ -205,7 +254,7 @@ main(int argc, char **argv)
>           vresetp.vrp_vm_id);
>
>       /*
> -      * 4. Run the vcpu, expecting an immediate exit for IO assist.
> +      * 5. Run the vcpu, expecting an immediate exit for IO assist.
>        */
>       exit = malloc(sizeof(*exit));
>       if (exit == NULL) {
> @@ -258,7 +307,7 @@ out:
>
>  out:
>       /*
> -      * 5. Terminate our VM and clean up.
> +      * 6. Terminate our VM and clean up.
>        */
>       memset(&vtp, 0, sizeof(vtp));
>       vtp.vtp_vm_id = vcp.vcp_id;
> @@ -277,13 +326,23 @@ out:
>               vmr = &vcp.vcp_memranges[i];
>               if (vmr->vmr_va) {
>                       if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) {
> -                             warn("failed to unmap region %zu at 0x%08lx",
> -                                 i, vmr->vmr_va);
> +                             warn("failed to unmap orginal region %zu @ hva "
> +                                 "0x%lx", i, vmr->vmr_va);
>                               ret = 1;
>                       } else
> -                             printf("unmapped region %zu @ gpa 0x%08lx\n",
> -                                 i, vmr->vmr_gpa);
> +                             printf("unmapped origin region %zu @ hva "
> +                                 "0x%lx\n", i, vmr->vmr_va);
>               }
> +             vmr = &vsp.vsp_memranges[i];
> +             if (vmr->vmr_va) {
> +                     if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) {
> +                             warn("failed to unmap shared region %zu @ hva "
> +                                 "0x%lx", i, vmr->vmr_va);
> +                             ret = 1;
> +                     } else
> +                             printf("unmapped shared region %zu @ hva "
> +                                 "0x%lx\n", i, vmr->vmr_va);
> +             }
>       }
>
>       return (ret);
> blob - d46b3431081b6d2e7e1adab884ec21b0aaa9761a
> blob + 3daee7dad431ed200cbd734bc0f8b35bebd54216
> --- sys/dev/vmm/vmm.c
> +++ sys/dev/vmm/vmm.c
> @@ -262,6 +262,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag
>       case VMM_IOC_WRITEVMPARAMS:
>               ret = vm_rwvmparams((struct vm_rwvmparams_params *)data, 1);
>               break;
> +     case VMM_IOC_SHAREMEM:
> +             ret = vm_share_mem((struct vm_sharemem_params *)data, p);
> +             break;
>       default:
>               ret = vmmioctl_machdep(dev, cmd, data, flag, p);
>               break;
> @@ -286,6 +289,7 @@ pledge_ioctl_vmm(struct proc *p, long com)
>       switch (com) {
>       case VMM_IOC_CREATE:
>       case VMM_IOC_INFO:
> +     case VMM_IOC_SHAREMEM:
>               /* The "parent" process in vmd forks and manages VMs */
>               if (p->p_p->ps_pledge & PLEDGE_PROC)
>                       return (0);
> @@ -780,3 +784,82 @@ vcpu_must_stop(struct vcpu *vcpu)
>               return (1);
>       return (0);
>  }
> +
> +/*
> + * vm_share_mem
> + *
> + * Share a uvm mapping for the vm guest memory ranges into the calling 
> process.
> + *
> + * Return values:
> + *  0: if successful
> + *  ENOENT: if the vm cannot be found by vm_find
> + *  EPERM: if the vm cannot be accessed by the current process
> + *  EINVAL: if the provide memory ranges fail checks
> + *  ENOMEM: if uvm_share fails to find available memory in the destination 
> map
> + */
> +int
> +vm_share_mem(struct vm_sharemem_params *vsp, struct proc *p)
> +{
> +     int ret = EINVAL;
> +     size_t i, n;
> +     struct vm *vm;
> +     struct vm_mem_range *src, *dst;
> +
> +     ret = vm_find(vsp->vsp_vm_id, &vm);
> +     if (ret)
> +             return (ret);
> +
> +     /* Check we have the expected number of ranges. */
> +     if (vm->vm_nmemranges != vsp->vsp_nmemranges)
> +             goto out;
> +     n = vm->vm_nmemranges;
> +
> +     /* Check their types, sizes, and gpa's (implying page alignment). */
> +     for (i = 0; i < n; i++) {
> +             src = &vm->vm_memranges[i];
> +             dst = &vsp->vsp_memranges[i];
> +
> +             /*
> +              * The vm memranges were already checked during creation, so
> +              * compare to them to confirm validity of mapping request.
> +              */
> +             if (src->vmr_type != dst->vmr_type)
> +                     goto out;
> +             if (src->vmr_gpa != dst->vmr_gpa)
> +                     goto out;
> +             if (src->vmr_size != dst->vmr_size)
> +                     goto out;
> +
> +             /* Check our intended destination is page-aligned. */
> +             if (dst->vmr_va & PAGE_MASK)
> +                     goto out;
> +     }
> +
> +     /*
> +      * Share each range individually with the calling process. We do
> +      * not need PROC_EXEC as the emulated devices do not need to execute
> +      * instructions from guest memory.
> +      */
> +     for (i = 0; i < n; i++) {
> +             src = &vm->vm_memranges[i];
> +             dst = &vsp->vsp_memranges[i];
> +
> +             /* Skip MMIO range. */
> +             if (src->vmr_type == VM_MEM_MMIO)
> +                     continue;
> +
> +             DPRINTF("sharing gpa=0x%lx for pid %d @ va=0x%lx\n",
> +                 src->vmr_gpa, p->p_p->ps_pid, dst->vmr_va);
> +             ret = uvm_share(&p->p_vmspace->vm_map, dst->vmr_va,
> +                 PROT_READ | PROT_WRITE, vm->vm_map, src->vmr_gpa,
> +                 src->vmr_size);
> +             if (ret) {
> +                     printf("%s: uvm_share failed (%d)\n", __func__, ret);
> +                     break;
> +             }
> +     }
> +     ret = 0;
> +out:
> +     refcnt_rele_wake(&vm->vm_refcnt);
> +     return (ret);
> +}
> blob - d2355d42b44b51044901de9d0adc0239586f37b8
> blob + 7b3b0d77ad550165b6e53f4de66723366d689b23
> --- sys/dev/vmm/vmm.h
> +++ sys/dev/vmm/vmm.h
> @@ -76,6 +76,13 @@ struct vm_resetcpu_params {
>       struct vcpu_reg_state   vrp_init_state;
>  };
>
> +struct vm_sharemem_params {
> +     /* Input parameters to VMM_IOC_SHAREMEM */
> +     uint32_t                vsp_vm_id;
> +     size_t                  vsp_nmemranges;
> +     struct vm_mem_range     vsp_memranges[VMM_MAX_MEM_RANGES];
> +};
> +
>  /* IOCTL definitions */
>  #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */
>  #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */
> @@ -88,8 +95,8 @@ struct vm_resetcpu_params {
>  #define VMM_IOC_READVMPARAMS _IOWR('V', 9, struct vm_rwvmparams_params)
>  /* Set VM params */
>  #define VMM_IOC_WRITEVMPARAMS _IOW('V', 10, struct vm_rwvmparams_params)
> +#define VMM_IOC_SHAREMEM _IOW('V', 11, struct vm_sharemem_params)
>
> -
>  #ifdef _KERNEL
>
>  /* #define VMM_DEBUG */
> @@ -194,6 +201,7 @@ int vcpu_must_stop(struct vcpu *);
>  int vm_terminate(struct vm_terminate_params *);
>  int vm_resetcpu(struct vm_resetcpu_params *);
>  int vcpu_must_stop(struct vcpu *);
> +int vm_share_mem(struct vm_sharemem_params *, struct proc *);
>
>  #endif /* _KERNEL */
>  #endif /* DEV_VMM_H */
> blob - 9373a135aa87755f0e34b821af4ab8c0f4970421
> blob + dd0efc2fd71bb0ac91d2e389a30e779d8d6c6c0d
> --- usr.sbin/vmd/vioblk.c
> +++ usr.sbin/vmd/vioblk.c
> @@ -58,7 +58,7 @@ vioblk_main(int fd)
>  }
>
>  __dead void
> -vioblk_main(int fd)
> +vioblk_main(int fd, int fd_vmm)
>  {
>       struct virtio_dev        dev;
>       struct vioblk_dev       *vioblk;
> @@ -71,8 +71,11 @@ vioblk_main(int fd)
>
>       log_procinit("vioblk");
>
> -     /* stdio - needed for read/write to disk fds and channels to the vm. */
> -     if (pledge("stdio", NULL) == -1)
> +     /*
> +      * stdio - needed for read/write to disk fds and channels to the vm.
> +      * vmm + proc - needed to create shared vm mappings.
> +      */
> +     if (pledge("stdio vmm proc", NULL) == -1)
>               fatal("pledge");
>
>       /* Receive our virtio_dev, mostly preconfigured. */
> @@ -92,8 +95,9 @@ vioblk_main(int fd)
>       vioblk = &dev.vioblk;
>
>       log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, "
> -         "async fd = %d, sz = %lld maxfer = %d", __func__, vioblk->ndisk_fd,
> -         dev.sync_fd, dev.async_fd, vioblk->sz, vioblk->max_xfer);
> +         "async fd = %d, sz = %lld maxfer = %d, vmm fd = %d", __func__,
> +         vioblk->ndisk_fd, dev.sync_fd, dev.async_fd, vioblk->sz,
> +         vioblk->max_xfer, fd_vmm);
>
>       /* Receive our vm information from the vm process. */
>       memset(&vm, 0, sizeof(vm));
> @@ -108,12 +112,19 @@ vioblk_main(int fd)
>       setproctitle("%s/vioblk[%d]", vcp->vcp_name, vioblk->idx);
>
>       /* Now that we have our vm information, we can remap memory. */
> -     ret = remap_guest_mem(&vm);
> +     ret = remap_guest_mem(&vm, fd_vmm);
>       if (ret) {
>               log_warnx("failed to remap guest memory");
>               goto fail;
>       }
>
> +     /*
> +      * We no longer need /dev/vmm access.
> +      */
> +     close_fd(fd_vmm);
> +     if (pledge("stdio", NULL) == -1)
> +             fatal("pledge2");
> +
>       /* Initialize the virtio block abstractions. */
>       type = vm.vm_params.vmc_disktypes[vioblk->idx];
>       switch (type) {
> blob - 6ce905fdccfa7befb49353c23628227fdc74c486
> blob + d75dc06b9bc0133ace3e74de85abba3b62f539dc
> --- usr.sbin/vmd/vionet.c
> +++ usr.sbin/vmd/vionet.c
> @@ -61,7 +61,7 @@ vionet_main(int fd)
>  static void handle_sync_io(int, short, void *);
>
>  __dead void
> -vionet_main(int fd)
> +vionet_main(int fd, int fd_vmm)
>  {
>       struct virtio_dev        dev;
>       struct vionet_dev       *vionet = NULL;
> @@ -73,8 +73,11 @@ vionet_main(int fd)
>
>       log_procinit("vionet");
>
> -     /* stdio - needed for read/write to tap fd and channels to the vm. */
> -     if (pledge("stdio", NULL) == -1)
> +     /*
> +      * stdio - needed for read/write to disk fds and channels to the vm.
> +      * vmm + proc - needed to create shared vm mappings.
> +      */
> +     if (pledge("stdio vmm proc", NULL) == -1)
>               fatal("pledge");
>
>       /* Receive our vionet_dev, mostly preconfigured. */
> @@ -92,8 +95,9 @@ vionet_main(int fd)
>       dev.sync_fd = fd;
>       vionet = &dev.vionet;
>
> -     log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d",
> -         __func__, vionet->data_fd, dev.sync_fd, dev.async_fd);
> +     log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d"
> +         ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
> +         dev.async_fd, fd_vmm);
>
>       /* Receive our vm information from the vm process. */
>       memset(&vm, 0, sizeof(vm));
> @@ -108,10 +112,19 @@ vionet_main(int fd)
>       setproctitle("%s/vionet[%d]", vcp->vcp_name, vionet->idx);
>
>       /* Now that we have our vm information, we can remap memory. */
> -     ret = remap_guest_mem(&vm);
> -     if (ret)
> +     ret = remap_guest_mem(&vm, fd_vmm);
> +     if (ret) {
> +             fatal("%s: failed to remap", __func__);
>               goto fail;
> +     }
>
> +     /*
> +      * We no longer need /dev/vmm access.
> +      */
> +     close_fd(fd_vmm);
> +     if (pledge("stdio", NULL) == -1)
> +             fatal("pledge2");
> +
>       /* If we're restoring hardware, re-initialize virtqueue hva's. */
>       if (vm.vm_state & VM_STATE_RECEIVED) {
>               struct virtio_vq_info *vq_info;
> blob - 92e77b8f83431c2ff52824e35149477909612653
> blob + e3f6d1371ab7795df6e9d4b370ffd1a7c5afbde6
> --- usr.sbin/vmd/virtio.c
> +++ usr.sbin/vmd/virtio.c
> @@ -1297,7 +1297,7 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev
>  static int
>  virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
>  {
> -     char *nargv[8], num[32], t[2];
> +     char *nargv[10], num[32], vmm_fd[32], t[2];
>       pid_t dev_pid;
>       int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0;
>       size_t i, j, data_fds_sz, sz = 0;
> @@ -1483,6 +1483,8 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev
>               memset(&nargv, 0, sizeof(nargv));
>               memset(num, 0, sizeof(num));
>               snprintf(num, sizeof(num), "%d", sync_fds[1]);
> +             memset(vmm_fd, 0, sizeof(vmm_fd));
> +             snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
>
>               t[0] = dev->dev_type;
>               t[1] = '\0';
> @@ -1492,13 +1494,15 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev
>               nargv[2] = num;
>               nargv[3] = "-t";
>               nargv[4] = t;
> -             nargv[5] = "-n";
> +             nargv[5] = "-i";
> +             nargv[6] = vmm_fd;
> +             nargv[7] = "-n";
>
>               if (env->vmd_verbose) {
> -                     nargv[6] = "-v";
> -                     nargv[7] = NULL;
> +                     nargv[8] = "-v";
> +                     nargv[9] = NULL;
>               } else
> -                     nargv[6] = NULL;
> +                     nargv[8] = NULL;
>
>               /* Control resumes in vmd.c:main(). */
>               execvp(nargv[0], nargv);
> blob - d42abb5a834cb5f6d9777d8b23f92a6a1c5930f2
> blob + 48ec88c37db24290e069bf0ab9249df764b2c424
> --- usr.sbin/vmd/vm.c
> +++ usr.sbin/vmd/vm.c
> @@ -218,9 +218,10 @@ static const struct vcpu_reg_state vcpu_init_flat16 =
>   * Primary entrypoint for launching a vm. Does not return.
>   *
>   * fd: file descriptor for communicating with vmm process.
> + * fd_vmm: file descriptor for communicating with vmm(4) device
>   */
>  void
> -vm_main(int fd)
> +vm_main(int fd, int vmm_fd)
>  {
>       struct vm_create_params *vcp = NULL;
>       struct vmd_vm            vm;
> @@ -241,9 +242,8 @@ vm_main(int fd)
>        * vmm - for the vmm ioctls and operations.
>        * proc exec - fork/exec for launching devices.
>        * recvfd - for vm send/recv and sending fd to devices.
> -      * tmppath/rpath - for shm_mkstemp, ftruncate, unlink
>        */
> -     if (pledge("stdio vmm proc exec recvfd tmppath rpath", NULL) == -1)
> +     if (pledge("stdio vmm proc exec recvfd", NULL) == -1)
>               fatal("pledge");
>
>       /* Receive our vm configuration. */
> @@ -254,13 +254,6 @@ vm_main(int fd)
>               _exit(EIO);
>       }
>
> -     /* Receive the /dev/vmm fd number. */
> -     sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd));
> -     if (sz != sizeof(env->vmd_fd)) {
> -             log_warnx("failed to receive /dev/vmm fd");
> -             _exit(EIO);
> -     }
> -
>       /* Update process with the vm name. */
>       vcp = &vm.vm_params.vmc_params;
>       setproctitle("%s", vcp->vcp_name);
> @@ -1099,63 +1092,34 @@ alloc_guest_mem(struct vmd_vm *vm)
>  alloc_guest_mem(struct vmd_vm *vm)
>  {
>       void *p;
> -     char *tmp;
> -     int fd, ret = 0;
> +     int ret = 0;
>       size_t i, j;
>       struct vm_create_params *vcp = &vm->vm_params.vmc_params;
>       struct vm_mem_range *vmr;
>
> -     tmp = calloc(32, sizeof(char));
> -     if (tmp == NULL) {
> -             ret = errno;
> -             log_warn("%s: calloc", __func__);
> -             return (ret);
> -     }
> -     strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32);
> -
> -     vm->vm_nmemfds = vcp->vcp_nmemranges;
> -
>       for (i = 0; i < vcp->vcp_nmemranges; i++) {
>               vmr = &vcp->vcp_memranges[i];
>
> -             fd = shm_mkstemp(tmp);
> -             if (fd < 0) {
> -                     ret = errno;
> -                     log_warn("%s: shm_mkstemp", __func__);
> -                     return (ret);
> -             }
> -             if (ftruncate(fd, vmr->vmr_size) == -1) {
> -                     ret = errno;
> -                     log_warn("%s: ftruncate", __func__);
> -                     goto out;
> -             }
> -             if (fcntl(fd, F_SETFD, 0) == -1) {
> -                     ret = errno;
> -                     log_warn("%s: fcntl", __func__);
> -                     goto out;
> -             }
> -             if (shm_unlink(tmp) == -1) {
> -                     ret = errno;
> -                     log_warn("%s: shm_unlink", __func__);
> -                     goto out;
> -             }
> -             strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32);
> -
> +             /*
> +              * We only need R/W as userland. vmm(4) will use R/W/X in its
> +              * mapping.
> +              *
> +              * We must use MAP_SHARED so emulated devices will be able
> +              * to generate shared mappings.
> +              */
>               p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
> -                 MAP_SHARED | MAP_CONCEAL, fd, 0);
> +                 MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
>               if (p == MAP_FAILED) {
>                       ret = errno;
>                       for (j = 0; j < i; j++) {
>                               vmr = &vcp->vcp_memranges[j];
>                               munmap((void *)vmr->vmr_va, vmr->vmr_size);
>                       }
> -                     goto out;
> +                     return (ret);
>               }
> -             vm->vm_memfds[i] = fd;
>               vmr->vmr_va = (vaddr_t)p;
>       }
> -out:
> -     free(tmp);
> +
>       return (ret);
>  }
>
> @@ -2552,10 +2516,11 @@ remap_guest_mem(struct vmd_vm *vm)
>   * Returns 0 on success, non-zero in event of failure.
>   */
>  int
> -remap_guest_mem(struct vmd_vm *vm)
> +remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
>  {
>       struct vm_create_params *vcp;
>       struct vm_mem_range     *vmr;
> +     struct vm_sharemem_params vsp;
>       size_t                   i, j;
>       void                    *p = NULL;
>       int                      ret;
> @@ -2566,23 +2531,32 @@ remap_guest_mem(struct vmd_vm *vm)
>       vcp = &vm->vm_params.vmc_params;
>
>       /*
> -      * We've execve'd, so we need to re-map the guest VM memory. Iterate
> -      * over all possible vm_mem_range entries so we can initialize all
> -      * file descriptors to a value.
> +      * Initialize our VM shared memory request using our original
> +      * creation parameters. We'll overwrite the va's after mmap(2).
>        */
> +     memset(&vsp, 0, sizeof(vsp));
> +     vsp.vsp_nmemranges = vcp->vcp_nmemranges;
> +     vsp.vsp_vm_id = vcp->vcp_id;
> +     memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
> +         sizeof(vsp.vsp_memranges));
> +
> +     /*
> +      * Use mmap(2) to identify virtual address space for our mappings.
> +      */
>       for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
> -             if (i < vcp->vcp_nmemranges) {
> -                     vmr = &vcp->vcp_memranges[i];
> -                     /* Skip ranges we know we don't need right now. */
> +             if (i < vsp.vsp_nmemranges) {
> +                     vmr = &vsp.vsp_memranges[i];
> +
> +                     /* Ignore any MMIO ranges. */
>                       if (vmr->vmr_type == VM_MEM_MMIO) {
> -                             log_debug("%s: skipping range i=%ld, type=%d",
> -                                 __func__, i, vmr->vmr_type);
> -                             vm->vm_memfds[i] = -1;
> +                             vmr->vmr_va = 0;
> +                             vcp->vcp_memranges[i].vmr_va = 0;
>                               continue;
>                       }
> -                     /* Re-mmap the memrange. */
> -                     p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
> -                         MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0);
> +
> +                     /* Make initial mappings for the memrange. */
> +                     p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
> +                         0);
>                       if (p == MAP_FAILED) {
>                               ret = errno;
>                               log_warn("%s: mmap", __func__);
> @@ -2594,11 +2568,29 @@ remap_guest_mem(struct vmd_vm *vm)
>                               return (ret);
>                       }
>                       vmr->vmr_va = (vaddr_t)p;
> -             } else {
> -                     /* Initialize with an invalid fd. */
> -                     vm->vm_memfds[i] = -1;
> +                     vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
>               }
>       }
>
> +     /*
> +      * munmap(2) now that we have va's and ranges that don't overlap. vmm
> +      * will use the va's and sizes to recreate the mappings for us.
> +      */
> +     for (i = 0; i < vsp.vsp_nmemranges; i++) {
> +             vmr = &vsp.vsp_memranges[i];
> +             if (vmr->vmr_type == VM_MEM_MMIO)
> +                     continue;
> +             if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1)
> +                     fatal("%s: munmap", __func__);
> +     }
> +
> +     /*
> +      * Ask vmm to enter the shared mappings for us. They'll point
> +      * to the same host physical memory, but will have a randomized
> +      * virtual address for the calling process.
> +      */
> +     if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
> +             return (errno);
> +
>       return (0);
>  }
> blob - c06fe974877c17710a81cd69bed541f908e76ef4
> blob + 0d0a66533b05a80aafba31d415e97c0fb3a6be88
> --- usr.sbin/vmd/vmd.c
> +++ usr.sbin/vmd/vmd.c
> @@ -774,7 +774,8 @@ main(int argc, char **argv)
>       struct privsep          *ps;
>       int                      ch;
>       enum privsep_procid      proc_id = PROC_PARENT;
> -     int                      proc_instance = 0, vm_launch = 0, vm_fd = -1;
> +     int                      proc_instance = 0, vm_launch = 0;
> +     int                      vmm_fd = -1, vm_fd = -1;
>       const char              *errp, *title = NULL;
>       int                      argc0 = argc;
>       char                     dev_type = '\0';
> @@ -784,7 +785,7 @@ main(int argc, char **argv)
>       if ((env = calloc(1, sizeof(*env))) == NULL)
>               fatal("calloc: env");
>
> -     while ((ch = getopt(argc, argv, "D:P:I:V:X:df:nt:v")) != -1) {
> +     while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:v")) != -1) {
>               switch (ch) {
>               case 'D':
>                       if (cmdline_symset(optarg) < 0)
> @@ -838,6 +839,11 @@ main(int argc, char **argv)
>                       default: fatalx("invalid device type");
>                       }
>                       break;
> +             case 'i':
> +                     vmm_fd = strtonum(optarg, 0, 128, &errp);
> +                     if (errp)
> +                             fatalx("invalid vmm fd");
> +                     break;
>               default:
>                       usage();
>               }
> @@ -866,7 +872,7 @@ main(int argc, char **argv)
>
>       ps = &env->vmd_ps;
>       ps->ps_env = env;
> -     env->vmd_fd = -1;
> +     env->vmd_fd = vmm_fd;
>
>       if (config_init(env) == -1)
>               fatal("failed to initialize configuration");
> @@ -882,14 +888,14 @@ main(int argc, char **argv)
>        * If we're launching a new vm or its device, we short out here.
>        */
>       if (vm_launch == VMD_LAUNCH_VM) {
> -             vm_main(vm_fd);
> +             vm_main(vm_fd, vmm_fd);
>               /* NOTREACHED */
>       } else if (vm_launch == VMD_LAUNCH_DEV) {
>               if (dev_type == VMD_DEVTYPE_NET) {
> -                     vionet_main(vm_fd);
> +                     vionet_main(vm_fd, vmm_fd);
>                       /* NOTREACHED */
>               } else if (dev_type == VMD_DEVTYPE_DISK) {
> -                     vioblk_main(vm_fd);
> +                     vioblk_main(vm_fd, vmm_fd);
>                       /* NOTREACHED */
>               }
>               fatalx("unsupported device type '%c'", dev_type);
> blob - 68de0544706a5864aec480590191b33904864053
> blob + 61b0cff0c62c9ed752a2128bea8b12bc34f918d7
> --- usr.sbin/vmd/vmd.h
> +++ usr.sbin/vmd/vmd.h
> @@ -329,9 +329,6 @@ struct vmd_vm {
>       struct timeval           vm_start_tv;
>       int                      vm_start_limit;
>
> -     int                      vm_memfds[VMM_MAX_MEM_RANGES];
> -     size_t                   vm_nmemfds;
> -
>       TAILQ_ENTRY(vmd_vm)      vm_entry;
>  };
>  TAILQ_HEAD(vmlist, vmd_vm);
> @@ -488,7 +485,7 @@ void       vm_main(int);
>  int   vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *));
>
>  /* vm.c */
> -void  vm_main(int);
> +void  vm_main(int, int);
>  void  mutex_lock(pthread_mutex_t *);
>  void  mutex_unlock(pthread_mutex_t *);
>  int   read_mem(paddr_t, void *buf, size_t);
> @@ -499,7 +496,7 @@ int        remap_guest_mem(struct vmd_vm *);
>  enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *);
>  int   write_mem(paddr_t, const void *buf, size_t);
>  void*         hvaddr_mem(paddr_t, size_t);
> -int   remap_guest_mem(struct vmd_vm *);
> +int   remap_guest_mem(struct vmd_vm *, int);
>
>  /* config.c */
>  int   config_init(struct vmd *);
> @@ -527,9 +524,9 @@ __dead void vionet_main(int);
>  int   virtio_get_base(int, char *, size_t, int, const char *);
>
>  /* vionet.c */
> -__dead void vionet_main(int);
> +__dead void vionet_main(int, int);
>
>  /* vioblk.c */
> -__dead void vioblk_main(int);
> +__dead void vioblk_main(int, int);
>
>  #endif /* VMD_H */
> blob - 35119673dc31b82aec55e6dd8ef12eff3ef2beef
> blob + 7f6fe9040ad8b0c6774255a7d02f96322ea5e421
> --- usr.sbin/vmd/vmm.c
> +++ usr.sbin/vmd/vmm.c
> @@ -627,7 +627,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
>  {
>       struct vm_create_params *vcp;
>       struct vmd_vm           *vm;
> -     char                    *nargv[6], num[32];
> +     char                    *nargv[8], num[32], vmm_fd[32];
>       int                      fd, ret = EINVAL;
>       int                      fds[2];
>       pid_t                    vm_pid;
> @@ -701,16 +701,6 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
>               if (ret == EIO)
>                       goto err;
>
> -             /* Send the fd number for /dev/vmm. */
> -             sz = atomicio(vwrite, fds[0], &env->vmd_fd,
> -                 sizeof(env->vmd_fd));
> -             if (sz != sizeof(env->vmd_fd)) {
> -                     log_warnx("%s: failed to send /dev/vmm fd for vm '%s'",
> -                         __func__, vcp->vcp_name);
> -                     ret = EIO;
> -                     goto err;
> -             }
> -
>               /* Read back the kernel-generated vm id from the child */
>               sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id));
>               if (sz != sizeof(vcp->vcp_id)) {
> @@ -773,17 +763,21 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
>               memset(&nargv, 0, sizeof(nargv));
>               memset(num, 0, sizeof(num));
>               snprintf(num, sizeof(num), "%d", fds[1]);
> +             memset(vmm_fd, 0, sizeof(vmm_fd));
> +             snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);
>
>               nargv[0] = env->argv0;
>               nargv[1] = "-V";
>               nargv[2] = num;
>               nargv[3] = "-n";
> +             nargv[4] = "-i";
> +             nargv[5] = vmm_fd;
>
>               if (env->vmd_verbose) {
> -                     nargv[4] = "-v";
> -                     nargv[5] = NULL;
> +                     nargv[6] = "-v";
> +                     nargv[7] = NULL;
>               } else
> -                     nargv[4] = NULL;
> +                     nargv[6] = NULL;
>
>               /* Control resumes in vmd main(). */
>               execvp(nargv[0], nargv);

Reply via email to