vmm/vmd: use anon shared mappings for devices

Dave Voutila Tue, 09 May 2023 05:12:24 -0700

tech@,

The diff below adds a new ioctl for vmm(4) that allows an emulated
device process to request that vmm(4) enter a shared mapping into its
vmspace, so it can access guest memory without using a shared mapping
backed by a named file.


Similar to the vm creation ioctl (VMM_IOC_CREATE), the caller requires
the "vmm" and "proc" pledge(2) promises. This allows the emulated
devices to do this setup early and drop these promises back down to just
"stdio" before any device emulation occurs.

Feel free to skip to the diff (the regress change shows how it works in
a simplified case) or continue reading for reasoning behind this
change. I share this primarily for testers and feedback from other devs
while mlarkin@ reviews.

To test:

1. apply diff
2. build and install new kernel
3. copy or symlink new vmm.h into /usr/include/dev/vmm/
4. build and reinstall vmd (no changes for vmctl needed)

You should see no change during vm usage; however, you should no longer
see /tmp consumption change while unmounting things like NFS mounts or
usb disks. Read on for details.

...

vmd(8) began emulating virtio network and block devices in subprocesses
with a commit I made at the recent hackathon in Morocco. It relies on
creating shared memory mappings using shm_mkstemp(3) and passing file
descriptors to the fork/exec'd child processes.

I've since received reports that using named mappings for shared memory
is having 2 negative impacts:

1. increased overhead during vm teardown, often making systems
   unresponsive (this is my conclusion based on only minimal evidence)

2. unmounting any device on the host while a vm is running causes some
   guest memory to be flushed to disk (the backing file is already
   unlinked, so not visible to other processes).

(2) can cause /tmp to fill up or introduce failure conditions I'm not
sure we can recover from in vmd. It also has implications for other
services on the host.

I don't own a fireproof suit that fits...so I'm not about to wade into
the VFS & UVM layers to figure out if (1) or (2) can be mitigated or
fixed on their own.

One idea was to implement what FreeBSD borrowed from Linux in their
forever quest to become LinuxBSD: memfd_create(2) [1].

I took one look and decided this was not the time for me to be trying to
land a new syscall primarily for vmd (and some ports) and went another
route.

[1] https://man7.org/linux/man-pages/man2/memfd_create.2.html

-dv


diff refs/heads/master refs/heads/vmm-mapshare
commit - cec1ace2d4d21c85f4c8bacc2dd971721bf6b694
commit + 8f533c371094c044b0127d468be5feaaf775811b
blob - f221b58f75c4eb01a3a04ae45c7cdb066b11361a
blob + 0e6f5ff858c51bd9707c657b154b0df1f8944c3b
--- regress/sys/arch/amd64/vmm/vcpu.c
+++ regress/sys/arch/amd64/vmm/vcpu.c
@@ -83,6 +83,7 @@ main(int argc, char **argv)
        struct vm_resetcpu_params        vresetp;
        struct vm_run_params             vrunp;
        struct vm_terminate_params       vtp;
+       struct vm_sharemem_params        vsp;

        struct vm_mem_range             *vmr;
        int                              fd, ret = 1;
@@ -127,8 +128,9 @@ main(int argc, char **argv)
                        ((uint8_t*)p)[j + 1] = PCKBC_AUX;
                }
                vmr->vmr_va = (vaddr_t)p;
-               printf("mapped region %zu: { gpa: 0x%08lx, size: %lu }\n",
-                   i, vmr->vmr_gpa, vmr->vmr_size);
+               printf("created mapped region %zu: { gpa: 0x%08lx, size: %lu,"
+                   " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size,
+                   vmr->vmr_va);
        }

        if (ioctl(fd, VMM_IOC_CREATE, &vcp) == -1)
@@ -136,8 +138,55 @@ main(int argc, char **argv)
        printf("created vm %d named \"%s\"\n", vcp.vcp_id, vcp.vcp_name);

        /*
-        * 2. Check that our VM exists.
+        * 2. Check we can create shared memory mappings.
         */
+       memset(&vsp, 0, sizeof(vsp));
+       vsp.vsp_nmemranges = vcp.vcp_nmemranges;
+       memcpy(&vsp.vsp_memranges, &vcp.vcp_memranges,
+           sizeof(vsp.vsp_memranges));
+       vsp.vsp_vm_id = vcp.vcp_id;
+
+       /* Find some new va ranges... */
+       for (i = 0; i < vsp.vsp_nmemranges; i++) {
+               vmr = &vsp.vsp_memranges[i];
+               p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
+                   MAP_PRIVATE | MAP_ANON, -1, 0);
+               if (p == MAP_FAILED)
+                       err(1, "mmap");
+               vmr->vmr_va = (vaddr_t)p;
+       }
+
+       /* Release our mappings so vmm can replace them. */
+       for (i = 0; i < vsp.vsp_nmemranges; i++) {
+               vmr = &vsp.vsp_memranges[i];
+               munmap((void*)vmr->vmr_va, vmr->vmr_size);
+       }
+
+       /* Perform the shared mapping. */
+       if (ioctl(fd, VMM_IOC_SHAREMEM, &vsp) == -1)
+               err(1, "VMM_IOC_SHAREMEM");
+       printf("created shared memory mappings\n");
+
+       /* We should see our reset vector instructions in the new mappings. */
+       for (i = 0; i < vsp.vsp_nmemranges; i++) {
+               vmr = &vsp.vsp_memranges[i];
+               p = (void*)vmr->vmr_va;
+
+               for (j = 0; j < vmr->vmr_size; j += 2) {
+                       if (((uint8_t*)p)[j + 0] != 0xE4)
+                               errx(1, "bad byte");
+                       if (((uint8_t*)p)[j + 1] != PCKBC_AUX)
+                               errx(1, "bad byte");
+               }
+               printf("checked shared region %zu: { gpa: 0x%08lx, size: %lu,"
+                   " hva: 0x%lx }\n", i, vmr->vmr_gpa, vmr->vmr_size,
+                   vmr->vmr_va);
+       }
+       printf("validated shared memory mappings\n");
+
+       /*
+        * 3. Check that our VM exists.
+        */
        memset(&vip, 0, sizeof(vip));
        vip.vip_size = 0;
        info = NULL;
@@ -189,7 +238,7 @@ main(int argc, char **argv)
        ours = NULL;

        /*
-        * 3. Reset our VCPU and initialize register state.
+        * 4. Reset our VCPU and initialize register state.
         */
        memset(&vresetp, 0, sizeof(vresetp));
        vresetp.vrp_vm_id = vcp.vcp_id;
@@ -205,7 +254,7 @@ main(int argc, char **argv)
            vresetp.vrp_vm_id);

        /*
-        * 4. Run the vcpu, expecting an immediate exit for IO assist.
+        * 5. Run the vcpu, expecting an immediate exit for IO assist.
         */
        exit = malloc(sizeof(*exit));
        if (exit == NULL) {
@@ -258,7 +307,7 @@ out:

 out:
        /*
-        * 5. Terminate our VM and clean up.
+        * 6. Terminate our VM and clean up.
         */
        memset(&vtp, 0, sizeof(vtp));
        vtp.vtp_vm_id = vcp.vcp_id;
@@ -277,13 +326,23 @@ out:
                vmr = &vcp.vcp_memranges[i];
                if (vmr->vmr_va) {
                        if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) {
-                               warn("failed to unmap region %zu at 0x%08lx",
-                                   i, vmr->vmr_va);
+                               warn("failed to unmap orginal region %zu @ hva "
+                                   "0x%lx", i, vmr->vmr_va);
                                ret = 1;
                        } else
-                               printf("unmapped region %zu @ gpa 0x%08lx\n",
-                                   i, vmr->vmr_gpa);
+                               printf("unmapped origin region %zu @ hva "
+                                   "0x%lx\n", i, vmr->vmr_va);
                }
+               vmr = &vsp.vsp_memranges[i];
+               if (vmr->vmr_va) {
+                       if (munmap((void *)vmr->vmr_va, vmr->vmr_size)) {
+                               warn("failed to unmap shared region %zu @ hva "
+                                   "0x%lx", i, vmr->vmr_va);
+                               ret = 1;
+                       } else
+                               printf("unmapped shared region %zu @ hva "
+                                   "0x%lx\n", i, vmr->vmr_va);
+               }
        }

        return (ret);
blob - d46b3431081b6d2e7e1adab884ec21b0aaa9761a
blob + 3daee7dad431ed200cbd734bc0f8b35bebd54216
--- sys/dev/vmm/vmm.c
+++ sys/dev/vmm/vmm.c
@@ -262,6 +262,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t data, int flag
        case VMM_IOC_WRITEVMPARAMS:
                ret = vm_rwvmparams((struct vm_rwvmparams_params *)data, 1);
                break;
+       case VMM_IOC_SHAREMEM:
+               ret = vm_share_mem((struct vm_sharemem_params *)data, p);
+               break;
        default:
                ret = vmmioctl_machdep(dev, cmd, data, flag, p);
                break;
@@ -286,6 +289,7 @@ pledge_ioctl_vmm(struct proc *p, long com)
        switch (com) {
        case VMM_IOC_CREATE:
        case VMM_IOC_INFO:
+       case VMM_IOC_SHAREMEM:
                /* The "parent" process in vmd forks and manages VMs */
                if (p->p_p->ps_pledge & PLEDGE_PROC)
                        return (0);
@@ -780,3 +784,82 @@ vcpu_must_stop(struct vcpu *vcpu)
                return (1);
        return (0);
 }
+
+/*
+ * vm_share_mem
+ *
+ * Share a uvm mapping for the vm guest memory ranges into the calling process.
+ *
+ * Return values:
+ *  0: if successful
+ *  ENOENT: if the vm cannot be found by vm_find
+ *  EPERM: if the vm cannot be accessed by the current process
+ *  EINVAL: if the provided memory ranges fail checks
+ *  ENOMEM: if uvm_share fails to find available memory in the destination map
+ */
+int
+vm_share_mem(struct vm_sharemem_params *vsp, struct proc *p)
+{
+       int ret;
+       size_t i, n;
+       struct vm *vm;
+       struct vm_mem_range *src, *dst;
+
+       ret = vm_find(vsp->vsp_vm_id, &vm);
+       if (ret)
+               return (ret);
+
+       /* Check the range count; any failed check below returns EINVAL. */
+       ret = EINVAL;
+       if (vm->vm_nmemranges != vsp->vsp_nmemranges)
+               goto out;
+       n = vm->vm_nmemranges;
+
+       /* Check their types, sizes, and gpa's (implying page alignment). */
+       for (i = 0; i < n; i++) {
+               src = &vm->vm_memranges[i];
+               dst = &vsp->vsp_memranges[i];
+
+               /*
+                * The vm memranges were already checked during creation, so
+                * compare to them to confirm validity of mapping request.
+                */
+               if (src->vmr_type != dst->vmr_type ||
+                   src->vmr_gpa != dst->vmr_gpa ||
+                   src->vmr_size != dst->vmr_size)
+                       goto out;
+
+               /* Check our intended destination is page-aligned. */
+               if (dst->vmr_va & PAGE_MASK)
+                       goto out;
+       }
+
+       /*
+        * Share each range individually with the calling process. We do
+        * not need PROT_EXEC as the emulated devices do not need to execute
+        * instructions from guest memory. Stop at the first uvm_share
+        * failure and return its error to the caller.
+        */
+       ret = 0;
+       for (i = 0; i < n; i++) {
+               src = &vm->vm_memranges[i];
+               dst = &vsp->vsp_memranges[i];
+
+               /* Skip MMIO range. */
+               if (src->vmr_type == VM_MEM_MMIO)
+                       continue;
+
+               DPRINTF("sharing gpa=0x%lx for pid %d @ va=0x%lx\n",
+                   src->vmr_gpa, p->p_p->ps_pid, dst->vmr_va);
+               ret = uvm_share(&p->p_vmspace->vm_map, dst->vmr_va,
+                   PROT_READ | PROT_WRITE, vm->vm_map, src->vmr_gpa,
+                   src->vmr_size);
+               if (ret) {
+                       printf("%s: uvm_share failed (%d)\n", __func__, ret);
+                       break;
+               }
+       }
+out:
+       refcnt_rele_wake(&vm->vm_refcnt);
+       return (ret);
+}
blob - d2355d42b44b51044901de9d0adc0239586f37b8
blob + 7b3b0d77ad550165b6e53f4de66723366d689b23
--- sys/dev/vmm/vmm.h
+++ sys/dev/vmm/vmm.h
@@ -76,6 +76,13 @@ struct vm_resetcpu_params {
        struct vcpu_reg_state   vrp_init_state;
 };

+struct vm_sharemem_params {
+       /* Input parameters to VMM_IOC_SHAREMEM (vmr_va is the target va) */
+       uint32_t                vsp_vm_id;      /* vm to share memory from */
+       size_t                  vsp_nmemranges; /* must equal the vm's count */
+       struct vm_mem_range     vsp_memranges[VMM_MAX_MEM_RANGES];
+};
+
 /* IOCTL definitions */
 #define VMM_IOC_CREATE _IOWR('V', 1, struct vm_create_params) /* Create VM */
 #define VMM_IOC_RUN _IOWR('V', 2, struct vm_run_params) /* Run VCPU */
@@ -88,8 +95,8 @@ struct vm_resetcpu_params {
 #define VMM_IOC_READVMPARAMS _IOWR('V', 9, struct vm_rwvmparams_params)
 /* Set VM params */
 #define VMM_IOC_WRITEVMPARAMS _IOW('V', 10, struct vm_rwvmparams_params)
+#define VMM_IOC_SHAREMEM _IOW('V', 11, struct vm_sharemem_params)

-
 #ifdef _KERNEL

 /* #define VMM_DEBUG */
@@ -194,6 +201,7 @@ int vcpu_must_stop(struct vcpu *);
 int vm_terminate(struct vm_terminate_params *);
 int vm_resetcpu(struct vm_resetcpu_params *);
 int vcpu_must_stop(struct vcpu *);
+int vm_share_mem(struct vm_sharemem_params *, struct proc *);

 #endif /* _KERNEL */
 #endif /* DEV_VMM_H */
blob - 9373a135aa87755f0e34b821af4ab8c0f4970421
blob + dd0efc2fd71bb0ac91d2e389a30e779d8d6c6c0d
--- usr.sbin/vmd/vioblk.c
+++ usr.sbin/vmd/vioblk.c
@@ -58,7 +58,7 @@ vioblk_main(int fd)
 }

 __dead void
-vioblk_main(int fd)
+vioblk_main(int fd, int fd_vmm)
 {
        struct virtio_dev        dev;
        struct vioblk_dev       *vioblk;
@@ -71,8 +71,11 @@ vioblk_main(int fd)

        log_procinit("vioblk");

-       /* stdio - needed for read/write to disk fds and channels to the vm. */
-       if (pledge("stdio", NULL) == -1)
+       /*
+        * stdio - needed for read/write to disk fds and channels to the vm.
+        * vmm + proc - needed to create shared vm mappings.
+        */
+       if (pledge("stdio vmm proc", NULL) == -1)
                fatal("pledge");

        /* Receive our virtio_dev, mostly preconfigured. */
@@ -92,8 +95,9 @@ vioblk_main(int fd)
        vioblk = &dev.vioblk;

        log_debug("%s: got viblk dev. num disk fds = %d, sync fd = %d, "
-           "async fd = %d, sz = %lld maxfer = %d", __func__, vioblk->ndisk_fd,
-           dev.sync_fd, dev.async_fd, vioblk->sz, vioblk->max_xfer);
+           "async fd = %d, sz = %lld maxfer = %d, vmm fd = %d", __func__,
+           vioblk->ndisk_fd, dev.sync_fd, dev.async_fd, vioblk->sz,
+           vioblk->max_xfer, fd_vmm);

        /* Receive our vm information from the vm process. */
        memset(&vm, 0, sizeof(vm));
@@ -108,12 +112,19 @@ vioblk_main(int fd)
        setproctitle("%s/vioblk[%d]", vcp->vcp_name, vioblk->idx);

        /* Now that we have our vm information, we can remap memory. */
-       ret = remap_guest_mem(&vm);
+       ret = remap_guest_mem(&vm, fd_vmm);
        if (ret) {
                log_warnx("failed to remap guest memory");
                goto fail;
        }

+       /*
+        * We no longer need /dev/vmm access.
+        */
+       close_fd(fd_vmm);
+       if (pledge("stdio", NULL) == -1)
+               fatal("pledge2");
+
        /* Initialize the virtio block abstractions. */
        type = vm.vm_params.vmc_disktypes[vioblk->idx];
        switch (type) {
blob - 6ce905fdccfa7befb49353c23628227fdc74c486
blob + d75dc06b9bc0133ace3e74de85abba3b62f539dc
--- usr.sbin/vmd/vionet.c
+++ usr.sbin/vmd/vionet.c
@@ -61,7 +61,7 @@ vionet_main(int fd)
 static void handle_sync_io(int, short, void *);

 __dead void
-vionet_main(int fd)
+vionet_main(int fd, int fd_vmm)
 {
        struct virtio_dev        dev;
        struct vionet_dev       *vionet = NULL;
@@ -73,8 +73,11 @@ vionet_main(int fd)

        log_procinit("vionet");

-       /* stdio - needed for read/write to tap fd and channels to the vm. */
-       if (pledge("stdio", NULL) == -1)
+       /*
+        * stdio - needed for read/write to tap fd and channels to the vm.
+        * vmm + proc - needed to create shared vm mappings.
+        */
+       if (pledge("stdio vmm proc", NULL) == -1)
                fatal("pledge");

        /* Receive our vionet_dev, mostly preconfigured. */
@@ -92,8 +95,9 @@ vionet_main(int fd)
        dev.sync_fd = fd;
        vionet = &dev.vionet;

-       log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d",
-           __func__, vionet->data_fd, dev.sync_fd, dev.async_fd);
+       log_debug("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d"
+           ", vmm fd = %d", __func__, vionet->data_fd, dev.sync_fd,
+           dev.async_fd, fd_vmm);

        /* Receive our vm information from the vm process. */
        memset(&vm, 0, sizeof(vm));
@@ -108,10 +112,19 @@ vionet_main(int fd)
        setproctitle("%s/vionet[%d]", vcp->vcp_name, vionet->idx);

        /* Now that we have our vm information, we can remap memory. */
-       ret = remap_guest_mem(&vm);
-       if (ret)
+       ret = remap_guest_mem(&vm, fd_vmm);
+       if (ret) {
+               fatal("%s: failed to remap", __func__);
                goto fail;
+       }

+       /*
+        * We no longer need /dev/vmm access.
+        */
+       close_fd(fd_vmm);
+       if (pledge("stdio", NULL) == -1)
+               fatal("pledge2");
+
        /* If we're restoring hardware, re-initialize virtqueue hva's. */
        if (vm.vm_state & VM_STATE_RECEIVED) {
                struct virtio_vq_info *vq_info;
blob - 92e77b8f83431c2ff52824e35149477909612653
blob + e3f6d1371ab7795df6e9d4b370ffd1a7c5afbde6
--- usr.sbin/vmd/virtio.c
+++ usr.sbin/vmd/virtio.c
@@ -1297,7 +1297,7 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev
 static int
 virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev)
 {
-       char *nargv[8], num[32], t[2];
+       char *nargv[10], num[32], vmm_fd[32], t[2];
        pid_t dev_pid;
        int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0;
        size_t i, j, data_fds_sz, sz = 0;
@@ -1483,6 +1483,8 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev
                memset(&nargv, 0, sizeof(nargv));
                memset(num, 0, sizeof(num));
                snprintf(num, sizeof(num), "%d", sync_fds[1]);
+               memset(vmm_fd, 0, sizeof(vmm_fd));
+               snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);

                t[0] = dev->dev_type;
                t[1] = '\0';
@@ -1492,13 +1494,15 @@ virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev
                nargv[2] = num;
                nargv[3] = "-t";
                nargv[4] = t;
-               nargv[5] = "-n";
+               nargv[5] = "-i";
+               nargv[6] = vmm_fd;
+               nargv[7] = "-n";

                if (env->vmd_verbose) {
-                       nargv[6] = "-v";
-                       nargv[7] = NULL;
+                       nargv[8] = "-v";
+                       nargv[9] = NULL;
                } else
-                       nargv[6] = NULL;
+                       nargv[8] = NULL;

                /* Control resumes in vmd.c:main(). */
                execvp(nargv[0], nargv);
blob - d42abb5a834cb5f6d9777d8b23f92a6a1c5930f2
blob + 48ec88c37db24290e069bf0ab9249df764b2c424
--- usr.sbin/vmd/vm.c
+++ usr.sbin/vmd/vm.c
@@ -218,9 +218,10 @@ static const struct vcpu_reg_state vcpu_init_flat16 =
  * Primary entrypoint for launching a vm. Does not return.
  *
  * fd: file descriptor for communicating with vmm process.
+ * vmm_fd: file descriptor for communicating with vmm(4) device
  */
 void
-vm_main(int fd)
+vm_main(int fd, int vmm_fd)
 {
        struct vm_create_params *vcp = NULL;
        struct vmd_vm            vm;
@@ -241,9 +242,8 @@ vm_main(int fd)
         * vmm - for the vmm ioctls and operations.
         * proc exec - fork/exec for launching devices.
         * recvfd - for vm send/recv and sending fd to devices.
-        * tmppath/rpath - for shm_mkstemp, ftruncate, unlink
         */
-       if (pledge("stdio vmm proc exec recvfd tmppath rpath", NULL) == -1)
+       if (pledge("stdio vmm proc exec recvfd", NULL) == -1)
                fatal("pledge");

        /* Receive our vm configuration. */
@@ -254,13 +254,6 @@ vm_main(int fd)
                _exit(EIO);
        }

-       /* Receive the /dev/vmm fd number. */
-       sz = atomicio(read, fd, &env->vmd_fd, sizeof(env->vmd_fd));
-       if (sz != sizeof(env->vmd_fd)) {
-               log_warnx("failed to receive /dev/vmm fd");
-               _exit(EIO);
-       }
-
        /* Update process with the vm name. */
        vcp = &vm.vm_params.vmc_params;
        setproctitle("%s", vcp->vcp_name);
@@ -1099,63 +1092,34 @@ alloc_guest_mem(struct vmd_vm *vm)
 alloc_guest_mem(struct vmd_vm *vm)
 {
        void *p;
-       char *tmp;
-       int fd, ret = 0;
+       int ret = 0;
        size_t i, j;
        struct vm_create_params *vcp = &vm->vm_params.vmc_params;
        struct vm_mem_range *vmr;

-       tmp = calloc(32, sizeof(char));
-       if (tmp == NULL) {
-               ret = errno;
-               log_warn("%s: calloc", __func__);
-               return (ret);
-       }
-       strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32);
-
-       vm->vm_nmemfds = vcp->vcp_nmemranges;
-
        for (i = 0; i < vcp->vcp_nmemranges; i++) {
                vmr = &vcp->vcp_memranges[i];

-               fd = shm_mkstemp(tmp);
-               if (fd < 0) {
-                       ret = errno;
-                       log_warn("%s: shm_mkstemp", __func__);
-                       return (ret);
-               }
-               if (ftruncate(fd, vmr->vmr_size) == -1) {
-                       ret = errno;
-                       log_warn("%s: ftruncate", __func__);
-                       goto out;
-               }
-               if (fcntl(fd, F_SETFD, 0) == -1) {
-                       ret = errno;
-                       log_warn("%s: fcntl", __func__);
-                       goto out;
-               }
-               if (shm_unlink(tmp) == -1) {
-                       ret = errno;
-                       log_warn("%s: shm_unlink", __func__);
-                       goto out;
-               }
-               strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32);
-
+               /*
+                * We only need R/W as userland. vmm(4) will use R/W/X in its
+                * mapping.
+                *
+                * We must use MAP_SHARED so emulated devices will be able
+                * to generate shared mappings.
+                */
                p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
-                   MAP_SHARED | MAP_CONCEAL, fd, 0);
+                   MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
                if (p == MAP_FAILED) {
                        ret = errno;
                        for (j = 0; j < i; j++) {
                                vmr = &vcp->vcp_memranges[j];
                                munmap((void *)vmr->vmr_va, vmr->vmr_size);
                        }
-                       goto out;
+                       return (ret);
                }
-               vm->vm_memfds[i] = fd;
                vmr->vmr_va = (vaddr_t)p;
        }
-out:
-       free(tmp);
+
        return (ret);
 }

@@ -2552,10 +2516,11 @@ remap_guest_mem(struct vmd_vm *vm)
  * Returns 0 on success, non-zero in event of failure.
  */
 int
-remap_guest_mem(struct vmd_vm *vm)
+remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
 {
        struct vm_create_params *vcp;
        struct vm_mem_range     *vmr;
+       struct vm_sharemem_params vsp;
        size_t                   i, j;
        void                    *p = NULL;
        int                      ret;
@@ -2566,23 +2531,32 @@ remap_guest_mem(struct vmd_vm *vm)
        vcp = &vm->vm_params.vmc_params;

        /*
-        * We've execve'd, so we need to re-map the guest VM memory. Iterate
-        * over all possible vm_mem_range entries so we can initialize all
-        * file descriptors to a value.
+        * Initialize our VM shared memory request using our original
+        * creation parameters. We'll overwrite the va's after mmap(2).
         */
+       memset(&vsp, 0, sizeof(vsp));
+       vsp.vsp_nmemranges = vcp->vcp_nmemranges;
+       vsp.vsp_vm_id = vcp->vcp_id;
+       memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
+           sizeof(vsp.vsp_memranges));
+
+       /*
+        * Use mmap(2) to identify virtual address space for our mappings.
+        */
        for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
-               if (i < vcp->vcp_nmemranges) {
-                       vmr = &vcp->vcp_memranges[i];
-                       /* Skip ranges we know we don't need right now. */
+               if (i < vsp.vsp_nmemranges) {
+                       vmr = &vsp.vsp_memranges[i];
+
+                       /* Ignore any MMIO ranges. */
                        if (vmr->vmr_type == VM_MEM_MMIO) {
-                               log_debug("%s: skipping range i=%ld, type=%d",
-                                   __func__, i, vmr->vmr_type);
-                               vm->vm_memfds[i] = -1;
+                               vmr->vmr_va = 0;
+                               vcp->vcp_memranges[i].vmr_va = 0;
                                continue;
                        }
-                       /* Re-mmap the memrange. */
-                       p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
-                           MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0);
+
+                       /* Make initial mappings for the memrange. */
+                       p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
+                           0);
                        if (p == MAP_FAILED) {
                                ret = errno;
                                log_warn("%s: mmap", __func__);
@@ -2594,11 +2568,29 @@ remap_guest_mem(struct vmd_vm *vm)
                                return (ret);
                        }
                        vmr->vmr_va = (vaddr_t)p;
-               } else {
-                       /* Initialize with an invalid fd. */
-                       vm->vm_memfds[i] = -1;
+                       vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
                }
        }

+       /*
+        * munmap(2) now that we have va's and ranges that don't overlap. vmm
+        * will use the va's and sizes to recreate the mappings for us.
+        */
+       for (i = 0; i < vsp.vsp_nmemranges; i++) {
+               vmr = &vsp.vsp_memranges[i];
+               if (vmr->vmr_type == VM_MEM_MMIO)
+                       continue;
+               if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1)
+                       fatal("%s: munmap", __func__);
+       }
+
+       /*
+        * Ask vmm to enter the shared mappings for us. They'll point
+        * to the same host physical memory, but will have a randomized
+        * virtual address for the calling process.
+        */
+       if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
+               return (errno);
+
        return (0);
 }
blob - c06fe974877c17710a81cd69bed541f908e76ef4
blob + 0d0a66533b05a80aafba31d415e97c0fb3a6be88
--- usr.sbin/vmd/vmd.c
+++ usr.sbin/vmd/vmd.c
@@ -774,7 +774,8 @@ main(int argc, char **argv)
        struct privsep          *ps;
        int                      ch;
        enum privsep_procid      proc_id = PROC_PARENT;
-       int                      proc_instance = 0, vm_launch = 0, vm_fd = -1;
+       int                      proc_instance = 0, vm_launch = 0;
+       int                      vmm_fd = -1, vm_fd = -1;
        const char              *errp, *title = NULL;
        int                      argc0 = argc;
        char                     dev_type = '\0';
@@ -784,7 +785,7 @@ main(int argc, char **argv)
        if ((env = calloc(1, sizeof(*env))) == NULL)
                fatal("calloc: env");

-       while ((ch = getopt(argc, argv, "D:P:I:V:X:df:nt:v")) != -1) {
+       while ((ch = getopt(argc, argv, "D:P:I:V:X:df:i:nt:v")) != -1) {
                switch (ch) {
                case 'D':
                        if (cmdline_symset(optarg) < 0)
@@ -838,6 +839,11 @@ main(int argc, char **argv)
                        default: fatalx("invalid device type");
                        }
                        break;
+               case 'i':
+                       vmm_fd = strtonum(optarg, 0, 128, &errp);
+                       if (errp)
+                               fatalx("invalid vmm fd");
+                       break;
                default:
                        usage();
                }
@@ -866,7 +872,7 @@ main(int argc, char **argv)

        ps = &env->vmd_ps;
        ps->ps_env = env;
-       env->vmd_fd = -1;
+       env->vmd_fd = vmm_fd;

        if (config_init(env) == -1)
                fatal("failed to initialize configuration");
@@ -882,14 +888,14 @@ main(int argc, char **argv)
         * If we're launching a new vm or its device, we short out here.
         */
        if (vm_launch == VMD_LAUNCH_VM) {
-               vm_main(vm_fd);
+               vm_main(vm_fd, vmm_fd);
                /* NOTREACHED */
        } else if (vm_launch == VMD_LAUNCH_DEV) {
                if (dev_type == VMD_DEVTYPE_NET) {
-                       vionet_main(vm_fd);
+                       vionet_main(vm_fd, vmm_fd);
                        /* NOTREACHED */
                } else if (dev_type == VMD_DEVTYPE_DISK) {
-                       vioblk_main(vm_fd);
+                       vioblk_main(vm_fd, vmm_fd);
                        /* NOTREACHED */
                }
                fatalx("unsupported device type '%c'", dev_type);
blob - 68de0544706a5864aec480590191b33904864053
blob + 61b0cff0c62c9ed752a2128bea8b12bc34f918d7
--- usr.sbin/vmd/vmd.h
+++ usr.sbin/vmd/vmd.h
@@ -329,9 +329,6 @@ struct vmd_vm {
        struct timeval           vm_start_tv;
        int                      vm_start_limit;

-       int                      vm_memfds[VMM_MAX_MEM_RANGES];
-       size_t                   vm_nmemfds;
-
        TAILQ_ENTRY(vmd_vm)      vm_entry;
 };
 TAILQ_HEAD(vmlist, vmd_vm);
@@ -488,7 +485,7 @@ void         vm_main(int);
 int     vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *));

 /* vm.c */
-void    vm_main(int);
+void    vm_main(int, int);
 void    mutex_lock(pthread_mutex_t *);
 void    mutex_unlock(pthread_mutex_t *);
 int     read_mem(paddr_t, void *buf, size_t);
@@ -499,7 +496,7 @@ int  remap_guest_mem(struct vmd_vm *);
 enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *);
 int     write_mem(paddr_t, const void *buf, size_t);
 void*   hvaddr_mem(paddr_t, size_t);
-int     remap_guest_mem(struct vmd_vm *);
+int     remap_guest_mem(struct vmd_vm *, int);

 /* config.c */
 int     config_init(struct vmd *);
@@ -527,9 +524,9 @@ __dead void vionet_main(int);
 int     virtio_get_base(int, char *, size_t, int, const char *);

 /* vionet.c */
-__dead void vionet_main(int);
+__dead void vionet_main(int, int);

 /* vioblk.c */
-__dead void vioblk_main(int);
+__dead void vioblk_main(int, int);

 #endif /* VMD_H */
blob - 35119673dc31b82aec55e6dd8ef12eff3ef2beef
blob + 7f6fe9040ad8b0c6774255a7d02f96322ea5e421
--- usr.sbin/vmd/vmm.c
+++ usr.sbin/vmd/vmm.c
@@ -627,7 +627,7 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
 {
        struct vm_create_params *vcp;
        struct vmd_vm           *vm;
-       char                    *nargv[6], num[32];
+       char                    *nargv[8], num[32], vmm_fd[32];
        int                      fd, ret = EINVAL;
        int                      fds[2];
        pid_t                    vm_pid;
@@ -701,16 +701,6 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
                if (ret == EIO)
                        goto err;

-               /* Send the fd number for /dev/vmm. */
-               sz = atomicio(vwrite, fds[0], &env->vmd_fd,
-                   sizeof(env->vmd_fd));
-               if (sz != sizeof(env->vmd_fd)) {
-                       log_warnx("%s: failed to send /dev/vmm fd for vm '%s'",
-                           __func__, vcp->vcp_name);
-                       ret = EIO;
-                       goto err;
-               }
-
                /* Read back the kernel-generated vm id from the child */
                sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id));
                if (sz != sizeof(vcp->vcp_id)) {
@@ -773,17 +763,21 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
                memset(&nargv, 0, sizeof(nargv));
                memset(num, 0, sizeof(num));
                snprintf(num, sizeof(num), "%d", fds[1]);
+               memset(vmm_fd, 0, sizeof(vmm_fd));
+               snprintf(vmm_fd, sizeof(vmm_fd), "%d", env->vmd_fd);

                nargv[0] = env->argv0;
                nargv[1] = "-V";
                nargv[2] = num;
                nargv[3] = "-n";
+               nargv[4] = "-i";
+               nargv[5] = vmm_fd;

                if (env->vmd_verbose) {
-                       nargv[4] = "-v";
-                       nargv[5] = NULL;
+                       nargv[6] = "-v";
+                       nargv[7] = NULL;
                } else
-                       nargv[4] = NULL;
+                       nargv[6] = NULL;

                /* Control resumes in vmd main(). */
                execvp(nargv[0], nargv);

vmm/vmd: use anon shared mappings for devices

Reply via email to