Dave Voutila <d...@sisu.io> writes:

> In vmd, the vmm process forks to create the resulting vm process. After
> this fork, the vmm parent process closes all the file descriptors
> pointing to the vm's devices (cdrom, kernel, disks, nics, etc.).
>
> The logic was a bit funky, so this change relies on the fact we can
> attempt the close(2) call and use its success/failure to determine if we
> have an fd to mark -1 in the vm structure. (I guess we could just
> blindly set them to -1 post-close, but this feels more sensical to me.)
>
> While in the function, it cleans up some commentary and logging around
> the actual fork(2) call. Since fork(2) sets errno, we can use it in the
> log message, too.
>
> This reduces some noise in my upcoming diff to introduce execvp to the
> forking of child vm processes (as presented at AsiaBSDCon).
>

A touch longer, but it won't generate ktrace noise from blind calls to close(2)
and it also accounts for the other error conditions (EINTR, EIO).

For EIO, I'm not sure yet how we want to handle it other than logging it.

For EINTR, we want to account for that race and make sure we retry since
the vmm process is long-lived and could inadvertently keep things like
tty fds or disk image fds open after the guest vm terminates.

I need to check on other calls to close(2) in vmd, but most fds are
passed via imsg, so I'd presume any interruption is handled in the
kernel. (Someone correct me if I'm wrong.)


diff refs/heads/master refs/heads/vmd-vmm-fd-dance
commit - 8371c6b4d5765a69d520f93d64f57c6a2989f9ae
commit + 9c07553a618956ba89f29b93906dcd6ea6c19de8
blob - 75af37a29a6cd4917d8c4ca9be35c48772314f4b
blob + e0af71dac5d730c7b88166384a95b94bf14fcfc4
--- usr.sbin/vmd/vmd.c
+++ usr.sbin/vmd/vmd.c
@@ -1956,3 +1956,21 @@ vm_terminate(struct vmd_vm *vm, const char *caller)
                vm_remove(vm, caller);
        }
 }
+
+int
+close_fd(int fd)
+{
+       int     ret;
+
+       if (fd == -1)
+               return (0);
+
+       do
+               ret = close(fd);
+       while (ret == -1 && errno == EINTR);
+
+       if (ret == -1 && errno == EIO)
+               log_warn("%s(%d)", __func__, fd);
+
+       return (ret);
+}
blob - 7bbbf62734bc7cd575c4fee384193037b0495bb4
blob + 7228ace4b31a9fd6b5eb90bf1d048198a03a04fc
--- usr.sbin/vmd/vmd.h
+++ usr.sbin/vmd/vmd.h
@@ -443,6 +443,7 @@ void         getmonotime(struct timeval *);
 uint32_t prefixlen2mask(uint8_t);
 void    prefixlen2mask6(u_int8_t, struct in6_addr *);
 void    getmonotime(struct timeval *);
+int     close_fd(int);

 /* priv.c */
 void    priv(struct privsep *, struct privsep_proc *);
blob - d9eff3c8f703854c7b1e49fee04b8e426956cfbb
blob + 7db920beec7e34a63f5ba3602f17185eec33d3f7
--- usr.sbin/vmd/vmm.c
+++ usr.sbin/vmd/vmm.c
@@ -641,12 +641,10 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
        if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) == -1)
                fatal("socketpair");

-       /* Start child vmd for this VM (fork, chroot, drop privs) */
+       /* Fork the vmm process to create the vm, inheriting open device fds. */
        vm_pid = fork();
-
-       /* Start child failed? - cleanup and leave */
        if (vm_pid == -1) {
-               log_warnx("%s: start child failed", __func__);
+               log_warn("%s: fork child failed", __func__);
                ret = EIO;
                goto err;
        }
@@ -654,31 +652,24 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
        if (vm_pid > 0) {
                /* Parent */
                vm->vm_pid = vm_pid;
-               close(fds[1]);
+               close_fd(fds[1]);

                for (i = 0 ; i < vcp->vcp_ndisks; i++) {
                        for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
-                               if (vm->vm_disks[i][j] != -1)
-                                       close(vm->vm_disks[i][j]);
-                               vm->vm_disks[i][j] = -1;
+                               if (close_fd(vm->vm_disks[i][j]) == 0)
+                                   vm->vm_disks[i][j] = -1;
                        }
                }
                for (i = 0 ; i < vcp->vcp_nnics; i++) {
-                       close(vm->vm_ifs[i].vif_fd);
-                       vm->vm_ifs[i].vif_fd = -1;
+                       if (close_fd(vm->vm_ifs[i].vif_fd) == 0)
+                           vm->vm_ifs[i].vif_fd = -1;
                }
-               if (vm->vm_kernel != -1) {
-                       close(vm->vm_kernel);
+               if (close_fd(vm->vm_kernel) == 0)
                        vm->vm_kernel = -1;
-               }
-               if (vm->vm_cdrom != -1) {
-                       close(vm->vm_cdrom);
+               if (close_fd(vm->vm_cdrom) == 0)
                        vm->vm_cdrom = -1;
-               }
-               if (vm->vm_tty != -1) {
-                       close(vm->vm_tty);
+               if (close_fd(vm->vm_tty) == 0)
                        vm->vm_tty = -1;
-               }

                /* Read back the kernel-generated vm id from the child */
                sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id));
@@ -702,8 +693,8 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
                return (0);
        } else {
                /* Child */
-               close(fds[0]);
-               close(PROC_PARENT_SOCK_FILENO);
+               close_fd(fds[0]);
+               close_fd(PROC_PARENT_SOCK_FILENO);

                ret = start_vm(vm, fds[1]);

Reply via email to