Dave Voutila writes:
> Dave Voutila writes: > >> vmd(8) users of tech@, >> >> NOTE: I have no intention to try to commit this prior to 6.9's release >> due to its complexity, but I didn't want to "wait" to solicit testers or >> potential feedback. > > Freeze is over, so bumping this thread with an updated diff below. > Now that there's been some testing and snaps are building once again, anyone willing to review & OK? >> >> I noticed recently that I could not have two vmctl(8) clients "wait" for >> the same vm to shutdown as one would cancel the other. Worse yet, if you >> cancel a wait (^C) you can effectively corrupt the state being used for >> tracking the waiting client preventing future clients from waiting on >> the vm. >> >> It turns out the socket fd of the vmctl(8) client is being sent by the >> control process as the peerid in the imsg. This fd is being stored on >> the vmd_vm structure in the vm_peerid member, but this fd only has >> meaning in the scope of the control process. Consequently: >> >> - only 1 value can be stored at a time, meaning only 1 waiting client >> can exist at a time >> - since vm_peerid is used for storing if another vmd(8) process is >> waiting on the vm, vm_peerid can be corrupted by vmctl(8) >> - the control process cannot update waiting state on vmctl disconnects >> and since fd's are reused it's possible the message could be sent to a >> vmctl(8) client performing an operation other than a "wait" >> >> The below diff: >> >> 1. enables support for multiple vmctl(8) clients to wait on the same vm >> to terminate >> 2. keeps the wait state in the control process and out of the parent's >> global vm state, tracking waiting parties in a TAILQ >> 3. removes the waiting client state on client disconnect/cancellation >> 4. simplifies vmd(8) by removing IMSG_VMDOP_WAIT_VM_REQUEST handling >> from the vmm process, which isn't needed (and was partially >> responsible for the corruption) >> > > Above design still stands, but I've fixed some messaging issues related > to the fact the parent process was forwarding > IMSG_VMDOP_TERMINATE_VM_RESPONSE messages directly to the control > process resulting in duplicate messages. This broke doing a `vmctl stop` > for all vms (-a) and waiting (-w). It now only forwards errors. > >> There are some subsequent tweaks that may follow this diff, specifically >> one related to the fact I've switched the logic to send >> IMSG_VMDOP_TERMINATE_VM_EVENT messages to the control process (which >> makes sense to me) but control relays a IMSG_VMDOP_TERMINATE_VM_RESPONSE >> message to the waiting vmctl(8) client. I'd need to update vmctl(8) to >> look for the other event and don't want to complicate the diff further. >> >> If any testers out there can try to break this for me it would be much >> appreciated. :-) >> > > Testers? I'd like to give people a few days to kick the tires before > asking for OK to commit. > > -dv > > > Index: control.c > =================================================================== > RCS file: /cvs/src/usr.sbin/vmd/control.c,v > retrieving revision 1.34 > diff -u -p -r1.34 control.c > --- control.c 20 Apr 2021 21:11:56 -0000 1.34 > +++ control.c 21 Apr 2021 17:17:04 -0000 > @@ -41,6 +41,13 @@ > > struct ctl_connlist ctl_conns = TAILQ_HEAD_INITIALIZER(ctl_conns); > > +struct ctl_notify { > + int ctl_fd; > + uint32_t ctl_vmid; > + TAILQ_ENTRY(ctl_notify) entry; > +}; > +TAILQ_HEAD(ctl_notify_q, ctl_notify) ctl_notify_q = > + TAILQ_HEAD_INITIALIZER(ctl_notify_q); > void > control_accept(int, short, void *); > struct ctl_conn > @@ -78,7 +85,10 @@ int > control_dispatch_vmd(int fd, struct privsep_proc *p, struct imsg *imsg) > { > struct ctl_conn *c; > + struct ctl_notify *notify = NULL, *notify_next; > struct privsep *ps = p->p_ps; > + struct vmop_result vmr; > + int waiting = 0; > > switch (imsg->hdr.type) { > case IMSG_VMDOP_START_VM_RESPONSE: > @@ -86,11 +96,11 @@ control_dispatch_vmd(int fd, struct priv > case IMSG_VMDOP_SEND_VM_RESPONSE: > case IMSG_VMDOP_RECEIVE_VM_RESPONSE: > case IMSG_VMDOP_UNPAUSE_VM_RESPONSE: > - case IMSG_VMDOP_TERMINATE_VM_RESPONSE: > case IMSG_VMDOP_GET_INFO_VM_DATA: > case IMSG_VMDOP_GET_INFO_VM_END_DATA: > case IMSG_CTL_FAIL: > case IMSG_CTL_OK: > + /* Provide basic response back to a specific control client */ > if ((c = control_connbyfd(imsg->hdr.peerid)) == NULL) { > log_warnx("%s: lost control connection: fd %d", > __func__, imsg->hdr.peerid); > @@ -99,6 +109,61 @@ control_dispatch_vmd(int fd, struct priv > imsg_compose_event(&c->iev, imsg->hdr.type, > 0, 0, imsg->fd, imsg->data, IMSG_DATA_SIZE(imsg)); > break; > + case IMSG_VMDOP_TERMINATE_VM_RESPONSE: > + IMSG_SIZE_CHECK(imsg, &vmr); > + memcpy(&vmr, imsg->data, sizeof(vmr)); > + > + if ((c = control_connbyfd(imsg->hdr.peerid)) == NULL) { > + log_warnx("%s: lost control connection: fd %d", > + __func__, imsg->hdr.peerid); > + return (0); > + } > + > + TAILQ_FOREACH(notify, &ctl_notify_q, entry) { > + if (notify->ctl_fd == (int) imsg->hdr.peerid) { > + /* > + * Update if waiting by vm name. This is only > + * supported when stopping a single vm. If > + * stopping all vms, vmctl(8) sends the request > + * using the vmid. > + */ > + if (notify->ctl_vmid < 1) > + notify->ctl_vmid = vmr.vmr_id; > + waiting = 1; > + break; > + } > + } > + > + /* An error needs to be relayed to the client immediately */ > + if (!waiting || vmr.vmr_result) { > + imsg_compose_event(&c->iev, imsg->hdr.type, > + 0, 0, imsg->fd, imsg->data, IMSG_DATA_SIZE(imsg)); > + > + if (notify) { > + TAILQ_REMOVE(&ctl_notify_q, notify, entry); > + free(notify); > + } > + } > + break; > + case IMSG_VMDOP_TERMINATE_VM_EVENT: > + /* Notify any waiting clients that a VM terminated */ > + IMSG_SIZE_CHECK(imsg, &vmr); > + memcpy(&vmr, imsg->data, sizeof(vmr)); > + > + TAILQ_FOREACH_SAFE(notify, &ctl_notify_q, entry, notify_next) { > + if (notify->ctl_vmid != vmr.vmr_id) > + continue; > + if ((c = control_connbyfd(notify->ctl_fd)) != NULL) { > + /* XXX vmctl expects *_RESPONSE, not *_EVENT */ > + imsg_compose_event(&c->iev, > + IMSG_VMDOP_TERMINATE_VM_RESPONSE, > + 0, 0, imsg->fd, imsg->data, > + IMSG_DATA_SIZE(imsg)); > + TAILQ_REMOVE(&ctl_notify_q, notify, entry); > + free(notify); > + } > + } > + break; > case IMSG_VMDOP_CONFIG: > config_getconfig(ps->ps_env, imsg); > proc_compose(ps, PROC_PARENT, IMSG_VMDOP_DONE, NULL, 0); > @@ -276,7 +341,8 @@ control_connbyfd(int fd) > void > control_close(int fd, struct control_sock *cs) > { > - struct ctl_conn *c; > + struct ctl_conn *c; > + struct ctl_notify *notify, *notify_next; > > if ((c = control_connbyfd(fd)) == NULL) { > log_warn("%s: fd %d: not found", __func__, fd); > @@ -286,6 +352,14 @@ control_close(int fd, struct control_soc > msgbuf_clear(&c->iev.ibuf.w); > TAILQ_REMOVE(&ctl_conns, c, entry); > > + TAILQ_FOREACH_SAFE(notify, &ctl_notify_q, entry, notify_next) { > + if (notify->ctl_fd == fd) { > + TAILQ_REMOVE(&ctl_notify_q, notify, entry); > + free(notify); > + break; > + } > + } > + > event_del(&c->iev.ev); > close(c->iev.ibuf.fd); > > @@ -308,7 +382,8 @@ control_dispatch_imsg(int fd, short even > struct imsg imsg; > struct vmop_create_params vmc; > struct vmop_id vid; > - int n, v, ret = 0; > + struct ctl_notify *notify; > + int n, v, wait = 0, ret = 0; > > if ((c = control_connbyfd(fd)) == NULL) { > log_warn("%s: fd %d: not found", __func__, fd); > @@ -388,11 +463,25 @@ control_dispatch_imsg(int fd, short even > } > break; > case IMSG_VMDOP_WAIT_VM_REQUEST: > + wait = 1; > + /* FALLTHROUGH */ > case IMSG_VMDOP_TERMINATE_VM_REQUEST: > if (IMSG_DATA_SIZE(&imsg) < sizeof(vid)) > goto fail; > memcpy(&vid, imsg.data, sizeof(vid)); > vid.vid_uid = c->peercred.uid; > + > + if (wait || vid.vid_flags & VMOP_WAIT) { > + vid.vid_flags |= VMOP_WAIT; > + notify = calloc(1, sizeof(struct ctl_notify)); > + if (notify == NULL) > + fatal("%s: calloc", __func__); > + notify->ctl_vmid = vid.vid_id; > + notify->ctl_fd = fd; > + TAILQ_INSERT_TAIL(&ctl_notify_q, notify, entry); > + log_debug("%s: registered wait for peer %d", > + __func__, fd); > + } > > if (proc_compose_imsg(ps, PROC_PARENT, -1, > imsg.hdr.type, fd, -1, &vid, sizeof(vid)) == -1) { > Index: vmd.c > =================================================================== > RCS file: /cvs/src/usr.sbin/vmd/vmd.c,v > retrieving revision 1.122 > diff -u -p -r1.122 vmd.c > --- vmd.c 5 Apr 2021 11:35:26 -0000 1.122 > +++ vmd.c 21 Apr 2021 17:17:04 -0000 > @@ -128,42 +128,41 @@ vmd_dispatch_control(int fd, struct priv > IMSG_SIZE_CHECK(imsg, &vid); > memcpy(&vid, imsg->data, sizeof(vid)); > flags = vid.vid_flags; > + cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; > > if ((id = vid.vid_id) == 0) { > /* Lookup vm (id) by name */ > if ((vm = vm_getbyname(vid.vid_name)) == NULL) { > res = ENOENT; > - cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; > break; > } else if ((vm->vm_state & VM_STATE_SHUTDOWN) && > (flags & VMOP_FORCE) == 0) { > res = EALREADY; > - cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; > break; > } else if (!(vm->vm_state & VM_STATE_RUNNING)) { > res = EINVAL; > - cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; > break; > } > id = vm->vm_vmid; > } else if ((vm = vm_getbyvmid(id)) == NULL) { > res = ENOENT; > - cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; > break; > } > - if (vm_checkperm(vm, &vm->vm_params.vmc_owner, > - vid.vid_uid) != 0) { > + if (vm_checkperm(vm, &vm->vm_params.vmc_owner, vid.vid_uid)) { > res = EPERM; > - cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; > break; > } > > - memset(&vid, 0, sizeof(vid)); > - vid.vid_id = id; > - vid.vid_flags = flags; > - if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, > - imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1) > - return (-1); > + /* Only relay TERMINATION requests, not WAIT requests */ > + if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) { > + memset(&vid, 0, sizeof(vid)); > + vid.vid_id = id; > + vid.vid_flags = flags; > + > + if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type, > + imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1) > + return (-1); > + } > break; > case IMSG_VMDOP_GET_INFO_VM_REQUEST: > proc_forward_imsg(ps, imsg, PROC_VMM, -1); > @@ -420,12 +419,14 @@ vmd_dispatch_vmm(int fd, struct privsep_ > case IMSG_VMDOP_TERMINATE_VM_RESPONSE: > IMSG_SIZE_CHECK(imsg, &vmr); > memcpy(&vmr, imsg->data, sizeof(vmr)); > - DPRINTF("%s: forwarding TERMINATE VM for vm id %d", > - __func__, vmr.vmr_id); > - proc_forward_imsg(ps, imsg, PROC_CONTROL, -1); > - if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) > - break; > - if (vmr.vmr_result == 0) { > + > + if (vmr.vmr_result) { > + DPRINTF("%s: forwarding TERMINATE VM for vm id %d", > + __func__, vmr.vmr_id); > + proc_forward_imsg(ps, imsg, PROC_CONTROL, -1); > + } else { > + if ((vm = vm_getbyvmid(vmr.vmr_id)) == NULL) > + break; > /* Mark VM as shutting down */ > vm->vm_state |= VM_STATE_SHUTDOWN; > } > @@ -478,16 +479,13 @@ vmd_dispatch_vmm(int fd, struct privsep_ > config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid); > } > > - /* Send a response if a control client is waiting for it */ > - if (imsg->hdr.peerid != (uint32_t)-1) { > - /* the error is meaningless for deferred responses */ > - vmr.vmr_result = 0; > + /* The error is meaningless for deferred responses */ > + vmr.vmr_result = 0; > > - if (proc_compose_imsg(ps, PROC_CONTROL, -1, > - IMSG_VMDOP_TERMINATE_VM_RESPONSE, > - imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1) > - return (-1); > - } > + if (proc_compose_imsg(ps, PROC_CONTROL, -1, > + IMSG_VMDOP_TERMINATE_VM_EVENT, > + imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1) > + return (-1); > break; > case IMSG_VMDOP_GET_INFO_VM_DATA: > IMSG_SIZE_CHECK(imsg, &vir); > Index: vmm.c > =================================================================== > RCS file: /cvs/src/usr.sbin/vmd/vmm.c,v > retrieving revision 1.100 > diff -u -p -r1.100 vmm.c > --- vmm.c 11 Apr 2021 21:02:40 -0000 1.100 > +++ vmm.c 21 Apr 2021 17:17:04 -0000 > @@ -150,30 +150,6 @@ vmm_dispatch_parent(int fd, struct privs > res = ENOENT; > cmd = IMSG_VMDOP_START_VM_RESPONSE; > break; > - case IMSG_VMDOP_WAIT_VM_REQUEST: > - IMSG_SIZE_CHECK(imsg, &vid); > - memcpy(&vid, imsg->data, sizeof(vid)); > - id = vid.vid_id; > - > - DPRINTF("%s: recv'ed WAIT_VM for %d", __func__, id); > - > - cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE; > - if (id == 0) { > - res = ENOENT; > - } else if ((vm = vm_getbyvmid(id)) != NULL) { > - if (vm->vm_peerid != (uint32_t)-1) { > - peerid = vm->vm_peerid; > - res = EINTR; > - } else > - cmd = 0; > - vm->vm_peerid = imsg->hdr.peerid; > - } else { > - /* vm doesn't exist, cannot stop vm */ > - log_debug("%s: cannot stop vm that is not running", > - __func__); > - res = VMD_VM_STOP_INVALID; > - } > - break; > case IMSG_VMDOP_TERMINATE_VM_REQUEST: > IMSG_SIZE_CHECK(imsg, &vid); > memcpy(&vid, imsg->data, sizeof(vid)); > @@ -221,15 +197,6 @@ vmm_dispatch_parent(int fd, struct privs > __func__); > res = VMD_VM_STOP_INVALID; > } > - } > - if ((flags & VMOP_WAIT) && > - res == 0 && (vm->vm_state & VM_STATE_SHUTDOWN)) { > - if (vm->vm_peerid != (uint32_t)-1) { > - peerid = vm->vm_peerid; > - res = EINTR; > - } else > - cmd = 0; > - vm->vm_peerid = imsg->hdr.peerid; > } > } else { > /* VM doesn't exist, cannot stop vm */