Re: don't remove known vmd vm's on failure
On Sun, Jan 15, 2023 at 09:08:29AM -0500, Dave Voutila wrote: > > Dave Voutila writes: > > > It turns out not only does vmd have numerous error paths for handling > > when something is amiss with a guest, most of the paths don't check if > > it's a known vm defined in vm.conf. > > > > As a result, vmd often removes the vm from the SLIST of vm's meaning > > one can't easily attempt to start it again or see it in vmctl's status > > output. > > > > A simple reproduction: > > > > 1. define a vm with memory > 4gb in vm.conf > > 2. run vmd in the foreground (doas vmd -d) so it's not started by rc.d > > 3. try to start with `vmctl start -c ${vm_name}`, you should trigger > > an ENOMEM and get the "Cannot allocate memory" message from vmctl. > > 4. try to start the same vm again...now you get EPERM! > > 5. the vm is no longer visible in the output from `vmctl status` :( > > > > The problem is most of the error paths call vm_remove, which not only > > tears down the vm via vm_stop, but also removes it from the vm list and > > frees it. Only clean stops or restarts seem to perform this check > > currently. > > > > Below diff refactors into checking if the vm is defined in the global > > config before deciding to call vm_stop or vm_remove. > > Slight tweak... __func__->caller to actually pass the correct name to > vm_{stop,remove}() from vm_terminate() > Finally getting caught up. ok mlarkin on this if you didn't commit it already. -ml > > diff refs/heads/master refs/heads/vmd-accounting > commit - d4e23fe7544b01187ebf3ac8ae32e955445ee666 > commit + 46503195403bfab50cd34bd8682f35a17d54d03d > blob - 6bffb2519a31464836aa573dbccb7aa14ea97722 > blob + f30dc14de1ff9d5cf121cbc08b6db183a06d0c07 > --- usr.sbin/vmd/vmd.c > +++ usr.sbin/vmd/vmd.c > @@ -67,6 +67,8 @@ struct vmd *env; > int vm_claimid(const char *, int, uint32_t *); > void start_vm_batch(int, short, void*); > > +static inline void vm_terminate(struct vmd_vm *, const char *); > + > struct vmd *env; > > static struct privsep_proc procs[] = { > @@ -395,14 +397,14 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > errno = vmr.vmr_result; > log_warn("%s: failed to forward vm result", > vcp->vcp_name); > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > return (-1); > } > } > > if (vmr.vmr_result) { > log_warnx("%s: failed to start vm", vcp->vcp_name); > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > errno = vmr.vmr_result; > break; > } > @@ -410,7 +412,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > /* Now configure all the interfaces */ > if (vm_priv_ifconfig(ps, vm) == -1) { > log_warn("%s: failed to configure vm", vcp->vcp_name); > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > break; > } > > @@ -441,10 +443,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > log_info("%s: sent vm %d successfully.", > vm->vm_params.vmc_params.vcp_name, > vm->vm_vmid); > - if (vm->vm_from_config) > - vm_stop(vm, 0, __func__); > - else > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > } > > /* Send a response if a control client is waiting for it */ > @@ -470,10 +469,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > } > if (vmr.vmr_result != EAGAIN || > vm->vm_params.vmc_bootdevice) { > - if (vm->vm_from_config) > - vm_stop(vm, 0, __func__); > - else > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > } else { > /* Stop VM instance but keep the tty open */ > vm_stop(vm, 1, __func__); > @@ -509,7 +505,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > imsg->hdr.peerid, -1, , sizeof(vir)) == -1) { > log_debug("%s: GET_INFO_VM failed for vm %d, removing", > __func__, vm->vm_vmid); > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > return (-1); > } > break; > @@ -545,7 +541,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc >
Re: don't remove known vmd vm's on failure
Tried it out here with my gimpy little test setup and your suggested repro case. Seems to be more sane to me in this case, and looks like the right thing to do, So ok beck@ for what that’s worth. > On Jan 21, 2023, at 8:08 AM, Dave Voutila wrote: > > > *bump*... Anyone able to test or review? Other than bikeshedding some > function naming, this isn't a dramatic change. > > Dave Voutila writes: > >> Dave Voutila writes: >> >>> It turns out not only does vmd have numerous error paths for handling >>> when something is amiss with a guest, most of the paths don't check if >>> it's a known vm defined in vm.conf. >>> >>> As a result, vmd often removes the vm from the SLIST of vm's meaning >>> one can't easily attempt to start it again or see it in vmctl's status >>> output. >>> >>> A simple reproduction: >>> >>> 1. define a vm with memory > 4gb in vm.conf >>> 2. run vmd in the foreground (doas vmd -d) so it's not started by rc.d >>> 3. try to start with `vmctl start -c ${vm_name}`, you should trigger >>> an ENOMEM and get the "Cannot allocate memory" message from vmctl. >>> 4. try to start the same vm again...now you get EPERM! >>> 5. the vm is no longer visible in the output from `vmctl status` :( >>> >>> The problem is most of the error paths call vm_remove, which not only >>> tears down the vm via vm_stop, but also removes it from the vm list and >>> frees it. Only clean stops or restarts seem to perform this check >>> currently. >>> >>> Below diff refactors into checking if the vm is defined in the global >>> config before deciding to call vm_stop or vm_remove. >> >> Slight tweak... __func__->caller to actually pass the correct name to >> vm_{stop,remove}() from vm_terminate() >> >> >> diff refs/heads/master refs/heads/vmd-accounting >> commit - d4e23fe7544b01187ebf3ac8ae32e955445ee666 >> commit + 46503195403bfab50cd34bd8682f35a17d54d03d >> blob - 6bffb2519a31464836aa573dbccb7aa14ea97722 >> blob + f30dc14de1ff9d5cf121cbc08b6db183a06d0c07 >> --- usr.sbin/vmd/vmd.c >> +++ usr.sbin/vmd/vmd.c >> @@ -67,6 +67,8 @@ struct vmd *env; >> int vm_claimid(const char *, int, uint32_t *); >> void start_vm_batch(int, short, void*); >> >> +static inline void vm_terminate(struct vmd_vm *, const char *); >> + >> struct vmd *env; >> >> static struct privsep_proc procs[] = { >> @@ -395,14 +397,14 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc >> errno = vmr.vmr_result; >> log_warn("%s: failed to forward vm result", >>vcp->vcp_name); >> - vm_remove(vm, __func__); >> + vm_terminate(vm, __func__); >> return (-1); >> } >> } >> >> if (vmr.vmr_result) { >> log_warnx("%s: failed to start vm", vcp->vcp_name); >> - vm_remove(vm, __func__); >> + vm_terminate(vm, __func__); >> errno = vmr.vmr_result; >> break; >> } >> @@ -410,7 +412,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc >> /* Now configure all the interfaces */ >> if (vm_priv_ifconfig(ps, vm) == -1) { >> log_warn("%s: failed to configure vm", vcp->vcp_name); >> - vm_remove(vm, __func__); >> + vm_terminate(vm, __func__); >> break; >> } >> >> @@ -441,10 +443,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc >> log_info("%s: sent vm %d successfully.", >>vm->vm_params.vmc_params.vcp_name, >>vm->vm_vmid); >> - if (vm->vm_from_config) >> - vm_stop(vm, 0, __func__); >> - else >> - vm_remove(vm, __func__); >> + vm_terminate(vm, __func__); >> } >> >> /* Send a response if a control client is waiting for it */ >> @@ -470,10 +469,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc >> } >> if (vmr.vmr_result != EAGAIN || >>vm->vm_params.vmc_bootdevice) { >> - if (vm->vm_from_config) >> - vm_stop(vm, 0, __func__); >> - else >> - vm_remove(vm, __func__); >> + vm_terminate(vm, __func__); >> } else { >> /* Stop VM instance but keep the tty open */ >> vm_stop(vm, 1, __func__); >> @@ -509,7 +505,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc >>imsg->hdr.peerid, -1, , sizeof(vir)) == -1) { >> log_debug("%s: GET_INFO_VM failed for vm %d, removing", >>__func__, vm->vm_vmid); >> - vm_remove(vm, __func__); >> + vm_terminate(vm, __func__); >> return (-1); >> } >> break; >> @@ -545,7 +541,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc >>sizeof(vir)) == -1) { >> log_debug("%s: GET_INFO_VM_END failed", >>__func__); >> - vm_remove(vm, __func__); >> + vm_terminate(vm, __func__); >> return (-1); >> } >> } >> >> @@ -1943,3 +1943,14 @@ getmonotime(struct timeval *tv) >> >> TIMESPEC_TO_TIMEVAL(tv, ); >> } >> + >> +static inline void >> +vm_terminate(struct vmd_vm *vm, const char *caller) >> +{ >> + if (vm->vm_from_config) >> + vm_stop(vm, 0, caller); >> + else { >> + /* vm_remove calls vm_stop */ >> + vm_remove(vm, caller); >> + } >> +} >
Re: don't remove known vmd vm's on failure
*bump*... Anyone able to test or review? Other than bikeshedding some function naming, this isn't a dramatic change. Dave Voutila writes: > Dave Voutila writes: > >> It turns out not only does vmd have numerous error paths for handling >> when something is amiss with a guest, most of the paths don't check if >> it's a known vm defined in vm.conf. >> >> As a result, vmd often removes the vm from the SLIST of vm's meaning >> one can't easily attempt to start it again or see it in vmctl's status >> output. >> >> A simple reproduction: >> >> 1. define a vm with memory > 4gb in vm.conf >> 2. run vmd in the foreground (doas vmd -d) so it's not started by rc.d >> 3. try to start with `vmctl start -c ${vm_name}`, you should trigger >> an ENOMEM and get the "Cannot allocate memory" message from vmctl. >> 4. try to start the same vm again...now you get EPERM! >> 5. the vm is no longer visible in the output from `vmctl status` :( >> >> The problem is most of the error paths call vm_remove, which not only >> tears down the vm via vm_stop, but also removes it from the vm list and >> frees it. Only clean stops or restarts seem to perform this check >> currently. >> >> Below diff refactors into checking if the vm is defined in the global >> config before deciding to call vm_stop or vm_remove. > > Slight tweak... __func__->caller to actually pass the correct name to > vm_{stop,remove}() from vm_terminate() > > > diff refs/heads/master refs/heads/vmd-accounting > commit - d4e23fe7544b01187ebf3ac8ae32e955445ee666 > commit + 46503195403bfab50cd34bd8682f35a17d54d03d > blob - 6bffb2519a31464836aa573dbccb7aa14ea97722 > blob + f30dc14de1ff9d5cf121cbc08b6db183a06d0c07 > --- usr.sbin/vmd/vmd.c > +++ usr.sbin/vmd/vmd.c > @@ -67,6 +67,8 @@ struct vmd *env; > int vm_claimid(const char *, int, uint32_t *); > void start_vm_batch(int, short, void*); > > +static inline void vm_terminate(struct vmd_vm *, const char *); > + > struct vmd *env; > > static struct privsep_proc procs[] = { > @@ -395,14 +397,14 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > errno = vmr.vmr_result; > log_warn("%s: failed to forward vm result", > vcp->vcp_name); > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > return (-1); > } > } > > if (vmr.vmr_result) { > log_warnx("%s: failed to start vm", vcp->vcp_name); > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > errno = vmr.vmr_result; > break; > } > @@ -410,7 +412,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > /* Now configure all the interfaces */ > if (vm_priv_ifconfig(ps, vm) == -1) { > log_warn("%s: failed to configure vm", vcp->vcp_name); > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > break; > } > > @@ -441,10 +443,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > log_info("%s: sent vm %d successfully.", > vm->vm_params.vmc_params.vcp_name, > vm->vm_vmid); > - if (vm->vm_from_config) > - vm_stop(vm, 0, __func__); > - else > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > } > > /* Send a response if a control client is waiting for it */ > @@ -470,10 +469,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > } > if (vmr.vmr_result != EAGAIN || > vm->vm_params.vmc_bootdevice) { > - if (vm->vm_from_config) > - vm_stop(vm, 0, __func__); > - else > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > } else { > /* Stop VM instance but keep the tty open */ > vm_stop(vm, 1, __func__); > @@ -509,7 +505,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > imsg->hdr.peerid, -1, , sizeof(vir)) == -1) { > log_debug("%s: GET_INFO_VM failed for vm %d, removing", > __func__, vm->vm_vmid); > - vm_remove(vm, __func__); > + vm_terminate(vm, __func__); > return (-1); > } > break; > @@ -545,7 +541,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc > sizeof(vir)) == -1) { >
Re: don't remove known vmd vm's on failure
Dave Voutila writes: > It turns out not only does vmd have numerous error paths for handling > when something is amiss with a guest, most of the paths don't check if > it's a known vm defined in vm.conf. > > As a result, vmd often removes the vm from the SLIST of vm's meaning > one can't easily attempt to start it again or see it in vmctl's status > output. > > A simple reproduction: > > 1. define a vm with memory > 4gb in vm.conf > 2. run vmd in the foreground (doas vmd -d) so it's not started by rc.d > 3. try to start with `vmctl start -c ${vm_name}`, you should trigger > an ENOMEM and get the "Cannot allocate memory" message from vmctl. > 4. try to start the same vm again...now you get EPERM! > 5. the vm is no longer visible in the output from `vmctl status` :( > > The problem is most of the error paths call vm_remove, which not only > tears down the vm via vm_stop, but also removes it from the vm list and > frees it. Only clean stops or restarts seem to perform this check > currently. > > Below diff refactors into checking if the vm is defined in the global > config before deciding to call vm_stop or vm_remove. Slight tweak... __func__->caller to actually pass the correct name to vm_{stop,remove}() from vm_terminate() diff refs/heads/master refs/heads/vmd-accounting commit - d4e23fe7544b01187ebf3ac8ae32e955445ee666 commit + 46503195403bfab50cd34bd8682f35a17d54d03d blob - 6bffb2519a31464836aa573dbccb7aa14ea97722 blob + f30dc14de1ff9d5cf121cbc08b6db183a06d0c07 --- usr.sbin/vmd/vmd.c +++ usr.sbin/vmd/vmd.c @@ -67,6 +67,8 @@ struct vmd*env; int vm_claimid(const char *, int, uint32_t *); voidstart_vm_batch(int, short, void*); +static inline void vm_terminate(struct vmd_vm *, const char *); + struct vmd *env; static struct privsep_proc procs[] = { @@ -395,14 +397,14 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc errno = vmr.vmr_result; log_warn("%s: failed to forward vm result", vcp->vcp_name); - vm_remove(vm, __func__); + vm_terminate(vm, __func__); return (-1); } } if (vmr.vmr_result) { log_warnx("%s: failed to start vm", vcp->vcp_name); - vm_remove(vm, __func__); + vm_terminate(vm, __func__); errno = vmr.vmr_result; break; } @@ -410,7 +412,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc /* Now configure all the interfaces */ if (vm_priv_ifconfig(ps, vm) == -1) { log_warn("%s: failed to configure vm", vcp->vcp_name); - vm_remove(vm, __func__); + vm_terminate(vm, __func__); break; } @@ -441,10 +443,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc log_info("%s: sent vm %d successfully.", vm->vm_params.vmc_params.vcp_name, vm->vm_vmid); - if (vm->vm_from_config) - vm_stop(vm, 0, __func__); - else - vm_remove(vm, __func__); + vm_terminate(vm, __func__); } /* Send a response if a control client is waiting for it */ @@ -470,10 +469,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc } if (vmr.vmr_result != EAGAIN || vm->vm_params.vmc_bootdevice) { - if (vm->vm_from_config) - vm_stop(vm, 0, __func__); - else - vm_remove(vm, __func__); + vm_terminate(vm, __func__); } else { /* Stop VM instance but keep the tty open */ vm_stop(vm, 1, __func__); @@ -509,7 +505,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc imsg->hdr.peerid, -1, , sizeof(vir)) == -1) { log_debug("%s: GET_INFO_VM failed for vm %d, removing", __func__, vm->vm_vmid); - vm_remove(vm, __func__); + vm_terminate(vm, __func__); return (-1); } break; @@ -545,7 +541,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc sizeof(vir)) == -1) { log_debug("%s: GET_INFO_VM_END failed", __func__); - vm_remove(vm, __func__); +
don't remove known vmd vm's on failure
It turns out not only does vmd have numerous error paths for handling when something is amiss with a guest, most of the paths don't check if it's a known vm defined in vm.conf. As a result, vmd often removes the vm from the SLIST of vm's meaning one can't easily attempt to start it again or see it in vmctl's status output. A simple reproduction: 1. define a vm with memory > 4gb in vm.conf 2. run vmd in the foreground (doas vmd -d) so it's not started by rc.d 3. try to start with `vmctl start -c ${vm_name}`, you should trigger an ENOMEM and get the "Cannot allocate memory" message from vmctl. 4. try to start the same vm again...now you get EPERM! 5. the vm is no longer visible in the output from `vmctl status` :( The problem is most of the error paths call vm_remove, which not only tears down the vm via vm_stop, but also removes it from the vm list and frees it. Only clean stops or restarts seem to perform this check currently. Below diff refactors into checking if the vm is defined in the global config before deciding to call vm_stop or vm_remove. ok? -dv diff refs/heads/master refs/heads/vmd-accounting commit - d4e23fe7544b01187ebf3ac8ae32e955445ee666 commit + 46503195403bfab50cd34bd8682f35a17d54d03d blob - 6bffb2519a31464836aa573dbccb7aa14ea97722 blob + f30dc14de1ff9d5cf121cbc08b6db183a06d0c07 --- usr.sbin/vmd/vmd.c +++ usr.sbin/vmd/vmd.c @@ -67,6 +67,8 @@ struct vmd*env; int vm_claimid(const char *, int, uint32_t *); voidstart_vm_batch(int, short, void*); +static inline void vm_terminate(struct vmd_vm *, const char *); + struct vmd *env; static struct privsep_proc procs[] = { @@ -395,14 +397,14 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc errno = vmr.vmr_result; log_warn("%s: failed to forward vm result", vcp->vcp_name); - vm_remove(vm, __func__); + vm_terminate(vm, __func__); return (-1); } } if (vmr.vmr_result) { log_warnx("%s: failed to start vm", vcp->vcp_name); - vm_remove(vm, __func__); + vm_terminate(vm, __func__); errno = vmr.vmr_result; break; } @@ -410,7 +412,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc /* Now configure all the interfaces */ if (vm_priv_ifconfig(ps, vm) == -1) { log_warn("%s: failed to configure vm", vcp->vcp_name); - vm_remove(vm, __func__); + vm_terminate(vm, __func__); break; } @@ -441,10 +443,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc log_info("%s: sent vm %d successfully.", vm->vm_params.vmc_params.vcp_name, vm->vm_vmid); - if (vm->vm_from_config) - vm_stop(vm, 0, __func__); - else - vm_remove(vm, __func__); + vm_terminate(vm, __func__); } /* Send a response if a control client is waiting for it */ @@ -470,10 +469,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc } if (vmr.vmr_result != EAGAIN || vm->vm_params.vmc_bootdevice) { - if (vm->vm_from_config) - vm_stop(vm, 0, __func__); - else - vm_remove(vm, __func__); + vm_terminate(vm, __func__); } else { /* Stop VM instance but keep the tty open */ vm_stop(vm, 1, __func__); @@ -509,7 +505,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc imsg->hdr.peerid, -1, , sizeof(vir)) == -1) { log_debug("%s: GET_INFO_VM failed for vm %d, removing", __func__, vm->vm_vmid); - vm_remove(vm, __func__); + vm_terminate(vm, __func__); return (-1); } break; @@ -545,7 +541,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc sizeof(vir)) == -1) { log_debug("%s: GET_INFO_VM_END failed", __func__); - vm_remove(vm, __func__); + vm_terminate(vm, __func__); return (-1); } } @@ -1943,3 +1943,14