Re: don't remove known vmd vm's on failure

2023-01-26 Thread Mike Larkin
On Sun, Jan 15, 2023 at 09:08:29AM -0500, Dave Voutila wrote:
> 
> Dave Voutila  writes:
> 
> > It turns out not only does vmd have numerous error paths for handling
> > when something is amiss with a guest, most of the paths don't check if
> > it's a known vm defined in vm.conf.
> >
> > As a result, vmd often removes the vm from the SLIST of vm's meaning
> > one can't easily attempt to start it again or see it in vmctl's status
> > output.
> >
> > A simple reproduction:
> >
> >   1. define a vm with memory > 4gb in vm.conf
> >   2. run vmd in the foreground (doas vmd -d) so it's not started by rc.d
> >   3. try to start with `vmctl start -c ${vm_name}`, you should trigger
> >  an ENOMEM and get the "Cannot allocate memory" message from vmctl.
> >   4. try to start the same vm again...now you get EPERM!
> >   5. the vm is no longer visible in the output from `vmctl status` :(
> >
> > The problem is most of the error paths call vm_remove, which not only
> > tears down the vm via vm_stop, but also removes it from the vm list and
> > frees it. Only clean stops or restarts seem to perform this check
> > currently.
> >
> > Below diff refactors into checking if the vm is defined in the global
> > config before deciding to call vm_stop or vm_remove.
> 
> Slight tweak... __func__->caller to actually pass the correct name to
> vm_{stop,remove}() from vm_terminate()
> 

Finally getting caught up. ok mlarkin on this if you didn't commit it
already.

-ml

> 
> diff refs/heads/master refs/heads/vmd-accounting
> commit - d4e23fe7544b01187ebf3ac8ae32e955445ee666
> commit + 46503195403bfab50cd34bd8682f35a17d54d03d
> blob - 6bffb2519a31464836aa573dbccb7aa14ea97722
> blob + f30dc14de1ff9d5cf121cbc08b6db183a06d0c07
> --- usr.sbin/vmd/vmd.c
> +++ usr.sbin/vmd/vmd.c
> @@ -67,6 +67,8 @@ struct vmd  *env;
>  int   vm_claimid(const char *, int, uint32_t *);
>  void  start_vm_batch(int, short, void*);
> 
> +static inline void vm_terminate(struct vmd_vm *, const char *);
> +
>  struct vmd   *env;
> 
>  static struct privsep_proc procs[] = {
> @@ -395,14 +397,14 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   errno = vmr.vmr_result;
>   log_warn("%s: failed to forward vm result",
>   vcp->vcp_name);
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   return (-1);
>   }
>   }
> 
>   if (vmr.vmr_result) {
>   log_warnx("%s: failed to start vm", vcp->vcp_name);
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   errno = vmr.vmr_result;
>   break;
>   }
> @@ -410,7 +412,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   /* Now configure all the interfaces */
>   if (vm_priv_ifconfig(ps, vm) == -1) {
>   log_warn("%s: failed to configure vm", vcp->vcp_name);
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   break;
>   }
> 
> @@ -441,10 +443,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   log_info("%s: sent vm %d successfully.",
>   vm->vm_params.vmc_params.vcp_name,
>   vm->vm_vmid);
> - if (vm->vm_from_config)
> - vm_stop(vm, 0, __func__);
> - else
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   }
> 
>   /* Send a response if a control client is waiting for it */
> @@ -470,10 +469,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   }
>   if (vmr.vmr_result != EAGAIN ||
>   vm->vm_params.vmc_bootdevice) {
> - if (vm->vm_from_config)
> - vm_stop(vm, 0, __func__);
> - else
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   } else {
>   /* Stop VM instance but keep the tty open */
>   vm_stop(vm, 1, __func__);
> @@ -509,7 +505,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   imsg->hdr.peerid, -1, , sizeof(vir)) == -1) {
>   log_debug("%s: GET_INFO_VM failed for vm %d, removing",
>   __func__, vm->vm_vmid);
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   return (-1);
>   }
>   break;
> @@ -545,7 +541,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   

Re: don't remove known vmd vm's on failure

2023-01-22 Thread Bob Beck
Tried it out here with my gimpy little test setup and your suggested repro 
case. 

Seems to be more sane to me in this case, and looks like the right thing to do, 

So ok beck@ for what that’s worth. 

> On Jan 21, 2023, at 8:08 AM, Dave Voutila  wrote:
> 
> 
> *bump*... Anyone able to test or review? Other than bikeshedding some
> function naming, this isn't a dramatic change.
> 
> Dave Voutila  writes:
> 
>> Dave Voutila  writes:
>> 
>>> It turns out not only does vmd have numerous error paths for handling
>>> when something is amiss with a guest, most of the paths don't check if
>>> it's a known vm defined in vm.conf.
>>> 
>>> As a result, vmd often removes the vm from the SLIST of vm's meaning
>>> one can't easily attempt to start it again or see it in vmctl's status
>>> output.
>>> 
>>> A simple reproduction:
>>> 
>>>  1. define a vm with memory > 4gb in vm.conf
>>>  2. run vmd in the foreground (doas vmd -d) so it's not started by rc.d
>>>  3. try to start with `vmctl start -c ${vm_name}`, you should trigger
>>> an ENOMEM and get the "Cannot allocate memory" message from vmctl.
>>>  4. try to start the same vm again...now you get EPERM!
>>>  5. the vm is no longer visible in the output from `vmctl status` :(
>>> 
>>> The problem is most of the error paths call vm_remove, which not only
>>> tears down the vm via vm_stop, but also removes it from the vm list and
>>> frees it. Only clean stops or restarts seem to perform this check
>>> currently.
>>> 
>>> Below diff refactors into checking if the vm is defined in the global
>>> config before deciding to call vm_stop or vm_remove.
>> 
>> Slight tweak... __func__->caller to actually pass the correct name to
>> vm_{stop,remove}() from vm_terminate()
>> 
>> 
>> diff refs/heads/master refs/heads/vmd-accounting
>> commit - d4e23fe7544b01187ebf3ac8ae32e955445ee666
>> commit + 46503195403bfab50cd34bd8682f35a17d54d03d
>> blob - 6bffb2519a31464836aa573dbccb7aa14ea97722
>> blob + f30dc14de1ff9d5cf121cbc08b6db183a06d0c07
>> --- usr.sbin/vmd/vmd.c
>> +++ usr.sbin/vmd/vmd.c
>> @@ -67,6 +67,8 @@ struct vmd *env;
>> int vm_claimid(const char *, int, uint32_t *);
>> void start_vm_batch(int, short, void*);
>> 
>> +static inline void vm_terminate(struct vmd_vm *, const char *);
>> +
>> struct vmd *env;
>> 
>> static struct privsep_proc procs[] = {
>> @@ -395,14 +397,14 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>> errno = vmr.vmr_result;
>> log_warn("%s: failed to forward vm result",
>>vcp->vcp_name);
>> - vm_remove(vm, __func__);
>> + vm_terminate(vm, __func__);
>> return (-1);
>> }
>> }
>> 
>> if (vmr.vmr_result) {
>> log_warnx("%s: failed to start vm", vcp->vcp_name);
>> - vm_remove(vm, __func__);
>> + vm_terminate(vm, __func__);
>> errno = vmr.vmr_result;
>> break;
>> }
>> @@ -410,7 +412,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>> /* Now configure all the interfaces */
>> if (vm_priv_ifconfig(ps, vm) == -1) {
>> log_warn("%s: failed to configure vm", vcp->vcp_name);
>> - vm_remove(vm, __func__);
>> + vm_terminate(vm, __func__);
>> break;
>> }
>> 
>> @@ -441,10 +443,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>> log_info("%s: sent vm %d successfully.",
>>vm->vm_params.vmc_params.vcp_name,
>>vm->vm_vmid);
>> - if (vm->vm_from_config)
>> - vm_stop(vm, 0, __func__);
>> - else
>> - vm_remove(vm, __func__);
>> + vm_terminate(vm, __func__);
>> }
>> 
>> /* Send a response if a control client is waiting for it */
>> @@ -470,10 +469,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>> }
>> if (vmr.vmr_result != EAGAIN ||
>>vm->vm_params.vmc_bootdevice) {
>> - if (vm->vm_from_config)
>> - vm_stop(vm, 0, __func__);
>> - else
>> - vm_remove(vm, __func__);
>> + vm_terminate(vm, __func__);
>> } else {
>> /* Stop VM instance but keep the tty open */
>> vm_stop(vm, 1, __func__);
>> @@ -509,7 +505,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>>imsg->hdr.peerid, -1, , sizeof(vir)) == -1) {
>> log_debug("%s: GET_INFO_VM failed for vm %d, removing",
>>__func__, vm->vm_vmid);
>> - vm_remove(vm, __func__);
>> + vm_terminate(vm, __func__);
>> return (-1);
>> }
>> break;
>> @@ -545,7 +541,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>>sizeof(vir)) == -1) {
>> log_debug("%s: GET_INFO_VM_END failed",
>>__func__);
>> - vm_remove(vm, __func__);
>> + vm_terminate(vm, __func__);
>> return (-1);
>> }
>> }
>> 
>> @@ -1943,3 +1943,14 @@ getmonotime(struct timeval *tv)
>> 
>> TIMESPEC_TO_TIMEVAL(tv, );
>> }
>> +
>> +static inline void
>> +vm_terminate(struct vmd_vm *vm, const char *caller)
>> +{
>> + if (vm->vm_from_config)
>> + vm_stop(vm, 0, caller);
>> + else {
>> + /* vm_remove calls vm_stop */
>> + vm_remove(vm, caller);
>> + }
>> +}
> 



Re: don't remove known vmd vm's on failure

2023-01-21 Thread Dave Voutila


*bump*... Anyone able to test or review? Other than bikeshedding some
function naming, this isn't a dramatic change.

Dave Voutila  writes:

> Dave Voutila  writes:
>
>> It turns out not only does vmd have numerous error paths for handling
>> when something is amiss with a guest, most of the paths don't check if
>> it's a known vm defined in vm.conf.
>>
>> As a result, vmd often removes the vm from the SLIST of vm's meaning
>> one can't easily attempt to start it again or see it in vmctl's status
>> output.
>>
>> A simple reproduction:
>>
>>   1. define a vm with memory > 4gb in vm.conf
>>   2. run vmd in the foreground (doas vmd -d) so it's not started by rc.d
>>   3. try to start with `vmctl start -c ${vm_name}`, you should trigger
>>  an ENOMEM and get the "Cannot allocate memory" message from vmctl.
>>   4. try to start the same vm again...now you get EPERM!
>>   5. the vm is no longer visible in the output from `vmctl status` :(
>>
>> The problem is most of the error paths call vm_remove, which not only
>> tears down the vm via vm_stop, but also removes it from the vm list and
>> frees it. Only clean stops or restarts seem to perform this check
>> currently.
>>
>> Below diff refactors into checking if the vm is defined in the global
>> config before deciding to call vm_stop or vm_remove.
>
> Slight tweak... __func__->caller to actually pass the correct name to
> vm_{stop,remove}() from vm_terminate()
>
>
> diff refs/heads/master refs/heads/vmd-accounting
> commit - d4e23fe7544b01187ebf3ac8ae32e955445ee666
> commit + 46503195403bfab50cd34bd8682f35a17d54d03d
> blob - 6bffb2519a31464836aa573dbccb7aa14ea97722
> blob + f30dc14de1ff9d5cf121cbc08b6db183a06d0c07
> --- usr.sbin/vmd/vmd.c
> +++ usr.sbin/vmd/vmd.c
> @@ -67,6 +67,8 @@ struct vmd  *env;
>  int   vm_claimid(const char *, int, uint32_t *);
>  void  start_vm_batch(int, short, void*);
>
> +static inline void vm_terminate(struct vmd_vm *, const char *);
> +
>  struct vmd   *env;
>
>  static struct privsep_proc procs[] = {
> @@ -395,14 +397,14 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   errno = vmr.vmr_result;
>   log_warn("%s: failed to forward vm result",
>   vcp->vcp_name);
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   return (-1);
>   }
>   }
>
>   if (vmr.vmr_result) {
>   log_warnx("%s: failed to start vm", vcp->vcp_name);
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   errno = vmr.vmr_result;
>   break;
>   }
> @@ -410,7 +412,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   /* Now configure all the interfaces */
>   if (vm_priv_ifconfig(ps, vm) == -1) {
>   log_warn("%s: failed to configure vm", vcp->vcp_name);
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   break;
>   }
>
> @@ -441,10 +443,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   log_info("%s: sent vm %d successfully.",
>   vm->vm_params.vmc_params.vcp_name,
>   vm->vm_vmid);
> - if (vm->vm_from_config)
> - vm_stop(vm, 0, __func__);
> - else
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   }
>
>   /* Send a response if a control client is waiting for it */
> @@ -470,10 +469,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   }
>   if (vmr.vmr_result != EAGAIN ||
>   vm->vm_params.vmc_bootdevice) {
> - if (vm->vm_from_config)
> - vm_stop(vm, 0, __func__);
> - else
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   } else {
>   /* Stop VM instance but keep the tty open */
>   vm_stop(vm, 1, __func__);
> @@ -509,7 +505,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   imsg->hdr.peerid, -1, , sizeof(vir)) == -1) {
>   log_debug("%s: GET_INFO_VM failed for vm %d, removing",
>   __func__, vm->vm_vmid);
> - vm_remove(vm, __func__);
> + vm_terminate(vm, __func__);
>   return (-1);
>   }
>   break;
> @@ -545,7 +541,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
>   sizeof(vir)) == -1) {
>

Re: don't remove known vmd vm's on failure

2023-01-15 Thread Dave Voutila


Dave Voutila  writes:

> It turns out not only does vmd have numerous error paths for handling
> when something is amiss with a guest, most of the paths don't check if
> it's a known vm defined in vm.conf.
>
> As a result, vmd often removes the vm from the SLIST of vm's meaning
> one can't easily attempt to start it again or see it in vmctl's status
> output.
>
> A simple reproduction:
>
>   1. define a vm with memory > 4gb in vm.conf
>   2. run vmd in the foreground (doas vmd -d) so it's not started by rc.d
>   3. try to start with `vmctl start -c ${vm_name}`, you should trigger
>  an ENOMEM and get the "Cannot allocate memory" message from vmctl.
>   4. try to start the same vm again...now you get EPERM!
>   5. the vm is no longer visible in the output from `vmctl status` :(
>
> The problem is most of the error paths call vm_remove, which not only
> tears down the vm via vm_stop, but also removes it from the vm list and
> frees it. Only clean stops or restarts seem to perform this check
> currently.
>
> Below diff refactors into checking if the vm is defined in the global
> config before deciding to call vm_stop or vm_remove.

Slight tweak... __func__->caller to actually pass the correct name to
vm_{stop,remove}() from vm_terminate()


diff refs/heads/master refs/heads/vmd-accounting
commit - d4e23fe7544b01187ebf3ac8ae32e955445ee666
commit + 46503195403bfab50cd34bd8682f35a17d54d03d
blob - 6bffb2519a31464836aa573dbccb7aa14ea97722
blob + f30dc14de1ff9d5cf121cbc08b6db183a06d0c07
--- usr.sbin/vmd/vmd.c
+++ usr.sbin/vmd/vmd.c
@@ -67,6 +67,8 @@ struct vmd*env;
 int vm_claimid(const char *, int, uint32_t *);
 voidstart_vm_batch(int, short, void*);

+static inline void vm_terminate(struct vmd_vm *, const char *);
+
 struct vmd *env;

 static struct privsep_proc procs[] = {
@@ -395,14 +397,14 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
errno = vmr.vmr_result;
log_warn("%s: failed to forward vm result",
vcp->vcp_name);
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
return (-1);
}
}

if (vmr.vmr_result) {
log_warnx("%s: failed to start vm", vcp->vcp_name);
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
errno = vmr.vmr_result;
break;
}
@@ -410,7 +412,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
/* Now configure all the interfaces */
if (vm_priv_ifconfig(ps, vm) == -1) {
log_warn("%s: failed to configure vm", vcp->vcp_name);
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
break;
}

@@ -441,10 +443,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
log_info("%s: sent vm %d successfully.",
vm->vm_params.vmc_params.vcp_name,
vm->vm_vmid);
-   if (vm->vm_from_config)
-   vm_stop(vm, 0, __func__);
-   else
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
}

/* Send a response if a control client is waiting for it */
@@ -470,10 +469,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
}
if (vmr.vmr_result != EAGAIN ||
vm->vm_params.vmc_bootdevice) {
-   if (vm->vm_from_config)
-   vm_stop(vm, 0, __func__);
-   else
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
} else {
/* Stop VM instance but keep the tty open */
vm_stop(vm, 1, __func__);
@@ -509,7 +505,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
imsg->hdr.peerid, -1, , sizeof(vir)) == -1) {
log_debug("%s: GET_INFO_VM failed for vm %d, removing",
__func__, vm->vm_vmid);
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
return (-1);
}
break;
@@ -545,7 +541,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
sizeof(vir)) == -1) {
log_debug("%s: GET_INFO_VM_END failed",
__func__);
-   vm_remove(vm, __func__);
+

don't remove known vmd vm's on failure

2023-01-14 Thread Dave Voutila
It turns out not only does vmd have numerous error paths for handling
when something is amiss with a guest, most of the paths don't check if
it's a known vm defined in vm.conf.

As a result, vmd often removes the vm from the SLIST of vm's meaning
one can't easily attempt to start it again or see it in vmctl's status
output.

A simple reproduction:

  1. define a vm with memory > 4gb in vm.conf
  2. run vmd in the foreground (doas vmd -d) so it's not started by rc.d
  3. try to start with `vmctl start -c ${vm_name}`, you should trigger
 an ENOMEM and get the "Cannot allocate memory" message from vmctl.
  4. try to start the same vm again...now you get EPERM!
  5. the vm is no longer visible in the output from `vmctl status` :(

The problem is most of the error paths call vm_remove, which not only
tears down the vm via vm_stop, but also removes it from the vm list and
frees it. Only clean stops or restarts seem to perform this check
currently.

Below diff refactors into checking if the vm is defined in the global
config before deciding to call vm_stop or vm_remove.

ok?
-dv


diff refs/heads/master refs/heads/vmd-accounting
commit - d4e23fe7544b01187ebf3ac8ae32e955445ee666
commit + 46503195403bfab50cd34bd8682f35a17d54d03d
blob - 6bffb2519a31464836aa573dbccb7aa14ea97722
blob + f30dc14de1ff9d5cf121cbc08b6db183a06d0c07
--- usr.sbin/vmd/vmd.c
+++ usr.sbin/vmd/vmd.c
@@ -67,6 +67,8 @@ struct vmd*env;
 int vm_claimid(const char *, int, uint32_t *);
 voidstart_vm_batch(int, short, void*);

+static inline void vm_terminate(struct vmd_vm *, const char *);
+
 struct vmd *env;

 static struct privsep_proc procs[] = {
@@ -395,14 +397,14 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
errno = vmr.vmr_result;
log_warn("%s: failed to forward vm result",
vcp->vcp_name);
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
return (-1);
}
}

if (vmr.vmr_result) {
log_warnx("%s: failed to start vm", vcp->vcp_name);
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
errno = vmr.vmr_result;
break;
}
@@ -410,7 +412,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
/* Now configure all the interfaces */
if (vm_priv_ifconfig(ps, vm) == -1) {
log_warn("%s: failed to configure vm", vcp->vcp_name);
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
break;
}

@@ -441,10 +443,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
log_info("%s: sent vm %d successfully.",
vm->vm_params.vmc_params.vcp_name,
vm->vm_vmid);
-   if (vm->vm_from_config)
-   vm_stop(vm, 0, __func__);
-   else
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
}

/* Send a response if a control client is waiting for it */
@@ -470,10 +469,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
}
if (vmr.vmr_result != EAGAIN ||
vm->vm_params.vmc_bootdevice) {
-   if (vm->vm_from_config)
-   vm_stop(vm, 0, __func__);
-   else
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
} else {
/* Stop VM instance but keep the tty open */
vm_stop(vm, 1, __func__);
@@ -509,7 +505,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
imsg->hdr.peerid, -1, , sizeof(vir)) == -1) {
log_debug("%s: GET_INFO_VM failed for vm %d, removing",
__func__, vm->vm_vmid);
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
return (-1);
}
break;
@@ -545,7 +541,7 @@ vmd_dispatch_vmm(int fd, struct privsep_proc *p, struc
sizeof(vir)) == -1) {
log_debug("%s: GET_INFO_VM_END failed",
__func__);
-   vm_remove(vm, __func__);
+   vm_terminate(vm, __func__);
return (-1);
}
}
@@ -1943,3 +1943,14