On Fri, Jan 22, 2010 at 12:56:50PM +0100, Guido Trotter wrote:
> Before, when doing kvm live migrations we use to accept an "unknown
> status" but to reject anything that didn't match our regexp. Since we've
> seen "info migrate" return a completely empty answer, we'll be more
> tolerant of completely unknown results (while still logging them) and at
> the same time we'll limit the number of them which we're willing to
> accept in a row.
> 
> Signed-off-by: Guido Trotter <ultrot...@google.com>
> ---
>  lib/hypervisor/hv_kvm.py |   17 ++++++++++++++---
>  1 files changed, 14 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/hypervisor/hv_kvm.py b/lib/hypervisor/hv_kvm.py
> index 92a2854..40b5ef0 100644
> --- a/lib/hypervisor/hv_kvm.py
> +++ b/lib/hypervisor/hv_kvm.py
> @@ -675,17 +675,25 @@ class KVMHypervisor(hv_base.BaseHypervisor):
>  
>      info_command = 'info migrate'
>      done = False
> +    broken_answers = 0
>      while not done:
>        result = self._CallMonitorCommand(instance_name, info_command)
>        match = self._MIGRATION_STATUS_RE.search(result.stdout)
>        if not match:
> -        raise errors.HypervisorError("Unknown 'info migrate' result: %s" %
> -                                     result.stdout)
> +        broken_answers += 1
> +        if not result.stdout:
> +          logging.info("KVM: empty 'info migrate' result")
> +        else:
> +          logging.warning("KVM: unknown 'info migrate' result: %s" %
> +                          result.stdout)
> +        time.sleep(2)
>        else:
>          status = match.group(1)
>          if status == 'completed':
>            done = True
>          elif status == 'active':
> +          # reset the broken answers count
> +          broken_answers = 0
>            time.sleep(2)
>          elif status == 'failed' or status == 'cancelled':
>            if not live:
> @@ -693,8 +701,11 @@ class KVMHypervisor(hv_base.BaseHypervisor):
>            raise errors.HypervisorError("Migration %s at the kvm level" %
>                                         status)
>          else:
> -          logging.info("KVM: unknown migration status '%s'", status)
> +          logging.warning("KVM: unknown migration status '%s'", status)
> +          broken_answers += 1
>            time.sleep(2)
> +      if broken_answers >= 5:
> +        raise errors.HypervisorError("Too many 'info migrate' broken 
> answers")
>  

LGTM, the only thing I don't like 100% is the "5" and "2" hardcoded
here. Could you move it to a constant on the KVMHypervisor class, e.g.
MAX_MIGRATE_BAD_ANSWERS, and MIGRATE_RETRY_DELAY? (Or such). Side note,
I don't think this retry loop works with the Retry class.

iustin

Reply via email to