On Fri, Jan 22, 2010 at 12:56:50PM +0100, Guido Trotter wrote: > Before, when doing kvm live migrations we use to accept an "unknown > status" but to reject anything that didn't match our regexp. Since we've > seen "info migrate" return a completely empty answer, we'll be more > tolerant of completely unknown results (while still logging them) and at > the same time we'll limit the number of them which we're willing to > accept in a row. > > Signed-off-by: Guido Trotter <ultrot...@google.com> > --- > lib/hypervisor/hv_kvm.py | 17 ++++++++++++++--- > 1 files changed, 14 insertions(+), 3 deletions(-) > > diff --git a/lib/hypervisor/hv_kvm.py b/lib/hypervisor/hv_kvm.py > index 92a2854..40b5ef0 100644 > --- a/lib/hypervisor/hv_kvm.py > +++ b/lib/hypervisor/hv_kvm.py > @@ -675,17 +675,25 @@ class KVMHypervisor(hv_base.BaseHypervisor): > > info_command = 'info migrate' > done = False > + broken_answers = 0 > while not done: > result = self._CallMonitorCommand(instance_name, info_command) > match = self._MIGRATION_STATUS_RE.search(result.stdout) > if not match: > - raise errors.HypervisorError("Unknown 'info migrate' result: %s" % > - result.stdout) > + broken_answers += 1 > + if not result.stdout: > + logging.info("KVM: empty 'info migrate' result") > + else: > + logging.warning("KVM: unknown 'info migrate' result: %s" % > + result.stdout) > + time.sleep(2) > else: > status = match.group(1) > if status == 'completed': > done = True > elif status == 'active': > + # reset the broken answers count > + broken_answers = 0 > time.sleep(2) > elif status == 'failed' or status == 'cancelled': > if not live: > @@ -693,8 +701,11 @@ class KVMHypervisor(hv_base.BaseHypervisor): > raise errors.HypervisorError("Migration %s at the kvm level" % > status) > else: > - logging.info("KVM: unknown migration status '%s'", status) > + logging.warning("KVM: unknown migration status '%s'", status) > + broken_answers += 1 > time.sleep(2) > + if broken_answers >= 5: > + raise errors.HypervisorError("Too many 'info migrate' broken > answers") >
LGTM, the only thing I don't like 100% is the "5" and "2" hardcoded here. Could you move it to a constant on the KVMHypervisor class, e.g. MAX_MIGRATE_BAD_ANSWERS, and MIGRATE_RETRY_DELAY? (Or such). Side note, I don't think this retry loop works with the Retry class. iustin