Add a side effect to StartInstance on the Xen hypervisor that restores
the Xen configuration in case it is missing. This is a common failure
scenario during migrations. The generic backend checks if the instance
is running and queries the hypervisor about its state. If it needs
recovery, the backend gathers the associated block devices and calls
RestoreInstance, in the same way it would call StartInstance if the
instance were not already running.

Signed-off-by: Viktor Bachraty <vbachr...@google.com>
---
 lib/backend.py            | 17 +++++++++++------
 lib/hypervisor/hv_base.py | 30 +++++++++++++++++++++++++++++-
 lib/hypervisor/hv_xen.py  | 31 +++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/lib/backend.py b/lib/backend.py
index 2c2448b..3ccc901 100644
--- a/lib/backend.py
+++ b/lib/backend.py
@@ -2908,15 +2908,20 @@ def StartInstance(instance, startup_paused, reason, store_reason=True):
   @rtype: None
 
   """
-  instance_info = _GetInstanceInfo(instance)
+  try:
+    instance_info = _GetInstanceInfo(instance)
+    hyper = hypervisor.GetHypervisor(instance.hypervisor)
 
-  if instance_info and not _IsInstanceUserDown(instance_info):
-    logging.info("Instance '%s' already running, not starting", instance.name)
-    return
+    if instance_info and not _IsInstanceUserDown(instance_info):
+      logging.info("Instance '%s' already running, not starting", instance.name)
+      if hyper.VerifyInstance(instance):
+        return
+      logging.info("Instance '%s' needs fixup", instance.name)
+      block_devices = _GatherAndLinkBlockDevs(instance)
+      hyper.RestoreInstance(instance, block_devices)
+      return
 
-  try:
     block_devices = _GatherAndLinkBlockDevs(instance)
-    hyper = hypervisor.GetHypervisor(instance.hypervisor)
     hyper.StartInstance(instance, block_devices, startup_paused)
     if store_reason:
       _StoreInstReasonTrail(instance.name, reason)
diff --git a/lib/hypervisor/hv_base.py b/lib/hypervisor/hv_base.py
index 8aaa062..6d4e85f 100644
--- a/lib/hypervisor/hv_base.py
+++ b/lib/hypervisor/hv_base.py
@@ -300,9 +300,37 @@ class BaseHypervisor(object):
   CAN_MIGRATE = False
 
   def StartInstance(self, instance, block_devices, startup_paused):
-    """Start an instance."""
+    """Start an instance.
+
+    @type instance: L{objects.Instance}
+    @param instance: instance to start
+    @type block_devices: list of tuples (disk_object, link_name, drive_uri)
+    @param block_devices: blockdevices assigned to this instance
+    @type startup_paused: bool
+    @param startup_paused: if instance should be paused at startup
+    """
     raise NotImplementedError
 
+  def VerifyInstance(self, instance):  # pylint: disable=R0201,W0613
+    """Verify if running instance is in correct state.
+
+    @type instance: L{objects.Instance}
+    @param instance: instance to verify
+
+    @return: bool, if instance in correct state
+    """
+    return True
+
+  def RestoreInstance(self, instance, block_devices):
+    """Fixup running instance's state.
+
+    @type instance: L{objects.Instance}
+    @param instance: instance whose state should be restored
+    @type block_devices: list of tuples (disk_object, link_name, drive_uri)
+    @param block_devices: blockdevices assigned to this instance
+    """
+    pass
+
   def StopInstance(self, instance, force=False, retry=False, name=None,
                    timeout=None):
     """Stop an instance
diff --git a/lib/hypervisor/hv_xen.py b/lib/hypervisor/hv_xen.py
index 9873f14..ae47b41 100644
--- a/lib/hypervisor/hv_xen.py
+++ b/lib/hypervisor/hv_xen.py
@@ -34,6 +34,7 @@
 
 import logging
 import errno
+import os
 import string # pylint: disable=W0402
 import shutil
 import time
@@ -896,9 +897,39 @@ class XenHypervisor(hv_base.BaseHypervisor):
 
     self._WriteConfigFile(instance.name, buf.getvalue())
 
+  def VerifyInstance(self, instance):
+    """Verify if running instance is in correct state.
+
+    @type instance: L{objects.Instance}
+    @param instance: instance to verify
+
+    @return: bool, if instance in correct state
+    """
+    config_file = utils.PathJoin(self._cfgdir, "auto", instance.name)
+    return os.path.exists(config_file)
+
+  def RestoreInstance(self, instance, block_devices):
+    """Fixup running instance's state.
+
+    @type instance: L{objects.Instance}
+    @param instance: instance whose state should be restored
+    @type block_devices: list of tuples (disk_object, link_name, drive_uri)
+    @param block_devices: blockdevices assigned to this instance
+    """
+    startup_memory = self._InstanceStartupMemory(instance)
+    self._MakeConfigFile(instance, startup_memory, block_devices)
+
   def StartInstance(self, instance, block_devices, startup_paused):
     """Start an instance.
 
+    @type instance: L{objects.Instance}
+    @param instance: instance to start
+    @type block_devices: list of tuples (cfdev, rldev)
+      - cfdev: dict containing ganeti config disk part
+      - rldev: ganeti.block.bdev.BlockDev object
+    @param block_devices: blockdevices assigned to this instance
+    @type startup_paused: bool
+    @param startup_paused: if instance should be paused at startup
     """
     startup_memory = self._InstanceStartupMemory(instance)
 
-- 
2.8.0.rc3.226.g39d4020

Reply via email to