In case of Xen migrations, the most common failure case is when the
instance fails to freeze so the migration fails with domains running on
both target and source node. This patch allows migrate --cleanup to
recover by running _AbortMigration() in case the instance is running on
both the source and target node.

Signed-off-by: Viktor Bachraty <vbachr...@google.com>
---
 lib/cmdlib/instance_migration.py | 99 ++++++++++++++++++++++++++++++----------
 1 file changed, 74 insertions(+), 25 deletions(-)

diff --git a/lib/cmdlib/instance_migration.py b/lib/cmdlib/instance_migration.py
index 1e053fb..3225861 100644
--- a/lib/cmdlib/instance_migration.py
+++ b/lib/cmdlib/instance_migration.py
@@ -598,12 +598,44 @@ class TLMigrateInstance(Tasklet):
       nres.Raise("Cannot change disks config on node %s" %
                  self.cfg.GetNodeName(node_uuid))
 
+  def _FindInstanceLocations(self, name):
+    """Returns a list of nodes that have the given instance running
+
+    Args:
+      name: string, instance name string to search for
+
+    Returns:
+      list of strings, node uuids
+    """
+    self.feedback_fn("* checking where the instance actually runs (if this"
+                     " hangs, the hypervisor might be in a bad state)")
+
+    cluster_hvparams = self.cfg.GetClusterInfo().hvparams
+    instance_list = self.rpc.call_instance_list(
+        self.all_node_uuids, [self.instance.hypervisor], cluster_hvparams)
+
+    # Verify each result and raise an exception if failed
+    for node_uuid, result in instance_list.items():
+      result.Raise("Can't contact node %s" % self.cfg.GetNodeName(node_uuid))
+
+    # Xen renames the instance during migration, unfortunately we don't have
+    # a nicer way of identifying that it's the same instance. This is an awful
+    # leaking abstraction.
+    variants = [
+        name, 'migrating-' + name, name + '--incoming', name + '--migratedaway']
+    node_uuids = [node for node, data in instance_list.items()
+                  if any(var in data.payload for var in variants)]
+    self.feedback_fn("* instance running on: %s" % ','.join(
+        self.cfg.GetNodeName(uuid) for uuid in node_uuids))
+    return node_uuids
+
   def _ExecCleanup(self):
     """Try to cleanup after a failed migration.
 
     The cleanup is done by:
       - check that the instance is running only on one node
-        (and update the config if needed)
+      - try 'aborting' migration if it is running on two nodes
+      - update the config if needed
       - change disks on its secondary node to secondary
       - wait until disks are fully synchronized
       - disconnect from the network
@@ -611,23 +643,36 @@ class TLMigrateInstance(Tasklet):
       - wait again until disks are fully synchronized
 
     """
-    # check running on only one node
-    self.feedback_fn("* checking where the instance actually runs"
-                     " (if this hangs, the hypervisor might be in"
-                     " a bad state)")
-    cluster_hvparams = self.cfg.GetClusterInfo().hvparams
-    ins_l = self.rpc.call_instance_list(self.all_node_uuids,
-                                        [self.instance.hypervisor],
-                                        cluster_hvparams)
-    for node_uuid, result in ins_l.items():
-      result.Raise("Can't contact node %s" % node_uuid)
-
-    runningon_source = self.instance.name in \
-                         ins_l[self.source_node_uuid].payload
-    runningon_target = self.instance.name in \
-                         ins_l[self.target_node_uuid].payload
+    instance_locations = self._FindInstanceLocations(self.instance.name)
+    runningon_source = self.source_node_uuid in instance_locations
+    runningon_target = self.target_node_uuid in instance_locations
 
     if runningon_source and runningon_target:
+      # If we have an instance on both the source and the destination, we know
+      # that instance migration was interrupted in the middle; we can try to
+      # do effectively the same as when aborting an interrupted migration.
+      self.feedback_fn("Trying to cleanup after failed migration")
+      result = self.rpc.call_migration_info(
+          self.source_node_uuid, self.instance)
+      if result.fail_msg:
+        raise errors.OpExecError(
+            "Failed fetching source migration information from %s: %s" %
+            (self.cfg.GetNodeName(self.source_node_uuid), result.fail_msg))
+      self.migration_info = result.payload
+      abort_results = self._AbortMigration()
+
+      if abort_results[0].fail_msg or abort_results[1].fail_msg:
+        raise errors.OpExecError(
+            "Instance migration cleanup failed: %s" % ','.join([
+                abort_results[0].fail_msg, abort_results[1].fail_msg]))
+
+      # AbortMigration() should have fixed instance locations, so query again
+      instance_locations = self._FindInstanceLocations(self.instance.name)
+      runningon_source = self.source_node_uuid in instance_locations
+      runningon_target = self.target_node_uuid in instance_locations
+
+    # Abort didn't work, manual intervention required
+    if runningon_source and runningon_target:
       raise errors.OpExecError("Instance seems to be running on two nodes,"
                                " or the hypervisor is confused; you will have"
                                " to ensure manually that it runs only on one"
@@ -655,6 +700,7 @@ class TLMigrateInstance(Tasklet):
 
     disks = self.cfg.GetInstanceDisks(self.instance.uuid)
 
+    # TODO: Cleanup code duplication of _RevertDiskStatus()
     self._CloseInstanceDisks(demoted_node_uuid)
 
     if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
@@ -697,24 +743,27 @@ class TLMigrateInstance(Tasklet):
   def _AbortMigration(self):
     """Call the hypervisor code to abort a started migration.
 
+    Returns:
+      tuple of rpc call results
     """
-    abort_result = self.rpc.call_instance_finalize_migration_dst(
-                     self.target_node_uuid, self.instance, self.migration_info,
-                     False)
-    abort_msg = abort_result.fail_msg
+    src_result = self.rpc.call_instance_finalize_migration_dst(
+        self.target_node_uuid, self.instance, self.migration_info, False)
+    abort_msg = src_result.fail_msg
     if abort_msg:
       logging.error("Aborting migration failed on target node %s: %s",
                     self.cfg.GetNodeName(self.target_node_uuid), abort_msg)
-      # Don't raise an exception here, as we stil have to try to revert the
-      # disk status, even if this step failed.
 
-    abort_result = self.rpc.call_instance_finalize_migration_src(
-      self.source_node_uuid, self.instance, False, self.live)
-    abort_msg = abort_result.fail_msg
+    # Don't raise an exception here, as we still have to try to revert the
+    # disk status, even if this step failed.
+    dst_result = self.rpc.call_instance_finalize_migration_src(
+        self.source_node_uuid, self.instance, False, self.live)
+    abort_msg = dst_result.fail_msg
     if abort_msg:
       logging.error("Aborting migration failed on source node %s: %s",
                     self.cfg.GetNodeName(self.source_node_uuid), abort_msg)
 
+    return src_result, dst_result
+
   def _ExecMigration(self):
     """Migrate an instance.
 
-- 
2.8.0.rc3.226.g39d4020

Reply via email to