In some failure modes, Ganeti's state of record may desync from the actual
state of the world. This patch allows 'migrate --cleanup' to adopt an
instance if it is detected running on an unexpected node.

Signed-off-by: Viktor Bachraty <vbachr...@google.com>
---
 lib/cmdlib/instance_migration.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/cmdlib/instance_migration.py b/lib/cmdlib/instance_migration.py
index 423a08b..cac0f5e 100644
--- a/lib/cmdlib/instance_migration.py
+++ b/lib/cmdlib/instance_migration.py
@@ -611,8 +611,9 @@ class TLMigrateInstance(Tasklet):
                      " hangs, the hypervisor might be in a bad state)")
 
     cluster_hvparams = self.cfg.GetClusterInfo().hvparams
+    online_node_uuids = self.cfg.GetOnlineNodeList()
     instance_list = self.rpc.call_instance_list(
-        self.all_node_uuids, [self.instance.hypervisor], cluster_hvparams)
+        online_node_uuids, [self.instance.hypervisor], cluster_hvparams)
 
     # Verify each result and raise an exception if failed
     for node_uuid, result in instance_list.items():
@@ -678,10 +679,16 @@ class TLMigrateInstance(Tasklet):
                                " and restart this operation")
 
     if not (runningon_source or runningon_target):
-      raise errors.OpExecError("Instance does not seem to be running at all;"
-                               " in this case it's safer to repair by"
-                               " running 'gnt-instance stop' to ensure disk"
-                               " shutdown, and then restarting it")
+      if len(instance_locations) == 1:
+        # The instance is running on a different node than expected; let's
+        # adopt it as if it was running on the secondary
+        self.target_node_uuid = instance_locations[0]
+        runningon_target = True
+      else:
+        raise errors.OpExecError("Instance does not seem to be running at all;"
+                                 " in this case it's safer to repair by"
+                                 " running 'gnt-instance stop' to ensure disk"
+                                 " shutdown, and then restarting it")
 
     if runningon_target:
       # the migration has actually succeeded, we need to update the config
-- 
2.8.0.rc3.226.g39d4020

Reply via email to