In some failure modes, Ganeti's state of record may become desynchronized from the actual state of the world. This patch allows 'migrate --cleanup' to adopt an instance if it is detected running on an unexpected node.
Signed-off-by: Viktor Bachraty <vbachr...@google.com> --- lib/cmdlib/instance_migration.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/lib/cmdlib/instance_migration.py b/lib/cmdlib/instance_migration.py index 423a08b..cac0f5e 100644 --- a/lib/cmdlib/instance_migration.py +++ b/lib/cmdlib/instance_migration.py @@ -611,8 +611,9 @@ class TLMigrateInstance(Tasklet): " hangs, the hypervisor might be in a bad state)") cluster_hvparams = self.cfg.GetClusterInfo().hvparams + online_node_uuids = self.cfg.GetOnlineNodeList() instance_list = self.rpc.call_instance_list( - self.all_node_uuids, [self.instance.hypervisor], cluster_hvparams) + online_node_uuids, [self.instance.hypervisor], cluster_hvparams) # Verify each result and raise an exception if failed for node_uuid, result in instance_list.items(): @@ -678,10 +679,16 @@ class TLMigrateInstance(Tasklet): " and restart this operation") if not (runningon_source or runningon_target): - raise errors.OpExecError("Instance does not seem to be running at all;" - " in this case it's safer to repair by" - " running 'gnt-instance stop' to ensure disk" - " shutdown, and then restarting it") + if len(instance_locations) == 1: + # The instance is running on a differrent node than expected, let's + # adopt it as if it was running on the secondary + self.target_node_uuid = instance_locations[0] + runningon_target = True + else: + raise errors.OpExecError("Instance does not seem to be running at all;" + " in this case it's safer to repair by" + " running 'gnt-instance stop' to ensure disk" + " shutdown, and then restarting it") if runningon_target: # the migration has actually succeeded, we need to update the config -- 2.8.0.rc3.226.g39d4020