avoids the possibility of dying during phase3_cleanup and, instead of
needing to duplicate the cleanup ourselves, lets us benefit from
phase2_cleanup doing it.

The duplicate cleanup was also very incomplete: it didn't stop the
remote kvm process (leading to a 'VM already running' error when trying
to migrate again afterwards), even though it removed its disks, and it
didn't unlock the config, didn't close the tunnel and didn't cancel the
block-dirty bitmaps.

Since migrate_cancel should do nothing after the (non-storage) migrate
process has completed, even that step of phase2_cleanup is fine here.

Since phase3 is empty at the moment, the order of operations is still
the same.

Also add a test that would complain about finish_tunnel not being
called before this patch. That test also checks that local disks are
not already removed before finishing the block jobs.

Signed-off-by: Fabian Ebner <f.eb...@proxmox.com>
---

New in v2.

The test would also expose the temporary breakage caused by the wrong
#8/#9 patch order.
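For reference, the QAPI behavior the moved hunk relies on: once a
drive-mirror block job has converged and reports the 'ready' state,
block-job-cancel completes the mirror with source and target in sync
(and disconnects the source from the NBD export) instead of discarding
the target. A minimal sketch of that interaction, assuming the mon_cmd
helper from PVE::QemuServer::Monitor; the helper function name here is
made up and this is not part of the patch:

    use strict;
    use warnings;

    use PVE::QemuServer::Monitor qw(mon_cmd);

    # Finish all converged drive-mirror jobs of a VM. In the 'ready'
    # state, block-job-cancel completes the mirror with source and
    # target in sync instead of aborting it (see qapi/block-core.json
    # in QEMU).
    sub cancel_ready_block_jobs {
        my ($vmid) = @_;

        my $jobs = mon_cmd($vmid, 'query-block-jobs');
        for my $job (@$jobs) {
            next if !$job->{ready}; # not converged yet - cancelling would abort
            mon_cmd($vmid, 'block-job-cancel', device => $job->{device});
        }
    }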
With and without this patch: when dying here, i.e. when finishing the
block jobs, the VM is left in a blocked state (postmigrate) afterwards,
because the (non-storage) migration was successful. Simply resuming it
seems to work just fine; would it be worth adding a (guarded) resume
call to the cleanup too?
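Something along the following lines is what I'd imagine for such a
guarded resume; just a sketch to illustrate the question, the helper
name is hypothetical and it is not part of this patch:

    use strict;
    use warnings;

    use PVE::QemuServer::Monitor qw(mon_cmd);

    # Hypothetical helper: resume the source VM again if dying while
    # finishing the block jobs left it blocked in 'postmigrate'.
    sub resume_if_postmigrate {
        my ($vmid) = @_;

        eval {
            my $status = mon_cmd($vmid, 'query-status');
            # only resume if the VM is actually blocked in postmigrate
            mon_cmd($vmid, 'cont') if ($status->{status} // '') eq 'postmigrate';
        };
        warn "failed to resume VM $vmid: $@" if $@;
    }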
 PVE/QemuMigrate.pm                    | 23 ++++++++----------
 test/MigrationTest/QemuMigrateMock.pm |  6 +++++
 test/run_qemu_migrate_tests.pl        | 35 +++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/PVE/QemuMigrate.pm b/PVE/QemuMigrate.pm
index b503601..435c1f7 100644
--- a/PVE/QemuMigrate.pm
+++ b/PVE/QemuMigrate.pm
@@ -1134,6 +1134,16 @@ sub phase2 {
             die "unable to parse migration status '$stat->{status}' - aborting\n";
         }
     }
+
+    if ($self->{storage_migration}) {
+        # finish block-job with block-job-cancel, to disconnect source VM from NBD
+        # to avoid it trying to re-establish it. We are in blockjob ready state,
+        # thus, this command changes it to blockjob complete (see qapi docs)
+        eval { PVE::QemuServer::qemu_drive_mirror_monitor($vmid, undef, $self->{storage_migration_jobs}, 'cancel'); };
+        if (my $err = $@) {
+            die "Failed to complete storage migration: $err\n";
+        }
+    }
 }

 sub phase2_cleanup {
@@ -1209,19 +1219,6 @@ sub phase3_cleanup {

     my $tunnel = $self->{tunnel};

-    if ($self->{storage_migration}) {
-        # finish block-job with block-job-cancel, to disconnect source VM from NBD
-        # to avoid it trying to re-establish it. We are in blockjob ready state,
-        # thus, this command changes to it to blockjob complete (see qapi docs)
-        eval { PVE::QemuServer::qemu_drive_mirror_monitor($vmid, undef, $self->{storage_migration_jobs}, 'cancel'); };
-
-        if (my $err = $@) {
-            eval { PVE::QemuServer::qemu_blockjobs_cancel($vmid, $self->{storage_migration_jobs}) };
-            eval { PVE::QemuMigrate::cleanup_remotedisks($self) };
-            die "Failed to complete storage migration: $err\n";
-        }
-    }
-
     if ($self->{volume_map}) {
         my $target_drives = $self->{target_drive};

diff --git a/test/MigrationTest/QemuMigrateMock.pm b/test/MigrationTest/QemuMigrateMock.pm
index 2d424e0..8e0b7d0 100644
--- a/test/MigrationTest/QemuMigrateMock.pm
+++ b/test/MigrationTest/QemuMigrateMock.pm
@@ -139,6 +139,12 @@ $MigrationTest::Shared::qemu_server_module->mock(
         file_set_contents("${RUN_DIR_PATH}/nbd_info", to_json($nbd_info));
     },
     qemu_drive_mirror_monitor => sub {
+        my ($vmid, $vmiddst, $jobs, $completion, $qga) = @_;
+
+        if ($fail_config->{qemu_drive_mirror_monitor} &&
+            $fail_config->{qemu_drive_mirror_monitor} eq $completion) {
+            die "qemu_drive_mirror_monitor '$completion' error\n";
+        }
         return;
     },
     set_migration_caps => sub {
diff --git a/test/run_qemu_migrate_tests.pl b/test/run_qemu_migrate_tests.pl
index 4f7f021..5edea7b 100755
--- a/test/run_qemu_migrate_tests.pl
+++ b/test/run_qemu_migrate_tests.pl
@@ -1444,6 +1444,41 @@ my $tests = [
             },
         },
     },
+    {
+        name => '149_running_unused_block_job_cancel_fail',
+        target => 'pve1',
+        vmid => 149,
+        vm_status => {
+            running => 1,
+            runningmachine => 'pc-q35-5.0+pve0',
+        },
+        opts => {
+            online => 1,
+            'with-local-disks' => 1,
+        },
+        config_patch => {
+            scsi1 => undef,
+            unused0 => 'local-dir:149/vm-149-disk-0.qcow2',
+        },
+        expected_calls => {},
+        expect_die => "qemu_drive_mirror_monitor 'cancel' error",
+        # note that 'cancel' is also used to finish and that's what this test is about
+        fail_config => {
+            'qemu_drive_mirror_monitor' => 'cancel',
+        },
+        expected => {
+            source_volids => local_volids_for_vm(149),
+            target_volids => {},
+            vm_config => get_patched_config(149, {
+                scsi1 => undef,
+                unused0 => 'local-dir:149/vm-149-disk-0.qcow2',
+            }),
+            vm_status => {
+                running => 1,
+                runningmachine => 'pc-q35-5.0+pve0',
+            },
+        },
+    },
     {
         name => '149_offline',
         target => 'pve1',
--
2.20.1

_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel