From: "Dr. David Alan Gilbert" <dgilb...@redhat.com> An early postcopy failure can be recovered from as long as we know we haven't sent the command to run the destination. We have to undo the bdrv_inactivate_all by calling bdrv_invalidate_cache_all.
Note that I'm not using ms->block_inactive because once we've sent the postcopy package we don't want anything else to try and recover the block storage on the source; the destination might have started writing to it. Signed-off-by: Dr. David Alan Gilbert <dgilb...@redhat.com> --- migration/migration.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index 2766d2f..283677c 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -1605,6 +1605,7 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) QIOChannelBuffer *bioc; QEMUFile *fb; int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + bool restart_block = false; migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_POSTCOPY_ACTIVE); @@ -1624,6 +1625,7 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) if (ret < 0) { goto fail; } + restart_block = true; /* * Cause any non-postcopiable, but iterative devices to @@ -1680,6 +1682,18 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running) /* <><> end of stuff going into the package */ + /* Last point of recovery; as soon as we send the package the destination + * can open devices and potentially start running. + * Lets just check again we've not got any errors. + */ + ret = qemu_file_get_error(ms->to_dst_file); + if (ret) { + error_report("postcopy_start: Migration stream errored (pre package)"); + goto fail_closefb; + } + + restart_block = false; + /* Now send that blob */ if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) { goto fail_closefb; @@ -1717,6 +1731,17 @@ fail_closefb: fail: migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, MIGRATION_STATUS_FAILED); + if (restart_block) { + /* A failure happened early enough that we know the destination hasn't + * accessed block devices, so we're safe to recover. 
+ */ + Error *local_err = NULL; + + bdrv_invalidate_cache_all(&local_err); + if (local_err) { + error_report_err(local_err); + } + } qemu_mutex_unlock_iothread(); return -1; } -- 2.9.3