* Peter Xu (pet...@redhat.com) wrote:
> Allow the fault thread to stop handling page faults temporarily. When a
> network failure happens (and we expect a recovery afterwards), we
> should not allow the fault thread to continue sending requests to the
> source; instead, it should halt until the connection is rebuilt.
>
> When the destination main thread notices the failure, it kicks the fault
> thread to switch it to the paused state.
>
> Signed-off-by: Peter Xu <pet...@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com> > --- > migration/migration.c | 1 + > migration/migration.h | 1 + > migration/postcopy-ram.c | 50 > ++++++++++++++++++++++++++++++++++++++++++++---- > migration/savevm.c | 3 +++ > migration/trace-events | 2 ++ > 5 files changed, 53 insertions(+), 4 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index 9a0b5b0..9d93836 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -147,6 +147,7 @@ MigrationIncomingState > *migration_incoming_get_current(void) > qemu_mutex_init(&mis_current.rp_mutex); > qemu_event_init(&mis_current.main_thread_load_event, false); > qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0); > + qemu_sem_init(&mis_current.postcopy_pause_sem_fault, 0); > once = true; > } > return &mis_current; > diff --git a/migration/migration.h b/migration/migration.h > index 047872b..574fedd 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -63,6 +63,7 @@ struct MigrationIncomingState { > > /* notify PAUSED postcopy incoming migrations to try to continue */ > QemuSemaphore postcopy_pause_sem_dst; > + QemuSemaphore postcopy_pause_sem_fault; > }; > > MigrationIncomingState *migration_incoming_get_current(void); > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c > index 9ce391d..ba53155 100644 > --- a/migration/postcopy-ram.c > +++ b/migration/postcopy-ram.c > @@ -418,6 +418,17 @@ static int ram_block_enable_notify(const char > *block_name, void *host_addr, > return 0; > } > > +static bool postcopy_pause_fault_thread(MigrationIncomingState *mis) > +{ > + trace_postcopy_pause_fault_thread(); > + > + qemu_sem_wait(&mis->postcopy_pause_sem_fault); > + > + trace_postcopy_pause_fault_thread_continued(); > + > + return true; > +} > + > /* > * Handle faults detected by the USERFAULT markings > */ > @@ -465,6 +476,22 @@ static void *postcopy_ram_fault_thread(void *opaque) > } > } > > + if (!mis->to_src_file) { > + /* > + * Possibly 
someone tells us that the return path is > + * broken already using the event. We should hold until > + * the channel is rebuilt. > + */ > + if (postcopy_pause_fault_thread(mis)) { > + last_rb = NULL; > + /* Continue to read the userfaultfd */ > + } else { > + error_report("%s: paused but don't allow to continue", > + __func__); > + break; > + } > + } > + > ret = read(mis->userfault_fd, &msg, sizeof(msg)); > if (ret != sizeof(msg)) { > if (errno == EAGAIN) { > @@ -504,18 +531,33 @@ static void *postcopy_ram_fault_thread(void *opaque) > qemu_ram_get_idstr(rb), > rb_offset); > > +retry: > /* > * Send the request to the source - we want to request one > * of our host page sizes (which is >= TPS) > */ > if (rb != last_rb) { > last_rb = rb; > - migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), > - rb_offset, qemu_ram_pagesize(rb)); > + ret = migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), > + rb_offset, > qemu_ram_pagesize(rb)); > } else { > /* Save some space */ > - migrate_send_rp_req_pages(mis, NULL, > - rb_offset, qemu_ram_pagesize(rb)); > + ret = migrate_send_rp_req_pages(mis, NULL, > + rb_offset, > qemu_ram_pagesize(rb)); > + } > + > + if (ret) { > + /* May be network failure, try to wait for recovery */ > + if (ret == -EIO && postcopy_pause_fault_thread(mis)) { > + /* We got reconnected somehow, try to continue */ > + last_rb = NULL; > + goto retry; > + } else { > + /* This is a unavoidable fault */ > + error_report("%s: migrate_send_rp_req_pages() get %d", > + __func__, ret); > + break; > + } > } > } > trace_postcopy_ram_fault_thread_exit(); > diff --git a/migration/savevm.c b/migration/savevm.c > index 1f62268..386788d 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -1974,6 +1974,9 @@ static bool > postcopy_pause_incoming(MigrationIncomingState *mis) > mis->to_src_file = NULL; > qemu_mutex_unlock(&mis->rp_mutex); > > + /* Notify the fault thread for the invalidated file handle */ > + postcopy_fault_thread_notify(mis); > + > while 
(mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { > qemu_sem_wait(&mis->postcopy_pause_sem_dst); > } > diff --git a/migration/trace-events b/migration/trace-events > index a269eec..dbb4971 100644 > --- a/migration/trace-events > +++ b/migration/trace-events > @@ -100,6 +100,8 @@ open_return_path_on_source_continue(void) "" > postcopy_start(void) "" > postcopy_pause_return_path(void) "" > postcopy_pause_return_path_continued(void) "" > +postcopy_pause_fault_thread(void) "" > +postcopy_pause_fault_thread_continued(void) "" > postcopy_pause_continued(void) "" > postcopy_pause_incoming(void) "" > postcopy_pause_incoming_continued(void) "" > -- > 2.7.4 > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK