* zhanghailiang (zhang.zhanghaili...@huawei.com) wrote: > If we modify the VM's RAM (pages) during the setup stage after enabling write-protect > notification in the snapshot thread, the modification action will get stuck > because > we only remove the page's write-protect in the savevm process, so it is blocked by > itself. > > To fix this bug, we will remove the page's write-protect in the fault thread during > the setup stage. Besides, we should not try to take the global lock after the setup > stage, > or there may be a deadlock error.
Hmm this complicates things a bit more doesn't it. What's the order of: a) setup b) savings devices c) Being able to transmit the pages? Are these pages that are being modified during setup, being modified as part of the device state save? Dave > > Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com> > --- > include/migration/migration.h | 4 ++-- > migration/migration.c | 2 +- > migration/postcopy-ram.c | 17 ++++++++++++++++- > migration/ram.c | 37 +++++++++++++++++++++++++++++++------ > 4 files changed, 50 insertions(+), 10 deletions(-) > > diff --git a/include/migration/migration.h b/include/migration/migration.h > index ef4c071..435de31 100644 > --- a/include/migration/migration.h > +++ b/include/migration/migration.h > @@ -127,7 +127,7 @@ struct MigrationSrcPageRequest { > RAMBlock *rb; > hwaddr offset; > hwaddr len; > - > + uint8_t *pages_copy_addr; > QSIMPLEQ_ENTRY(MigrationSrcPageRequest) next_req; > }; > > @@ -333,7 +333,7 @@ void global_state_store_running(void); > > void flush_page_queue(MigrationState *ms); > int ram_save_queue_pages(MigrationState *ms, const char *rbname, > - ram_addr_t start, ram_addr_t len); > + ram_addr_t start, ram_addr_t len, bool copy_pages); > > PostcopyState postcopy_state_get(void); > /* Set the state and return the old state */ > diff --git a/migration/migration.c b/migration/migration.c > index 3765c3b..bf4c7a1 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -1248,7 +1248,7 @@ static void migrate_handle_rp_req_pages(MigrationState > *ms, const char* rbname, > return; > } > > - if (ram_save_queue_pages(ms, rbname, start, len)) { > + if (ram_save_queue_pages(ms, rbname, start, len, false)) { > mark_source_rp_bad(ms); > } > } > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c > index 61392d3..2cf477d 100644 > --- a/migration/postcopy-ram.c > +++ b/migration/postcopy-ram.c > @@ -543,13 +543,28 @@ static void *postcopy_ram_fault_thread(void *opaque) > MigrationState *ms = 
container_of(us, MigrationState, > userfault_state); > ret = ram_save_queue_pages(ms, qemu_ram_get_idstr(rb), rb_offset, > - hostpagesize); > + hostpagesize, true); > > if (ret < 0) { > error_report("%s: Save: %"PRIx64 " failed!", > __func__, (uint64_t)msg.arg.pagefault.address); > break; > } > + > + /* Note: In the setup process, snapshot_thread may modify VM's > + * write-protected pages, we should not block it there, or there > + * will be a deadlock error. > + */ > + if (migration_in_setup(ms)) { > + uint64_t host = msg.arg.pagefault.address; > + > + host &= ~(hostpagesize - 1); > + ret = ram_set_pages_wp(host, getpagesize(), true, > + us->userfault_fd); > + if (ret < 0) { > + error_report("Remove page's write-protect failed"); > + } > + } > } > } > trace_postcopy_ram_fault_thread_exit(); > diff --git a/migration/ram.c b/migration/ram.c > index 8656719..747f9aa 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -233,6 +233,7 @@ struct PageSearchStatus { > ram_addr_t offset; > /* Set once we wrap around */ > bool complete_round; > + uint8_t *pages_copy; > }; > typedef struct PageSearchStatus PageSearchStatus; > > @@ -742,7 +743,12 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus > *pss, > RAMBlock *block = pss->block; > ram_addr_t offset = pss->offset; > > - p = block->host + offset; > + /* If we have a copy of this page, use the backup page first */ > + if (pss->pages_copy) { > + p = pss->pages_copy; > + } else { > + p = block->host + offset; > + } > > /* In doubt sent page as normal */ > bytes_xmit = 0; > @@ -926,7 +932,12 @@ static int ram_save_compressed_page(QEMUFile *f, > PageSearchStatus *pss, > RAMBlock *block = pss->block; > ram_addr_t offset = pss->offset; > > - p = block->host + offset; > + /* If we have a copy of this page, use the backup first */ > + if (pss->pages_copy) { > + p = pss->pages_copy; > + } else { > + p = block->host + offset; > + } > > bytes_xmit = 0; > ret = ram_control_save_page(f, block->offset, > @@ -1043,7 
+1054,7 @@ static bool find_dirty_block(QEMUFile *f, > PageSearchStatus *pss, > * Returns: block (or NULL if none available) > */ > static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, > - ram_addr_t *ram_addr_abs) > + ram_addr_t *ram_addr_abs, uint8_t > **pages_copy_addr) > { > RAMBlock *block = NULL; > > @@ -1055,7 +1066,7 @@ static RAMBlock *unqueue_page(MigrationState *ms, > ram_addr_t *offset, > *offset = entry->offset; > *ram_addr_abs = (entry->offset + entry->rb->offset) & > TARGET_PAGE_MASK; > - > + *pages_copy_addr = entry->pages_copy_addr; > if (entry->len > TARGET_PAGE_SIZE) { > entry->len -= TARGET_PAGE_SIZE; > entry->offset += TARGET_PAGE_SIZE; > @@ -1086,9 +1097,10 @@ static bool get_queued_page(MigrationState *ms, > PageSearchStatus *pss, > RAMBlock *block; > ram_addr_t offset; > bool dirty; > + uint8_t *pages_backup_addr = NULL; > > do { > - block = unqueue_page(ms, &offset, ram_addr_abs); > + block = unqueue_page(ms, &offset, ram_addr_abs, &pages_backup_addr); > /* > * We're sending this page, and since it's postcopy nothing else > * will dirty it, and we must make sure it doesn't get sent again > @@ -1130,6 +1142,7 @@ static bool get_queued_page(MigrationState *ms, > PageSearchStatus *pss, > */ > pss->block = block; > pss->offset = offset; > + pss->pages_copy = pages_backup_addr; > } > > return !!block; > @@ -1166,7 +1179,7 @@ void flush_page_queue(MigrationState *ms) > * Return: 0 on success > */ > int ram_save_queue_pages(MigrationState *ms, const char *rbname, > - ram_addr_t start, ram_addr_t len) > + ram_addr_t start, ram_addr_t len, bool copy_pages) > { > RAMBlock *ramblock; > > @@ -1206,6 +1219,17 @@ int ram_save_queue_pages(MigrationState *ms, const > char *rbname, > new_entry->rb = ramblock; > new_entry->offset = start; > new_entry->len = len; > + if (copy_pages) { > + /* Fix me: Better to realize a memory pool */ > + new_entry->pages_copy_addr = g_try_malloc0(len); > + > + if (!new_entry->pages_copy_addr) { > + 
error_report("%s: Failed to alloc memory", __func__); > + return -1; > + } > + > + memcpy(new_entry->pages_copy_addr, ramblock_ptr(ramblock, start), > len); > + } > > memory_region_ref(ramblock->mr); > qemu_mutex_lock(&ms->src_page_req_mutex); > @@ -1342,6 +1366,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool > last_stage, > pss.block = last_seen_block; > pss.offset = last_offset; > pss.complete_round = false; > + pss.pages_copy = NULL; > > if (!pss.block) { > pss.block = QLIST_FIRST_RCU(&ram_list.blocks); > -- > 1.8.3.1 > > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK