On 24.07.2020 01:15, Peter Xu wrote:
On Wed, Jul 22, 2020 at 11:11:32AM +0300, Denis Plotnikov wrote:
+static void *background_snapshot_thread(void *opaque)
+{
+ MigrationState *m = opaque;
+ QIOChannelBuffer *bioc;
+ QEMUFile *fb;
+ int res = 0;
+
+ rcu_register_thread();
+
+ qemu_file_set_rate_limit(m->to_dst_file, INT64_MAX);
+
+ qemu_mutex_lock_iothread();
+ vm_stop(RUN_STATE_PAUSED);
+
+ qemu_savevm_state_header(m->to_dst_file);
+ qemu_mutex_unlock_iothread();
+ qemu_savevm_state_setup(m->to_dst_file);
Is it intended to skip bql for the setup phase? IIUC the main thread could
start the vm before we take the lock again below if we released it...
Good point!
+ qemu_mutex_lock_iothread();
+
+ migrate_set_state(&m->state, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_ACTIVE);
+
+ /*
+ * We want to save the vm state for the moment when the snapshot saving was
+ * called but also we want to write RAM content with vm running. The RAM
+ * content should appear first in the vmstate.
+ * So, we first save the non-ram part of the vmstate to a temporary buffer,
+ * then write ram part of the vmstate to the migration stream with vCPUs
+ * running and, finally, write the non-ram part of the vmstate from the
+ * buffer to the migration stream.
+ */
+ bioc = qio_channel_buffer_new(4096);
+ qio_channel_set_name(QIO_CHANNEL(bioc), "vmstate-buffer");
+ fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
+ object_unref(OBJECT(bioc));
+
+ if (ram_write_tracking_start()) {
+ goto failed_resume;
+ }
+
+ if (global_state_store()) {
+ goto failed_resume;
+ }
Is this needed? We should be always in stopped state here, right?
Yes, seems it isn't needed
+
+ cpu_synchronize_all_states();
+
+ if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
+ goto failed_resume;
+ }
+
+ vm_start();
+ qemu_mutex_unlock_iothread();
+
+ while (!res) {
+ res = qemu_savevm_state_iterate(m->to_dst_file, false);
+
+ if (res < 0 || qemu_file_get_error(m->to_dst_file)) {
+ goto failed;
+ }
+ }
+
+ /*
+ * By this moment we have RAM content saved into the migration stream.
+ * The next step is to flush the non-ram content (vm devices state)
+ * right after the ram content. The device state was stored in
+ * the temporary buffer prior to the ram saving.
+ */
+ qemu_put_buffer(m->to_dst_file, bioc->data, bioc->usage);
+ qemu_fflush(m->to_dst_file);
+
+ if (qemu_file_get_error(m->to_dst_file)) {
+ goto failed;
+ }
+
+ migrate_set_state(&m->state, MIGRATION_STATUS_ACTIVE,
+ MIGRATION_STATUS_COMPLETED);
+ goto exit;
+
+failed_resume:
+ vm_start();
+ qemu_mutex_unlock_iothread();
+failed:
+ migrate_set_state(&m->state, MIGRATION_STATUS_ACTIVE,
+ MIGRATION_STATUS_FAILED);
+exit:
+ ram_write_tracking_stop();
+ qemu_fclose(fb);
+ qemu_mutex_lock_iothread();
+ qemu_savevm_state_cleanup();
+ qemu_mutex_unlock_iothread();
+ rcu_unregister_thread();
+ return NULL;
+}
+
void migrate_fd_connect(MigrationState *s, Error *error_in)
{
Error *local_err = NULL;
@@ -3599,8 +3694,14 @@ void migrate_fd_connect(MigrationState *s, Error
*error_in)
migrate_fd_cleanup(s);
return;
}
- qemu_thread_create(&s->thread, "live_migration", migration_thread, s,
- QEMU_THREAD_JOINABLE);
+ if (migrate_background_snapshot()) {
+ qemu_thread_create(&s->thread, "bg_snapshot",
Maybe the name "live_snapshot" suits better (since the other one is
"live_migration")?
looks like it; another good name is async_snapshot, and all the related
functions and properties should be renamed accordingly
+ background_snapshot_thread, s,
+ QEMU_THREAD_JOINABLE);
+ } else {
+ qemu_thread_create(&s->thread, "live_migration", migration_thread, s,
+ QEMU_THREAD_JOINABLE);
+ }
s->migration_thread_running = true;
}
[...]
@@ -1151,9 +1188,11 @@ static int save_normal_page(RAMState *rs, RAMBlock
*block, ram_addr_t offset,
ram_counters.transferred += save_page_header(rs, rs->f, block,
offset | RAM_SAVE_FLAG_PAGE);
if (async) {
- qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
- migrate_release_ram() &
- migration_in_postcopy());
+ bool may_free = migrate_background_snapshot() ||
+ (migrate_release_ram() &&
+ migration_in_postcopy());
Does background snapshot need to free the memory? /me confused..
Yes, for the page copies. No, for the rest of the pages.
+
+ qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, may_free);
} else {
qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
}
[...]
+void ram_block_list_create(void)
+{
+ RAMBlock *block = NULL;
+ RamBlockList *block_list = ram_bgs_block_list_get();
+
+ qemu_mutex_lock_ramlist();
+ RAMBLOCK_FOREACH_MIGRATABLE(block) {
+ memory_region_ref(block->mr);
+ QLIST_INSERT_HEAD(block_list, block, bgs_next);
+ }
+ qemu_mutex_unlock_ramlist();
This kind of duplicates ram_list.blocks itself...
+}
+
+static int page_fault_fd;
+static int thread_quit_fd;
+static QemuThread page_fault_thread;
+
+static int mem_change_wp(void *addr, uint64_t length, bool protect)
+{
+ struct uffdio_writeprotect wp = { 0 };
+
+ assert(page_fault_fd);
+
+ if (protect) {
+ struct uffdio_register reg = { 0 };
+
+ reg.mode = UFFDIO_REGISTER_MODE_WP;
+ reg.range.start = (uint64_t) addr;
+ reg.range.len = length;
+
+ if (ioctl(page_fault_fd, UFFDIO_REGISTER, &reg)) {
+ error_report("Can't register memory at %p len: %"PRIu64
+ " for page fault interception", addr, length);
+ return -1;
+ }
IMHO it's better to move the register out of mem_change_wp(). mem_change_wp()
should be in page granularity, while we should be clear in the code that the
registration is happening per-ramblock.
so, will move it
Btw, is UFFDIO_UNREGISTER missing in the whole process?
yeah
+
+ wp.mode = UFFDIO_WRITEPROTECT_MODE_WP;
+ }
[...]
@@ -2338,6 +2881,11 @@ static void ram_list_init_bitmaps(void)
bitmap_set(block->bmap, 0, pages);
block->clear_bmap_shift = shift;
block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
+
+ if (migrate_background_snapshot()) {
+ block->touched_map = bitmap_new(pages);
+ block->copied_map = bitmap_new(pages);
+ }
We should be able to avoid allocating bmap & clear_bmap for snapshots. Or we
can also directly reuse the two bitmaps?
Probably, yes