Hao Xiang <hao.xi...@bytedance.com> writes: > On Wed, Feb 21, 2024 at 1:06 PM Fabiano Rosas <faro...@suse.de> wrote: >> >> Hao Xiang <hao.xi...@bytedance.com> writes: >> >> > This change adds a dedicated handler for >> > MigrationOps::ram_save_target_page in >> >> nit: Add a dedicated handler... >> >> Usually "this patch/change" is used only when necessary to avoid >> ambiguity. > > Will do. > >> >> > multifd live migration. Now zero page checking can be done in the multifd >> > threads >> > and this becomes the default configuration. We still provide backward >> > compatibility >> > where zero page checking is done from the migration main thread. >> > >> > Signed-off-by: Hao Xiang <hao.xi...@bytedance.com> >> > --- >> > migration/multifd.c | 1 + >> > migration/options.c | 2 +- >> > migration/ram.c | 53 ++++++++++++++++++++++++++++++++++----------- >> > 3 files changed, 42 insertions(+), 14 deletions(-) >> > >> > diff --git a/migration/multifd.c b/migration/multifd.c >> > index fbb40ea10b..ef5dad1019 100644 >> > --- a/migration/multifd.c >> > +++ b/migration/multifd.c >> > @@ -13,6 +13,7 @@ >> > #include "qemu/osdep.h" >> > #include "qemu/cutils.h" >> >> This include... >> >> > #include "qemu/rcu.h" >> > +#include "qemu/cutils.h" >> >> is there already. >> >> > #include "exec/target_page.h" >> > #include "sysemu/sysemu.h" >> > #include "exec/ramblock.h" >> > diff --git a/migration/options.c b/migration/options.c >> > index 3c603391b0..3c79b6ccd4 100644 >> > --- a/migration/options.c >> > +++ b/migration/options.c >> > @@ -181,7 +181,7 @@ Property migration_properties[] = { >> > MIG_MODE_NORMAL), >> > DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, >> > parameters.zero_page_detection, >> > - ZERO_PAGE_DETECTION_LEGACY), >> > + ZERO_PAGE_DETECTION_MULTIFD), >> >> I think we'll need something to avoid a 9.0 -> 8.2 migration with this >> enabled. Otherwise it will go along happily until we get data corruption >> because the new QEMU didn't send any zero pages on the migration thread >> and the old QEMU did not look for them in the multifd packet. >> >> Perhaps bumping the MULTIFD_VERSION when ZERO_PAGE_DETECTION_MULTIFD is >> in use. We'd just need to fix the test in the new QEMU to check >> (msg.version > MULTIFD_VERSION) instead of (msg.version != MULTIFD_VERSION). >> >> > >> > /* Migration capabilities */ >> > DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), >> > diff --git a/migration/ram.c b/migration/ram.c >> > index 5ece9f042e..b088c5a98c 100644 >> > --- a/migration/ram.c >> > +++ b/migration/ram.c >> > @@ -1123,10 +1123,6 @@ static int save_zero_page(RAMState *rs, >> > PageSearchStatus *pss, >> > QEMUFile *file = pss->pss_channel; >> > int len = 0; >> > >> > - if (migrate_zero_page_detection() != ZERO_PAGE_DETECTION_LEGACY) { >> > - return 0; >> > - } >> >> How does 'none' work now? > > I tested it and all pages are transferred with payload (including the > zero pages). > >> >> > - >> > if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { >> > return 0; >> > } >> > @@ -1256,6 +1252,10 @@ static int ram_save_page(RAMState *rs, >> > PageSearchStatus *pss) >> > >> > static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) >> > { >> > + assert(migrate_multifd()); >> > + assert(!migrate_compress()); >> > + assert(!migration_in_postcopy()); >> >> Drop these, please. Keep only the asserts that are likely to trigger >> during development, such as the existing ones at multifd_send_pages. > > I think I have got enough feedback regarding too many asserts. I will > drop these. assert is not compiled into the free build, correct? >
>From include/qemu/osdep.h: /* * We have a lot of unaudited code that may fail in strange ways, or * even be a security risk during migration, if you disable assertions * at compile-time. You may comment out these safety checks if you * absolutely want to disable assertion overhead, but it is not * supported upstream so the risk is all yours. Meanwhile, please * submit patches to remove any side-effects inside an assertion, or * fixing error handling that should use Error instead of assert. */ #ifdef NDEBUG #error building with NDEBUG is not supported #endif #ifdef G_DISABLE_ASSERT #error building with G_DISABLE_ASSERT is not supported #endif >> >> > + >> > if (!multifd_queue_page(block, offset)) { >> > return -1; >> > } >> > @@ -2046,7 +2046,6 @@ static bool save_compress_page(RAMState *rs, >> > PageSearchStatus *pss, >> > */ >> > static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus >> > *pss) >> > { >> > - RAMBlock *block = pss->block; >> > ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; >> > int res; >> > >> > @@ -2062,17 +2061,40 @@ static int ram_save_target_page_legacy(RAMState >> > *rs, PageSearchStatus *pss) >> > return 1; >> > } >> > >> > + return ram_save_page(rs, pss); >> >> Look at where git put this! Are you using the default diff algorithm? If >> not try using --patience to see if it improves the diff. > > I used the default diff algorithm. > >> >> > +} >> > + >> > +/** >> > + * ram_save_target_page_multifd: save one target page >> > + * >> > + * Returns the number of pages written >> >> We could be more precise here: >> >> ram_save_target_page_multifd: send one target page to multifd workers >> >> Returns 1 if the page was queued, -1 otherwise. > > Will do. > >> >> > + * >> > + * @rs: current RAM state >> > + * @pss: data about the page we want to send >> > + */ >> > +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus >> > *pss) >> > +{ >> > + RAMBlock *block = pss->block; >> > + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; >> > + >> > + /* Multifd is not compatible with old compression. */ >> > + assert(!migrate_compress()); >> >> This should already be enforced at options.c. >> >> > + >> > + /* Multifd is not compabible with postcopy. */ >> > + assert(!migration_in_postcopy()); >> >> Same here. >> >> > + >> > /* >> > - * Do not use multifd in postcopy as one whole host page should be >> > - * placed. Meanwhile postcopy requires atomic update of pages, so >> > even >> > - * if host page size == guest page size the dest guest during run may >> > - * still see partially copied pages which is data corruption. >> > + * Backward compatibility support. While using multifd live >> > + * migration, we still need to handle zero page checking on the >> > + * migration main thread. >> > */ >> > - if (migrate_multifd() && !migration_in_postcopy()) { >> > - return ram_save_multifd_page(block, offset); >> > + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { >> > + if (save_zero_page(rs, pss, offset)) { >> > + return 1; >> > + } >> > } >> > >> > - return ram_save_page(rs, pss); >> > + return ram_save_multifd_page(block, offset); >> > } >> > >> > /* Should be called before sending a host page */ >> > @@ -2984,7 +3006,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) >> > } >> > >> > migration_ops = g_malloc0(sizeof(MigrationOps)); >> > - migration_ops->ram_save_target_page = ram_save_target_page_legacy; >> > + >> > + if (migrate_multifd()) { >> > + migration_ops->ram_save_target_page = >> > ram_save_target_page_multifd; >> > + } else { >> > + migration_ops->ram_save_target_page = ram_save_target_page_legacy; >> > + } >> > >> > bql_unlock(); >> > ret = multifd_send_sync_main();