On 2/10/23 18:11, Daniel P. Berrangé wrote: > On Fri, Oct 28, 2022 at 01:39:10PM +0300, Nikolay Borisov wrote: >> Implement 'fixed-ram' feature. The core of the feature is to ensure that >> each ram page of the migration stream has a specific offset in the >> resulting migration stream. The reason why we'd want such behavior is >> twofold: >> >> - When doing a 'fixed-ram' migration the resulting file will have a >> bounded size, since pages which are dirtied multiple times will >> always go to a fixed location in the file, rather than constantly >> being added to a sequential stream. This eliminates cases where a vm >> with, say, 1g of ram can result in a migration file that's 10s of >> Gbs, provided that the workload constantly redirties memory. >> >> - It paves the way to implement DIO-enabled save/restore of the >> migration stream as the pages are ensured to be written at aligned >> offsets. >> >> The feature requires changing the format. First, a bitmap is introduced >> which tracks which pages have been written (i.e. are dirtied) during >> migration and subsequently it's being written in the resulting file, >> again at a fixed location for every ramblock. Zero pages are ignored as >> they'd be zero in the destination migration as well. With the changed >> format data would look like the following: >> >> |name len|name|used_len|pc*|bitmap_size|pages_offset|bitmap|pages| >> >> * pc - refers to the page_size/mr->addr members, so newly added members >> begin from "bitmap_size". >> >> This layout is initialized during ram_save_setup so instead of having a >> sequential stream of pages that follow the ramblock headers the dirty >> pages for a ramblock follow its header. Since all pages have a fixed >> location RAM_SAVE_FLAG_EOS is no longer generated on every migration >> iteration but there is effectively a single RAM_SAVE_FLAG_EOS right at >> the end. 
>> >> Signed-off-by: Nikolay Borisov <nbori...@suse.com> >> --- >> include/exec/ramblock.h | 7 +++ >> migration/migration.c | 51 +++++++++++++++++++++- >> migration/migration.h | 1 + >> migration/ram.c | 97 +++++++++++++++++++++++++++++++++-------- >> migration/savevm.c | 1 + >> qapi/migration.json | 2 +- >> 6 files changed, 138 insertions(+), 21 deletions(-) > > This patch probably ought to extends the docs/devel/migration.rst > file. Specifically the text following the 'Stream structure' > heading. > >> >> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h >> index 6cbedf9e0c9a..30216c1a41d3 100644 >> --- a/include/exec/ramblock.h >> +++ b/include/exec/ramblock.h >> @@ -43,6 +43,13 @@ struct RAMBlock { >> size_t page_size; >> /* dirty bitmap used during migration */ >> unsigned long *bmap; >> + /* shadow dirty bitmap used when migrating to a file */ >> + unsigned long *shadow_bmap; >> + /* offset in the file pages belonging to this ramblock are saved, used >> + * only during migration to a file >> + */ >> + off_t bitmap_offset; >> + uint64_t pages_offset; >> /* bitmap of already received pages in postcopy */ >> unsigned long *receivedmap; >> >> diff --git a/migration/migration.c b/migration/migration.c >> index 11ceea340702..c7383845a5b4 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -165,7 +165,8 @@ >> INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, >> MIGRATION_CAPABILITY_XBZRLE, >> MIGRATION_CAPABILITY_X_COLO, >> MIGRATION_CAPABILITY_VALIDATE_UUID, >> - MIGRATION_CAPABILITY_ZERO_COPY_SEND); >> + MIGRATION_CAPABILITY_ZERO_COPY_SEND, >> + MIGRATION_CAPABILITY_FIXED_RAM); >> >> /* When we add fault tolerance, we could have several >> migrations at once. 
For now we don't need to add >> @@ -1326,6 +1327,27 @@ static bool migrate_caps_check(bool *cap_list, >> } >> #endif >> >> + if (cap_list[MIGRATION_CAPABILITY_FIXED_RAM]) { >> + if (cap_list[MIGRATION_CAPABILITY_MULTIFD]) { >> + error_setg(errp, "Directly mapped memory incompatible with >> multifd"); >> + return false; >> + } >> + >> + if (cap_list[MIGRATION_CAPABILITY_XBZRLE]) { >> + error_setg(errp, "Directly mapped memory incompatible with >> xbzrle"); >> + return false; >> + } >> + >> + if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) { >> + error_setg(errp, "Directly mapped memory incompatible with >> compression"); >> + return false; >> + } >> + >> + if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { >> + error_setg(errp, "Directly mapped memory incompatible with >> postcopy ram"); >> + return false; >> + } >> + } >> >> /* incoming side only */ >> if (runstate_check(RUN_STATE_INMIGRATE) && >> @@ -2630,6 +2652,11 @@ MultiFDCompression migrate_multifd_compression(void) >> return s->parameters.multifd_compression; >> } >> >> +int migrate_fixed_ram(void) >> +{ >> + return >> migrate_get_current()->enabled_capabilities[MIGRATION_CAPABILITY_FIXED_RAM]; >> +} >> + >> int migrate_multifd_zlib_level(void) >> { >> MigrationState *s; >> @@ -4190,6 +4217,21 @@ static void *bg_migration_thread(void *opaque) >> return NULL; >> } >> >> +static int >> +migrate_check_fixed_ram(MigrationState *s, Error **errp) >> +{ >> + if (!s->enabled_capabilities[MIGRATION_CAPABILITY_FIXED_RAM]) >> + return 0; >> + >> + if (!qemu_file_is_seekable(s->to_dst_file)) { >> + error_setg(errp, "Directly mapped memory requires a seekable >> transport"); >> + return -1; >> + } >> + >> + return 0; >> +} >> + >> + >> void migrate_fd_connect(MigrationState *s, Error *error_in) >> { >> Error *local_err = NULL; >> @@ -4265,6 +4307,12 @@ void migrate_fd_connect(MigrationState *s, Error >> *error_in) >> return; >> } >> >> + if (migrate_check_fixed_ram(s, &local_err) < 0) { >> + migrate_fd_cleanup(s); >> + 
migrate_fd_error(s, local_err); >> + return; >> + } >> + >> if (resume) { >> /* Wakeup the main migration thread to do the recovery */ >> migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED, >> @@ -4398,6 +4446,7 @@ static Property migration_properties[] = { >> DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz), >> >> /* Migration capabilities */ >> + DEFINE_PROP_MIG_CAP("x-fixed-ram", MIGRATION_CAPABILITY_FIXED_RAM), >> DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), >> DEFINE_PROP_MIG_CAP("x-rdma-pin-all", >> MIGRATION_CAPABILITY_RDMA_PIN_ALL), >> DEFINE_PROP_MIG_CAP("x-auto-converge", >> MIGRATION_CAPABILITY_AUTO_CONVERGE), >> diff --git a/migration/migration.h b/migration/migration.h >> index 96f27aba2210..9aab1b16f407 100644 >> --- a/migration/migration.h >> +++ b/migration/migration.h >> @@ -410,6 +410,7 @@ bool migrate_zero_blocks(void); >> bool migrate_dirty_bitmaps(void); >> bool migrate_ignore_shared(void); >> bool migrate_validate_uuid(void); >> +int migrate_fixed_ram(void); >> >> bool migrate_auto_converge(void); >> bool migrate_use_multifd(void); >> diff --git a/migration/ram.c b/migration/ram.c >> index dc1de9ddbc68..4f5ddaff356b 100644 >> --- a/migration/ram.c >> +++ b/migration/ram.c >> @@ -1261,9 +1261,14 @@ static int save_zero_page_to_file(RAMState *rs, >> QEMUFile *file, >> int len = 0; >> >> if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { >> - len += save_page_header(rs, file, block, offset | >> RAM_SAVE_FLAG_ZERO); >> - qemu_put_byte(file, 0); >> - len += 1; >> + if (migrate_fixed_ram()) { >> + /* for zero pages we don't need to do anything */ >> + len = 1; >> + } else { >> + len += save_page_header(rs, file, block, offset | >> RAM_SAVE_FLAG_ZERO); >> + qemu_put_byte(file, 0); >> + len += 1; >> + } >> ram_release_page(block->idstr, offset); >> } >> return len; >> @@ -1342,15 +1347,22 @@ static bool control_save_page(RAMState *rs, RAMBlock >> *block, ram_addr_t offset, >> static int 
save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t >> offset, >> uint8_t *buf, bool async) >> { >> - ram_transferred_add(save_page_header(rs, rs->f, block, >> - offset | RAM_SAVE_FLAG_PAGE)); >> - if (async) { >> - qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, >> - migrate_release_ram() && >> - migration_in_postcopy()); >> - } else { >> - qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); >> - } >> + >> + if (migrate_fixed_ram()) { >> + qemu_put_buffer_at(rs->f, buf, TARGET_PAGE_SIZE, >> + block->pages_offset + offset); >> + set_bit(offset >> TARGET_PAGE_BITS, block->shadow_bmap); >> + } else { >> + ram_transferred_add(save_page_header(rs, rs->f, block, >> + offset | >> RAM_SAVE_FLAG_PAGE)); >> + if (async) { >> + qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, >> + migrate_release_ram() && >> + migration_in_postcopy()); >> + } else { >> + qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); >> + } >> + } >> ram_transferred_add(TARGET_PAGE_SIZE); >> ram_counters.normal++; >> return 1; >> @@ -2683,6 +2695,8 @@ static void ram_save_cleanup(void *opaque) >> block->clear_bmap = NULL; >> g_free(block->bmap); >> block->bmap = NULL; >> + g_free(block->shadow_bmap); >> + block->shadow_bmap = NULL; >> } >> >> xbzrle_cleanup(); >> @@ -3044,6 +3058,7 @@ static void ram_list_init_bitmaps(void) >> */ >> block->bmap = bitmap_new(pages); >> bitmap_set(block->bmap, 0, pages); >> + block->shadow_bmap = bitmap_new(block->used_length >> >> TARGET_PAGE_BITS); >> block->clear_bmap_shift = shift; >> block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); >> } >> @@ -3226,12 +3241,34 @@ static int ram_save_setup(QEMUFile *f, void *opaque) >> qemu_put_buffer(f, (uint8_t *)block->idstr, >> strlen(block->idstr)); >> qemu_put_be64(f, block->used_length); >> if (migrate_postcopy_ram() && block->page_size != >> - qemu_host_page_size) { >> + qemu_host_page_size) { >> qemu_put_be64(f, block->page_size); >> } >> if (migrate_ignore_shared()) { >> qemu_put_be64(f, block->mr->addr); 
>> } >> + >> + if (migrate_fixed_ram()) { >> + long num_pages = block->used_length >> TARGET_PAGE_BITS; >> + long bitmap_size = BITS_TO_LONGS(num_pages) * >> sizeof(unsigned long); >> + >> + >> + /* Needed for external programs (think >> analyze-migration.py) */ >> + qemu_put_be32(f, bitmap_size); >> + >> + /* >> + * Make pages offset aligned to TARGET_PAGE_SIZE to enable >> + * DIO in the future. Also add 8 to account for the page >> offset >> + * itself >> + */ >> + block->bitmap_offset = qemu_get_offset(f) + 8; >> + block->pages_offset = ROUND_UP(block->bitmap_offset + >> + bitmap_size, >> TARGET_PAGE_SIZE); > > I'm not sure that rounding to TARGET_PAGE_SIZE is sufficient. > > If we've built QEMU for a 32-bit i686 target, but are running on a 64-bit > kernel, is TARGET_PAGE_SIZE going to offer sufficient alignment to keep > the kernel happy? > > Also O_DIRECT has alignment constraints beyond simply the page size. The > page size alignment is most important for the buffers being passed > back and forth. The on-disk alignment constraint is actually defined by > the filesystem and storage it is on, and on-disk alignment is what > matters when we decide on pages_offset.
For comparison, in the libvirt iohelper the alignment of DIRECT I/O-friendly transfers is 64K, as per libvirt commit: commit 12291656b135ed788e41dadbd2d15e613ddea9b5 Author: Eric Blake <ebl...@redhat.com> Date: Tue Jul 12 08:35:05 2011 -0600 save: let iohelper work on O_DIRECT fds Required for a coming patch where iohelper will operate on O_DIRECT fds. There, the user-space memory must be aligned to file system boundaries (at least 512, but using page-aligned works better, and some file systems prefer 64k). Made tougher by the fact that VIR_ALLOC won't work on void *, but posix_memalign won't work on char * and isn't available everywhere. This patch makes some simplifying assumptions - namely, output to an O_DIRECT fd will only be attempted on an empty seekable file (hence, no need to worry about preserving existing data on a partial block, and ftruncate will work to undo the effects of having to round up the size of the last block written), and input from an O_DIRECT fd will only be attempted on a complete seekable file with the only possible short read at EOF. * configure.ac (AC_CHECK_FUNCS_ONCE): Check for posix_memalign. * src/util/iohelper.c (runIO): Use aligned memory, and handle quirks of O_DIRECT on last write. > > If we want to allow saved state to be copied across different filesystems/ > hosts, we need to consider the worst case alignment that would exist on > any conceivable host deployment now or in the future. > > Given that RAM sizes are measured 1000's of MBs in any scenario where > we care about save/restore speed enough to use the fixed-ram feature, > we can afford to waste a little space. > > IOW, we could round pages_offset upto the nearest 1 MB boundary, > which ought to be well enough aligned to cope with any constraint > we might imagine ? Or possibly even align to 4 MB, which is a common > size for small-ish huge pages. 
> > >> + qemu_put_be64(f, block->pages_offset); >> + >> + /* Now prepare offset for next ramblock */ >> + qemu_set_offset(f, block->pages_offset + >> block->used_length, SEEK_SET); >> + } >> } >> } >> >> @@ -3249,6 +3286,17 @@ static int ram_save_setup(QEMUFile *f, void *opaque) >> return 0; >> } >> >> +static void ram_save_shadow_bmap(QEMUFile *f) >> +{ >> + RAMBlock *block; >> + >> + RAMBLOCK_FOREACH_MIGRATABLE(block) { >> + long num_pages = block->used_length >> TARGET_PAGE_BITS; >> + long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); >> + qemu_put_buffer_at(f, (uint8_t *)block->shadow_bmap, bitmap_size, >> block->bitmap_offset); >> + } >> +} >> + >> /** >> * ram_save_iterate: iterative stage for migration >> * >> @@ -3358,9 +3406,15 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) >> return ret; >> } >> >> - qemu_put_be64(f, RAM_SAVE_FLAG_EOS); >> - qemu_fflush(f); >> - ram_transferred_add(8); >> + /* >> + * For fixed ram we don't want to pollute the migration stream with >> + * EOS flags. 
>> + */ >> + if (!migrate_fixed_ram()) { >> + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); >> + qemu_fflush(f); >> + ram_transferred_add(8); >> + } >> >> ret = qemu_file_get_error(f); >> } >> @@ -3405,7 +3459,10 @@ static int ram_save_complete(QEMUFile *f, void >> *opaque) >> pages = ram_find_and_save_block(rs); >> /* no more blocks to sent */ >> if (pages == 0) { >> - break; >> + if (migrate_fixed_ram()) { >> + ram_save_shadow_bmap(f); >> + } >> + break; >> } >> if (pages < 0) { >> ret = pages; >> @@ -3428,8 +3485,10 @@ static int ram_save_complete(QEMUFile *f, void >> *opaque) >> return ret; >> } >> >> - qemu_put_be64(f, RAM_SAVE_FLAG_EOS); >> - qemu_fflush(f); >> + if (!migrate_fixed_ram()) { >> + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); >> + qemu_fflush(f); >> + } >> >> return 0; >> } >> diff --git a/migration/savevm.c b/migration/savevm.c >> index 44a222888306..847a8bdfb6ce 100644 >> --- a/migration/savevm.c >> +++ b/migration/savevm.c >> @@ -240,6 +240,7 @@ static bool should_validate_capability(int capability) >> /* Validate only new capabilities to keep compatibility. */ >> switch (capability) { >> case MIGRATION_CAPABILITY_X_IGNORE_SHARED: >> + case MIGRATION_CAPABILITY_FIXED_RAM: >> return true; >> default: >> return false; >> diff --git a/qapi/migration.json b/qapi/migration.json >> index 88ecf86ac876..6196617171e8 100644 >> --- a/qapi/migration.json >> +++ b/qapi/migration.json >> @@ -485,7 +485,7 @@ >> ## >> { 'enum': 'MigrationCapability', >> 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', >> - 'compress', 'events', 'postcopy-ram', >> + 'compress', 'events', 'postcopy-ram', 'fixed-ram', >> { 'name': 'x-colo', 'features': [ 'unstable' ] }, >> 'release-ram', >> 'block', 'return-path', 'pause-before-switchover', 'multifd', >> -- >> 2.34.1 >> > > With regards, > Daniel Thanks, Claudio