On 2/10/23 18:11, Daniel P. Berrangé wrote: > On Fri, Oct 28, 2022 at 01:39:10PM +0300, Nikolay Borisov wrote: >> Implement 'fixed-ram' feature. The core of the feature is to ensure that >> each ram page of the migration stream has a specific offset in the >> resulting migration stream. The reason why we'd want such behavior is >> twofold: >> >> - When doing a 'fixed-ram' migration the resulting file will have a >> bounded size, since pages which are dirtied multiple times will >> always go to a fixed location in the file, rather than constantly >> being added to a sequential stream. This eliminates cases where a vm >> with, say, 1g of ram can result in a migration file that's 10s of >> Gbs, provided that the workload constantly redirties memory. >> >> - It paves the way to implement DIO-enabled save/restore of the >> migration stream as the pages are ensured to be written at aligned >> offsets. >> >> The feature requires changing the format. First, a bitmap is introduced >> which tracks which pages have been written (i.e. are dirtied) during >> migration and subsequently it's being written in the resulting file, >> again at a fixed location for every ramblock. Zero pages are ignored as >> they'd be zero in the destination migration as well. With the changed >> format data would look like the following: >> >> |name len|name|used_len|pc*|bitmap_size|pages_offset|bitmap|pages| >> >> * pc - refers to the page_size/mr->addr members, so newly added members >> begin from "bitmap_size". >> >> This layout is initialized during ram_save_setup so instead of having a >> sequential stream of pages that follow the ramblock headers the dirty >> pages for a ramblock follow its header. Since all pages have a fixed >> location RAM_SAVE_FLAG_EOS is no longer generated on every migration >> iteration but there is effectively a single RAM_SAVE_FLAG_EOS right at >> the end. 
>> >> Signed-off-by: Nikolay Borisov <nbori...@suse.com> >> --- >> include/exec/ramblock.h | 7 +++ >> migration/migration.c | 51 +++++++++++++++++++++- >> migration/migration.h | 1 + >> migration/ram.c | 97 +++++++++++++++++++++++++++++++++-------- >> migration/savevm.c | 1 + >> qapi/migration.json | 2 +- >> 6 files changed, 138 insertions(+), 21 deletions(-) > > This patch probably ought to extends the docs/devel/migration.rst > file. Specifically the text following the 'Stream structure' > heading. > >> >> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h >> index 6cbedf9e0c9a..30216c1a41d3 100644 >> --- a/include/exec/ramblock.h >> +++ b/include/exec/ramblock.h >> @@ -43,6 +43,13 @@ struct RAMBlock { >> size_t page_size; >> /* dirty bitmap used during migration */ >> unsigned long *bmap; >> + /* shadow dirty bitmap used when migrating to a file */ >> + unsigned long *shadow_bmap; >> + /* offset in the file pages belonging to this ramblock are saved, used >> + * only during migration to a file >> + */ >> + off_t bitmap_offset; >> + uint64_t pages_offset; >> /* bitmap of already received pages in postcopy */ >> unsigned long *receivedmap; >> >> diff --git a/migration/migration.c b/migration/migration.c >> index 11ceea340702..c7383845a5b4 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -165,7 +165,8 @@ >> INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot, >> MIGRATION_CAPABILITY_XBZRLE, >> MIGRATION_CAPABILITY_X_COLO, >> MIGRATION_CAPABILITY_VALIDATE_UUID, >> - MIGRATION_CAPABILITY_ZERO_COPY_SEND); >> + MIGRATION_CAPABILITY_ZERO_COPY_SEND, >> + MIGRATION_CAPABILITY_FIXED_RAM); >> >> /* When we add fault tolerance, we could have several >> migrations at once. 
For now we don't need to add >> @@ -1326,6 +1327,27 @@ static bool migrate_caps_check(bool *cap_list, >> } >> #endif >> >> + if (cap_list[MIGRATION_CAPABILITY_FIXED_RAM]) { >> + if (cap_list[MIGRATION_CAPABILITY_MULTIFD]) { >> + error_setg(errp, "Directly mapped memory incompatible with >> multifd"); >> + return false; >> + } >> + >> + if (cap_list[MIGRATION_CAPABILITY_XBZRLE]) { >> + error_setg(errp, "Directly mapped memory incompatible with >> xbzrle"); >> + return false; >> + } >> + >> + if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) { >> + error_setg(errp, "Directly mapped memory incompatible with >> compression"); >> + return false; >> + } >> + >> + if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) { >> + error_setg(errp, "Directly mapped memory incompatible with >> postcopy ram"); >> + return false; >> + } >> + } >> >> /* incoming side only */ >> if (runstate_check(RUN_STATE_INMIGRATE) && >> @@ -2630,6 +2652,11 @@ MultiFDCompression migrate_multifd_compression(void) >> return s->parameters.multifd_compression; >> } >> >> +int migrate_fixed_ram(void) >> +{ >> + return >> migrate_get_current()->enabled_capabilities[MIGRATION_CAPABILITY_FIXED_RAM]; >> +} >> + >> int migrate_multifd_zlib_level(void) >> { >> MigrationState *s; >> @@ -4190,6 +4217,21 @@ static void *bg_migration_thread(void *opaque) >> return NULL; >> } >> >> +static int >> +migrate_check_fixed_ram(MigrationState *s, Error **errp) >> +{ >> + if (!s->enabled_capabilities[MIGRATION_CAPABILITY_FIXED_RAM]) >> + return 0; >> + >> + if (!qemu_file_is_seekable(s->to_dst_file)) { >> + error_setg(errp, "Directly mapped memory requires a seekable >> transport"); >> + return -1; >> + } >> + >> + return 0; >> +} >> + >> + >> void migrate_fd_connect(MigrationState *s, Error *error_in) >> { >> Error *local_err = NULL; >> @@ -4265,6 +4307,12 @@ void migrate_fd_connect(MigrationState *s, Error >> *error_in) >> return; >> } >> >> + if (migrate_check_fixed_ram(s, &local_err) < 0) { >> + migrate_fd_cleanup(s); >> + 
migrate_fd_error(s, local_err); >> + return; >> + } >> + >> if (resume) { >> /* Wakeup the main migration thread to do the recovery */ >> migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED, >> @@ -4398,6 +4446,7 @@ static Property migration_properties[] = { >> DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz), >> >> /* Migration capabilities */ >> + DEFINE_PROP_MIG_CAP("x-fixed-ram", MIGRATION_CAPABILITY_FIXED_RAM), >> DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE), >> DEFINE_PROP_MIG_CAP("x-rdma-pin-all", >> MIGRATION_CAPABILITY_RDMA_PIN_ALL), >> DEFINE_PROP_MIG_CAP("x-auto-converge", >> MIGRATION_CAPABILITY_AUTO_CONVERGE), >> diff --git a/migration/migration.h b/migration/migration.h >> index 96f27aba2210..9aab1b16f407 100644 >> --- a/migration/migration.h >> +++ b/migration/migration.h >> @@ -410,6 +410,7 @@ bool migrate_zero_blocks(void); >> bool migrate_dirty_bitmaps(void); >> bool migrate_ignore_shared(void); >> bool migrate_validate_uuid(void); >> +int migrate_fixed_ram(void); >> >> bool migrate_auto_converge(void); >> bool migrate_use_multifd(void); >> diff --git a/migration/ram.c b/migration/ram.c >> index dc1de9ddbc68..4f5ddaff356b 100644 >> --- a/migration/ram.c >> +++ b/migration/ram.c >> @@ -1261,9 +1261,14 @@ static int save_zero_page_to_file(RAMState *rs, >> QEMUFile *file, >> int len = 0; >> >> if (buffer_is_zero(p, TARGET_PAGE_SIZE)) { >> - len += save_page_header(rs, file, block, offset | >> RAM_SAVE_FLAG_ZERO); >> - qemu_put_byte(file, 0); >> - len += 1; >> + if (migrate_fixed_ram()) { >> + /* for zero pages we don't need to do anything */ >> + len = 1; >> + } else { >> + len += save_page_header(rs, file, block, offset | >> RAM_SAVE_FLAG_ZERO); >> + qemu_put_byte(file, 0); >> + len += 1; >> + } >> ram_release_page(block->idstr, offset); >> } >> return len; >> @@ -1342,15 +1347,22 @@ static bool control_save_page(RAMState *rs, RAMBlock >> *block, ram_addr_t offset, >> static int 
save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t >> offset, >> uint8_t *buf, bool async) >> { >> - ram_transferred_add(save_page_header(rs, rs->f, block, >> - offset | RAM_SAVE_FLAG_PAGE)); >> - if (async) { >> - qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, >> - migrate_release_ram() && >> - migration_in_postcopy()); >> - } else { >> - qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); >> - } >> + >> + if (migrate_fixed_ram()) { >> + qemu_put_buffer_at(rs->f, buf, TARGET_PAGE_SIZE, >> + block->pages_offset + offset); >> + set_bit(offset >> TARGET_PAGE_BITS, block->shadow_bmap); >> + } else { >> + ram_transferred_add(save_page_header(rs, rs->f, block, >> + offset | >> RAM_SAVE_FLAG_PAGE)); >> + if (async) { >> + qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE, >> + migrate_release_ram() && >> + migration_in_postcopy()); >> + } else { >> + qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE); >> + } >> + } >> ram_transferred_add(TARGET_PAGE_SIZE); >> ram_counters.normal++; >> return 1; >> @@ -2683,6 +2695,8 @@ static void ram_save_cleanup(void *opaque) >> block->clear_bmap = NULL; >> g_free(block->bmap); >> block->bmap = NULL; >> + g_free(block->shadow_bmap); >> + block->shadow_bmap = NULL; >> } >> >> xbzrle_cleanup(); >> @@ -3044,6 +3058,7 @@ static void ram_list_init_bitmaps(void) >> */ >> block->bmap = bitmap_new(pages); >> bitmap_set(block->bmap, 0, pages); >> + block->shadow_bmap = bitmap_new(block->used_length >> >> TARGET_PAGE_BITS); >> block->clear_bmap_shift = shift; >> block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift)); >> } >> @@ -3226,12 +3241,34 @@ static int ram_save_setup(QEMUFile *f, void *opaque) >> qemu_put_buffer(f, (uint8_t *)block->idstr, >> strlen(block->idstr)); >> qemu_put_be64(f, block->used_length); >> if (migrate_postcopy_ram() && block->page_size != >> - qemu_host_page_size) { >> + qemu_host_page_size) { >> qemu_put_be64(f, block->page_size); >> } >> if (migrate_ignore_shared()) { >> qemu_put_be64(f, block->mr->addr); 
>> } >> + >> + if (migrate_fixed_ram()) { >> + long num_pages = block->used_length >> TARGET_PAGE_BITS; >> + long bitmap_size = BITS_TO_LONGS(num_pages) * >> sizeof(unsigned long); >> + >> + >> + /* Needed for external programs (think >> analyze-migration.py) */ >> + qemu_put_be32(f, bitmap_size); >> + >> + /* >> + * Make pages offset aligned to TARGET_PAGE_SIZE to enable >> + * DIO in the future. Also add 8 to account for the page >> offset >> + * itself >> + */ >> + block->bitmap_offset = qemu_get_offset(f) + 8; >> + block->pages_offset = ROUND_UP(block->bitmap_offset + >> + bitmap_size, >> TARGET_PAGE_SIZE); > > I'm not sure that rounding to TARGET_PAGE_SIZE is sufficient. > > If we've built QEMU for a 32-bit i686 target, but are running on a 64-bit > kernel, is TARGET_PAGE_SIZE going to offer sufficient alignment to keep > the kernel happy? > > Also O_DIRECT has alignment constraints beyond simply the page size. The > page size alignment is most important for the buffers being passed > back and forth. The on-disk alignment constraint is actually defined by > the filesystem and storage it is on, and on-disk alignment is what > matters when we decide on pages_offset.
For comparison, in the libvirt iohelper the alignment of DIRECT I/O-friendly transfers is 64K, as per libvirt commit: commit 12291656b135ed788e41dadbd2d15e613ddea9b5 Author: Eric Blake <ebl...@redhat.com> Date: Tue Jul 12 08:35:05 2011 -0600 save: let iohelper work on O_DIRECT fds Required for a coming patch where iohelper will operate on O_DIRECT fds. There, the user-space memory must be aligned to file system boundaries (at least 512, but using page-aligned works better, and some file systems prefer 64k). Made tougher by the fact that VIR_ALLOC won't work on void *, but posix_memalign won't work on char * and isn't available everywhere. This patch makes some simplifying assumptions - namely, output to an O_DIRECT fd will only be attempted on an empty seekable file (hence, no need to worry about preserving existing data on a partial block, and ftruncate will work to undo the effects of having to round up the size of the last block written), and input from an O_DIRECT fd will only be attempted on a complete seekable file with the only possible short read at EOF. * configure.ac (AC_CHECK_FUNCS_ONCE): Check for posix_memalign. * src/util/iohelper.c (runIO): Use aligned memory, and handle quirks of O_DIRECT on last write. > > If we want to allow saved state to be copied across different filesystems/ > hosts, we need to consider the worst case alignment that would exist on > any conceivable host deployment now or in the future. > > Given that RAM sizes are measured 1000's of MBs in any scenario where > we care about save/restore speed enough to use the fixed-ram feature, > we can afford to waste a little space. > > IOW, we could round pages_offset upto the nearest 1 MB boundary, > which ought to be well enough aligned to cope with any constraint > we might imagine ? Or possibly even align to 4 MB, which is a common > size for small-ish huge pages. 
> > >> + qemu_put_be64(f, block->pages_offset); >> + >> + /* Now prepare offset for next ramblock */ >> + qemu_set_offset(f, block->pages_offset + >> block->used_length, SEEK_SET); >> + } >> } >> } >> >> @@ -3249,6 +3286,17 @@ static int ram_save_setup(QEMUFile *f, void *opaque) >> return 0; >> } >> >> +static void ram_save_shadow_bmap(QEMUFile *f) >> +{ >> + RAMBlock *block; >> + >> + RAMBLOCK_FOREACH_MIGRATABLE(block) { >> + long num_pages = block->used_length >> TARGET_PAGE_BITS; >> + long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long); >> + qemu_put_buffer_at(f, (uint8_t *)block->shadow_bmap, bitmap_size, >> block->bitmap_offset); >> + } >> +} >> + >> /** >> * ram_save_iterate: iterative stage for migration >> * >> @@ -3358,9 +3406,15 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) >> return ret; >> } >> >> - qemu_put_be64(f, RAM_SAVE_FLAG_EOS); >> - qemu_fflush(f); >> - ram_transferred_add(8); >> + /* >> + * For fixed ram we don't want to pollute the migration stream with >> + * EOS flags. 
>> + */ >> + if (!migrate_fixed_ram()) { >> + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); >> + qemu_fflush(f); >> + ram_transferred_add(8); >> + } >> >> ret = qemu_file_get_error(f); >> } >> @@ -3405,7 +3459,10 @@ static int ram_save_complete(QEMUFile *f, void >> *opaque) >> pages = ram_find_and_save_block(rs); >> /* no more blocks to sent */ >> if (pages == 0) { >> - break; >> + if (migrate_fixed_ram()) { >> + ram_save_shadow_bmap(f); >> + } >> + break; >> } >> if (pages < 0) { >> ret = pages; >> @@ -3428,8 +3485,10 @@ static int ram_save_complete(QEMUFile *f, void >> *opaque) >> return ret; >> } >> >> - qemu_put_be64(f, RAM_SAVE_FLAG_EOS); >> - qemu_fflush(f); >> + if (!migrate_fixed_ram()) { >> + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); >> + qemu_fflush(f); >> + } >> >> return 0; >> } >> diff --git a/migration/savevm.c b/migration/savevm.c >> index 44a222888306..847a8bdfb6ce 100644 >> --- a/migration/savevm.c >> +++ b/migration/savevm.c >> @@ -240,6 +240,7 @@ static bool should_validate_capability(int capability) >> /* Validate only new capabilities to keep compatibility. */ >> switch (capability) { >> case MIGRATION_CAPABILITY_X_IGNORE_SHARED: >> + case MIGRATION_CAPABILITY_FIXED_RAM: >> return true; >> default: >> return false; >> diff --git a/qapi/migration.json b/qapi/migration.json >> index 88ecf86ac876..6196617171e8 100644 >> --- a/qapi/migration.json >> +++ b/qapi/migration.json >> @@ -485,7 +485,7 @@ >> ## >> { 'enum': 'MigrationCapability', >> 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', >> - 'compress', 'events', 'postcopy-ram', >> + 'compress', 'events', 'postcopy-ram', 'fixed-ram', >> { 'name': 'x-colo', 'features': [ 'unstable' ] }, >> 'release-ram', >> 'block', 'return-path', 'pause-before-switchover', 'multifd', >> -- >> 2.34.1 >> > > With regards, > Daniel Thanks, Claudio