Added Fabiano to the thread, Claudio
On 3/20/23 12:05, Claudio Fontana wrote:
> On 2/10/23 18:11, Daniel P. Berrangé wrote:
>> On Fri, Oct 28, 2022 at 01:39:10PM +0300, Nikolay Borisov wrote:
>>> Implement 'fixed-ram' feature. The core of the feature is to ensure that
>>> each ram page of the migration stream has a specific offset in the
>>> resulting migration stream. The reasons why we'd want such behavior are
>>> twofold:
>>>
>>>  - When doing a 'fixed-ram' migration the resulting file will have a
>>>    bounded size, since pages which are dirtied multiple times will
>>>    always go to a fixed location in the file, rather than constantly
>>>    being added to a sequential stream. This eliminates cases where a vm
>>>    with, say, 1g of ram can result in a migration file that's 10s of
>>>    Gbs, provided that the workload constantly redirties memory.
>>>
>>>  - It paves the way to implement DIO-enabled save/restore of the
>>>    migration stream as the pages are ensured to be written at aligned
>>>    offsets.
>>>
>>> The feature requires changing the format. First, a bitmap is introduced
>>> which tracks which pages have been written (i.e. are dirtied) during
>>> migration and subsequently it's written to the resulting file, again at
>>> a fixed location for every ramblock. Zero pages are ignored as they'd
>>> be zero in the destination as well. With the changed format, data would
>>> look like the following:
>>>
>>> |name len|name|used_len|pc*|bitmap_size|pages_offset|bitmap|pages|
>>>
>>> * pc - refers to the page_size/mr->addr members, so newly added members
>>>   begin from "bitmap_size".
>>>
>>> This layout is initialized during ram_save_setup so instead of having a
>>> sequential stream of pages that follow the ramblock headers, the dirty
>>> pages for a ramblock follow its header. Since all pages have a fixed
>>> location, RAM_SAVE_FLAG_EOS is no longer generated on every migration
>>> iteration but there is effectively a single RAM_SAVE_FLAG_EOS right at
>>> the end.
>>>
>>> Signed-off-by: Nikolay Borisov <nbori...@suse.com>
>>> ---
>>>  include/exec/ramblock.h |  7 +++
>>>  migration/migration.c   | 51 +++++++++++++++++++++-
>>>  migration/migration.h   |  1 +
>>>  migration/ram.c         | 97 +++++++++++++++++++++++++++++++++--------
>>>  migration/savevm.c      |  1 +
>>>  qapi/migration.json     |  2 +-
>>>  6 files changed, 138 insertions(+), 21 deletions(-)
>>
>> This patch probably ought to extend the docs/devel/migration.rst
>> file. Specifically the text following the 'Stream structure'
>> heading.
>>
>>>
>>> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
>>> index 6cbedf9e0c9a..30216c1a41d3 100644
>>> --- a/include/exec/ramblock.h
>>> +++ b/include/exec/ramblock.h
>>> @@ -43,6 +43,13 @@ struct RAMBlock {
>>>      size_t page_size;
>>>      /* dirty bitmap used during migration */
>>>      unsigned long *bmap;
>>> +    /* shadow dirty bitmap used when migrating to a file */
>>> +    unsigned long *shadow_bmap;
>>> +    /* offset in the file pages belonging to this ramblock are saved, used
>>> +     * only during migration to a file
>>> +     */
>>> +    off_t bitmap_offset;
>>> +    uint64_t pages_offset;
>>>      /* bitmap of already received pages in postcopy */
>>>      unsigned long *receivedmap;
>>>
>>> diff --git a/migration/migration.c b/migration/migration.c
>>> index 11ceea340702..c7383845a5b4 100644
>>> --- a/migration/migration.c
>>> +++ b/migration/migration.c
>>> @@ -165,7 +165,8 @@ INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot,
>>>      MIGRATION_CAPABILITY_XBZRLE,
>>>      MIGRATION_CAPABILITY_X_COLO,
>>>      MIGRATION_CAPABILITY_VALIDATE_UUID,
>>> -    MIGRATION_CAPABILITY_ZERO_COPY_SEND);
>>> +    MIGRATION_CAPABILITY_ZERO_COPY_SEND,
>>> +    MIGRATION_CAPABILITY_FIXED_RAM);
>>>
>>>  /* When we add fault tolerance, we could have several
>>>     migrations at once.  For now we don't need to add
>>> @@ -1326,6 +1327,27 @@ static bool migrate_caps_check(bool *cap_list,
>>>      }
>>>  #endif
>>>
>>> +    if (cap_list[MIGRATION_CAPABILITY_FIXED_RAM]) {
>>> +        if (cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
>>> +            error_setg(errp, "Directly mapped memory incompatible with multifd");
>>> +            return false;
>>> +        }
>>> +
>>> +        if (cap_list[MIGRATION_CAPABILITY_XBZRLE]) {
>>> +            error_setg(errp, "Directly mapped memory incompatible with xbzrle");
>>> +            return false;
>>> +        }
>>> +
>>> +        if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) {
>>> +            error_setg(errp, "Directly mapped memory incompatible with compression");
>>> +            return false;
>>> +        }
>>> +
>>> +        if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
>>> +            error_setg(errp, "Directly mapped memory incompatible with postcopy ram");
>>> +            return false;
>>> +        }
>>> +    }
>>>
>>>      /* incoming side only */
>>>      if (runstate_check(RUN_STATE_INMIGRATE) &&
>>> @@ -2630,6 +2652,11 @@ MultiFDCompression migrate_multifd_compression(void)
>>>      return s->parameters.multifd_compression;
>>>  }
>>>
>>> +int migrate_fixed_ram(void)
>>> +{
>>> +    return migrate_get_current()->enabled_capabilities[MIGRATION_CAPABILITY_FIXED_RAM];
>>> +}
>>> +
>>>  int migrate_multifd_zlib_level(void)
>>>  {
>>>      MigrationState *s;
>>> @@ -4190,6 +4217,21 @@ static void *bg_migration_thread(void *opaque)
>>>      return NULL;
>>>  }
>>>
>>> +static int
>>> +migrate_check_fixed_ram(MigrationState *s, Error **errp)
>>> +{
>>> +    if (!s->enabled_capabilities[MIGRATION_CAPABILITY_FIXED_RAM])
>>> +        return 0;
>>> +
>>> +    if (!qemu_file_is_seekable(s->to_dst_file)) {
>>> +        error_setg(errp, "Directly mapped memory requires a seekable transport");
>>> +        return -1;
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +
>>>  void migrate_fd_connect(MigrationState *s, Error *error_in)
>>>  {
>>>      Error *local_err = NULL;
>>> @@ -4265,6 +4307,12 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
>>>          return;
>>>      }
>>>
>>> +    if (migrate_check_fixed_ram(s, &local_err) < 0) {
>>> +        migrate_fd_cleanup(s);
>>> +        migrate_fd_error(s, local_err);
>>> +        return;
>>> +    }
>>> +
>>>      if (resume) {
>>>          /* Wakeup the main migration thread to do the recovery */
>>>          migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
>>> @@ -4398,6 +4446,7 @@ static Property migration_properties[] = {
>>>      DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz),
>>>
>>>      /* Migration capabilities */
>>> +    DEFINE_PROP_MIG_CAP("x-fixed-ram", MIGRATION_CAPABILITY_FIXED_RAM),
>>>      DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
>>>      DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL),
>>>      DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE),
>>> diff --git a/migration/migration.h b/migration/migration.h
>>> index 96f27aba2210..9aab1b16f407 100644
>>> --- a/migration/migration.h
>>> +++ b/migration/migration.h
>>> @@ -410,6 +410,7 @@ bool migrate_zero_blocks(void);
>>>  bool migrate_dirty_bitmaps(void);
>>>  bool migrate_ignore_shared(void);
>>>  bool migrate_validate_uuid(void);
>>> +int migrate_fixed_ram(void);
>>>
>>>  bool migrate_auto_converge(void);
>>>  bool migrate_use_multifd(void);
>>> diff --git a/migration/ram.c b/migration/ram.c
>>> index dc1de9ddbc68..4f5ddaff356b 100644
>>> --- a/migration/ram.c
>>> +++ b/migration/ram.c
>>> @@ -1261,9 +1261,14 @@ static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
>>>      int len = 0;
>>>
>>>      if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
>>> -        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
>>> -        qemu_put_byte(file, 0);
>>> -        len += 1;
>>> +        if (migrate_fixed_ram()) {
>>> +            /* for zero pages we don't need to do anything */
>>> +            len = 1;
>>> +        } else {
>>> +            len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
>>> +            qemu_put_byte(file, 0);
>>> +            len += 1;
>>> +        }
>>>          ram_release_page(block->idstr, offset);
>>>      }
>>>      return len;
>>> @@ -1342,15 +1347,22 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
>>>  static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
>>>                              uint8_t *buf, bool async)
>>>  {
>>> -    ram_transferred_add(save_page_header(rs, rs->f, block,
>>> -                                         offset | RAM_SAVE_FLAG_PAGE));
>>> -    if (async) {
>>> -        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
>>> -                              migrate_release_ram() &&
>>> -                              migration_in_postcopy());
>>> -    } else {
>>> -        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
>>> -    }
>>> +
>>> +    if (migrate_fixed_ram()) {
>>> +        qemu_put_buffer_at(rs->f, buf, TARGET_PAGE_SIZE,
>>> +                           block->pages_offset + offset);
>>> +        set_bit(offset >> TARGET_PAGE_BITS, block->shadow_bmap);
>>> +    } else {
>>> +        ram_transferred_add(save_page_header(rs, rs->f, block,
>>> +                                             offset | RAM_SAVE_FLAG_PAGE));
>>> +        if (async) {
>>> +            qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
>>> +                                  migrate_release_ram() &&
>>> +                                  migration_in_postcopy());
>>> +        } else {
>>> +            qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
>>> +        }
>>> +    }
>>>      ram_transferred_add(TARGET_PAGE_SIZE);
>>>      ram_counters.normal++;
>>>      return 1;
>>> @@ -2683,6 +2695,8 @@ static void ram_save_cleanup(void *opaque)
>>>          block->clear_bmap = NULL;
>>>          g_free(block->bmap);
>>>          block->bmap = NULL;
>>> +        g_free(block->shadow_bmap);
>>> +        block->shadow_bmap = NULL;
>>>      }
>>>
>>>      xbzrle_cleanup();
>>> @@ -3044,6 +3058,7 @@ static void ram_list_init_bitmaps(void)
>>>           */
>>>          block->bmap = bitmap_new(pages);
>>>          bitmap_set(block->bmap, 0, pages);
>>> +        block->shadow_bmap = bitmap_new(block->used_length >> TARGET_PAGE_BITS);
>>>          block->clear_bmap_shift = shift;
>>>          block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
>>>      }
>>> @@ -3226,12 +3241,34 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>>>              qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
>>>              qemu_put_be64(f, block->used_length);
>>>              if (migrate_postcopy_ram() && block->page_size !=
>>> -                qemu_host_page_size) {
>>> +                 qemu_host_page_size) {
>>>                  qemu_put_be64(f, block->page_size);
>>>              }
>>>              if (migrate_ignore_shared()) {
>>>                  qemu_put_be64(f, block->mr->addr);
>>>              }
>>> +
>>> +            if (migrate_fixed_ram()) {
>>> +                long num_pages = block->used_length >> TARGET_PAGE_BITS;
>>> +                long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
>>> +
>>> +
>>> +                /* Needed for external programs (think analyze-migration.py) */
>>> +                qemu_put_be32(f, bitmap_size);
>>> +
>>> +                /*
>>> +                 * Make pages offset aligned to TARGET_PAGE_SIZE to enable
>>> +                 * DIO in the future. Also add 8 to account for the page offset
>>> +                 * itself
>>> +                 */
>>> +                block->bitmap_offset = qemu_get_offset(f) + 8;
>>> +                block->pages_offset = ROUND_UP(block->bitmap_offset +
>>> +                                               bitmap_size, TARGET_PAGE_SIZE);
>>
>> I'm not sure that rounding to TARGET_PAGE_SIZE is sufficient.
>>
>> If we've built QEMU for a 32-bit i686 target, but are running on a 64-bit
>> kernel, is TARGET_PAGE_SIZE going to offer sufficient alignment to keep
>> the kernel happy?
>>
>> Also O_DIRECT has alignment constraints beyond simply the page size. The
>> page size alignment is most important for the buffers being passed
>> back and forth. The on-disk alignment constraint is actually defined by
>> the filesystem and storage it is on, and on-disk alignment is what
>> matters when we decide on pages_offset.
>
> For comparison, in the libvirt iohelper the alignment of DIRECT I/O-friendly
> transfers is 64K, as per libvirt commit:
>
> commit 12291656b135ed788e41dadbd2d15e613ddea9b5
> Author: Eric Blake <ebl...@redhat.com>
> Date:   Tue Jul 12 08:35:05 2011 -0600
>
>     save: let iohelper work on O_DIRECT fds
>
>     Required for a coming patch where iohelper will operate on O_DIRECT
>     fds. There, the user-space memory must be aligned to file system
>     boundaries (at least 512, but using page-aligned works better, and
>     some file systems prefer 64k). Made tougher by the fact that
>     VIR_ALLOC won't work on void *, but posix_memalign won't work on
>     char * and isn't available everywhere.
>
>     This patch makes some simplifying assumptions - namely, output
>     to an O_DIRECT fd will only be attempted on an empty seekable
>     file (hence, no need to worry about preserving existing data
>     on a partial block, and ftruncate will work to undo the effects
>     of having to round up the size of the last block written), and
>     input from an O_DIRECT fd will only be attempted on a complete
>     seekable file with the only possible short read at EOF.
>
>     * configure.ac (AC_CHECK_FUNCS_ONCE): Check for posix_memalign.
>     * src/util/iohelper.c (runIO): Use aligned memory, and handle
>       quirks of O_DIRECT on last write.
>
>
>>
>> If we want to allow saved state to be copied across different filesystems/
>> hosts, we need to consider the worst case alignment that would exist on
>> any conceivable host deployment now or in the future.
>>
>> Given that RAM sizes are measured in 1000's of MBs in any scenario where
>> we care about save/restore speed enough to use the fixed-ram feature,
>> we can afford to waste a little space.
>>
>> IOW, we could round pages_offset up to the nearest 1 MB boundary,
>> which ought to be well enough aligned to cope with any constraint
>> we might imagine? Or possibly even align to 4 MB, which is a common
>> size for small-ish huge pages.
>>
>>
>>> +                qemu_put_be64(f, block->pages_offset);
>>> +
>>> +                /* Now prepare offset for next ramblock */
>>> +                qemu_set_offset(f, block->pages_offset + block->used_length, SEEK_SET);
>>> +            }
>>>          }
>>>      }
>>>
>>> @@ -3249,6 +3286,17 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>>>      return 0;
>>>  }
>>>
>>> +static void ram_save_shadow_bmap(QEMUFile *f)
>>> +{
>>> +    RAMBlock *block;
>>> +
>>> +    RAMBLOCK_FOREACH_MIGRATABLE(block) {
>>> +        long num_pages = block->used_length >> TARGET_PAGE_BITS;
>>> +        long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
>>> +        qemu_put_buffer_at(f, (uint8_t *)block->shadow_bmap, bitmap_size, block->bitmap_offset);
>>> +    }
>>> +}
>>> +
>>>  /**
>>>   * ram_save_iterate: iterative stage for migration
>>>   *
>>> @@ -3358,9 +3406,15 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>>>          return ret;
>>>      }
>>>
>>> -    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
>>> -    qemu_fflush(f);
>>> -    ram_transferred_add(8);
>>> +    /*
>>> +     * For fixed ram we don't want to pollute the migration stream with
>>> +     * EOS flags.
>>> +     */
>>> +    if (!migrate_fixed_ram()) {
>>> +        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
>>> +        qemu_fflush(f);
>>> +        ram_transferred_add(8);
>>> +    }
>>>
>>>      ret = qemu_file_get_error(f);
>>>  }
>>> @@ -3405,7 +3459,10 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
>>>          pages = ram_find_and_save_block(rs);
>>>          /* no more blocks to sent */
>>>          if (pages == 0) {
>>> -            break;
>>> +            if (migrate_fixed_ram()) {
>>> +                ram_save_shadow_bmap(f);
>>> +            }
>>> +            break;
>>>          }
>>>          if (pages < 0) {
>>>              ret = pages;
>>> @@ -3428,8 +3485,10 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
>>>          return ret;
>>>      }
>>>
>>> -    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
>>> -    qemu_fflush(f);
>>> +    if (!migrate_fixed_ram()) {
>>> +        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
>>> +        qemu_fflush(f);
>>> +    }
>>>
>>>      return 0;
>>>  }
>>> diff --git a/migration/savevm.c b/migration/savevm.c
>>> index 44a222888306..847a8bdfb6ce 100644
>>> --- a/migration/savevm.c
>>> +++ b/migration/savevm.c
>>> @@ -240,6 +240,7 @@ static bool should_validate_capability(int capability)
>>>      /* Validate only new capabilities to keep compatibility. */
>>>      switch (capability) {
>>>      case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
>>> +    case MIGRATION_CAPABILITY_FIXED_RAM:
>>>          return true;
>>>      default:
>>>          return false;
>>> diff --git a/qapi/migration.json b/qapi/migration.json
>>> index 88ecf86ac876..6196617171e8 100644
>>> --- a/qapi/migration.json
>>> +++ b/qapi/migration.json
>>> @@ -485,7 +485,7 @@
>>>  ##
>>>  { 'enum': 'MigrationCapability',
>>>    'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
>>> -           'compress', 'events', 'postcopy-ram',
>>> +           'compress', 'events', 'postcopy-ram', 'fixed-ram',
>>>             { 'name': 'x-colo', 'features': [ 'unstable' ] },
>>>             'release-ram',
>>>             'block', 'return-path', 'pause-before-switchover', 'multifd',
>>> --
>>> 2.34.1
>>>
>>
>> With regards,
>> Daniel
>
> Thanks,
>
> Claudio