"Chuang Xu" <[email protected]> writes:

> From: xuchuangxclwt <[email protected]>
>
> In our long-term experience in Bytedance, we've found that under
> the same load, live migration of larger VMs with more devices is
> often more difficult to converge (requiring a larger downtime limit).
>
> Through some testing and calculations, we conclude that bitmap sync time
> affects the calculation of live migration bandwidth.
>
> When the addresses processed are not aligned, a large number of
> clear_dirty ioctl occur (e.g. a 4MB misaligned memory can generate
> 2048 clear_dirty ioctls from two different memory_listener),
> which increases the time required for bitmap_sync and makes it
> more difficult for dirty pages to converge.
>
> For a 64C256G vm with 8 vhost-user-net(32 queue per nic) and
> 16 vhost-user-blk(4 queue per blk), the sync time is as high as *73ms*
> (tested with 10GBps dirty rate, the sync time increases as the dirty
> page rate increases), Here are each part of the sync time:
>
> - sync from kvm to ram_list: 2.5ms
> - vhost_log_sync:3ms
> - sync aligned memory from ram_list to RAMBlock: 5ms
> - sync misaligned memory from ram_list to RAMBlock: 61ms
>
> Attempt to merge those fragmented clear_dirty ioctls, then syncing
> misaligned memory from ram_list to RAMBlock takes only about 1ms,
> and the total sync time is only *12ms*.
>
> Signed-off-by: Chuang Xu <[email protected]>
> ---
>  accel/tcg/cputlb.c       |  5 ++--
>  include/system/physmem.h |  7 +++---
>  migration/ram.c          | 17 ++++----------
>  system/memory.c          |  2 +-
>  system/physmem.c         | 49 ++++++++++++++++++++++++++++------------
>  5 files changed, 47 insertions(+), 33 deletions(-)
>
> diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
> index fd1606c856..c8827c8b0d 100644
> --- a/accel/tcg/cputlb.c
> +++ b/accel/tcg/cputlb.c
> @@ -857,8 +857,9 @@ void 
> tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
>  void tlb_protect_code(ram_addr_t ram_addr)
>  {
>      physical_memory_test_and_clear_dirty(ram_addr & TARGET_PAGE_MASK,
> -                                             TARGET_PAGE_SIZE,
> -                                             DIRTY_MEMORY_CODE);
> +                                         TARGET_PAGE_SIZE,
> +                                         DIRTY_MEMORY_CODE,
> +                                         NULL);
>  }
>  
>  /* update the TLB so that writes in physical page 'phys_addr' are no longer
> diff --git a/include/system/physmem.h b/include/system/physmem.h
> index 879f6eae38..8eeace9d1f 100644
> --- a/include/system/physmem.h
> +++ b/include/system/physmem.h
> @@ -39,9 +39,10 @@ uint64_t physical_memory_set_dirty_lebitmap(unsigned long 
> *bitmap,
>  
>  void physical_memory_dirty_bits_cleared(ram_addr_t start, ram_addr_t length);
>  
> -bool physical_memory_test_and_clear_dirty(ram_addr_t start,
> -                                          ram_addr_t length,
> -                                          unsigned client);
> +uint64_t physical_memory_test_and_clear_dirty(ram_addr_t start,
> +                                              ram_addr_t length,
> +                                              unsigned client,
> +                                              unsigned long *dest);
>  
>  DirtyBitmapSnapshot *
>  physical_memory_snapshot_and_clear_dirty(MemoryRegion *mr, hwaddr offset,
> diff --git a/migration/ram.c b/migration/ram.c
> index 29f016cb25..a03c9874a2 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -942,7 +942,6 @@ static uint64_t 
> physical_memory_sync_dirty_bitmap(RAMBlock *rb,
>                                                    ram_addr_t start,
>                                                    ram_addr_t length)
>  {
> -    ram_addr_t addr;
>      unsigned long word = BIT_WORD((start + rb->offset) >> TARGET_PAGE_BITS);
>      uint64_t num_dirty = 0;
>      unsigned long *dest = rb->bmap;
> @@ -996,17 +995,11 @@ static uint64_t 
> physical_memory_sync_dirty_bitmap(RAMBlock *rb,
>      } else {
>          ram_addr_t offset = rb->offset;
>  
> -        for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) {
> -            if (physical_memory_test_and_clear_dirty(
> -                        start + addr + offset,
> -                        TARGET_PAGE_SIZE,
> -                        DIRTY_MEMORY_MIGRATION)) {
> -                long k = (start + addr) >> TARGET_PAGE_BITS;
> -                if (!test_and_set_bit(k, dest)) {
> -                    num_dirty++;
> -                }
> -            }
> -        }
> +        num_dirty = physical_memory_test_and_clear_dirty(
> +                        start + offset,
> +                        length,
> +                        DIRTY_MEMORY_MIGRATION,
> +                        dest);
>      }
>  
>      return num_dirty;
> diff --git a/system/memory.c b/system/memory.c
> index 8b84661ae3..666364392d 100644
> --- a/system/memory.c
> +++ b/system/memory.c
> @@ -2424,7 +2424,7 @@ void memory_region_reset_dirty(MemoryRegion *mr, hwaddr 
> addr,
>  {
>      assert(mr->ram_block);
>      physical_memory_test_and_clear_dirty(
> -        memory_region_get_ram_addr(mr) + addr, size, client);
> +        memory_region_get_ram_addr(mr) + addr, size, client, NULL);
>  }
>  
>  int memory_region_get_fd(MemoryRegion *mr)
> diff --git a/system/physmem.c b/system/physmem.c
> index c9869e4049..f8b660dafe 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -1089,19 +1089,31 @@ void physical_memory_set_dirty_range(ram_addr_t 
> start, ram_addr_t length,
>      }
>  }
>  
> -/* Note: start and end must be within the same ram block.  */
> -bool physical_memory_test_and_clear_dirty(ram_addr_t start,
> +/*
> + * Note: start and end must be within the same ram block.
> + *
> + * @dest usage:

I'm not sure if it's just me, but I find this "dest" term quite
confusing. "bmap" might be more straight-forward.

> + * - When @dest is provided, set bits for newly discovered dirty pages
> + *   only if the bit wasn't already set in dest, and count those pages
> + *   in num_dirty.

Am I misreading the code? I don't see this "set ... only if the bit
wasn't already set" part. What I see is: "set bits, but only count those
pages if the bit wasn't already set".

> + * - When @dest is NULL, count all dirty pages in the range
> + *
> + * @return:
> + * - Number of dirty guest pages found within [start, start + length).
> + */
> +uint64_t physical_memory_test_and_clear_dirty(ram_addr_t start,
>                                                ram_addr_t length,
> -                                              unsigned client)
> +                                              unsigned client,
> +                                              unsigned long *dest)
>  {
>      DirtyMemoryBlocks *blocks;
>      unsigned long end, page, start_page;
> -    bool dirty = false;
> +    uint64_t num_dirty = 0;
>      RAMBlock *ramblock;
>      uint64_t mr_offset, mr_size;
>  
>      if (length == 0) {
> -        return false;
> +        return 0;
>      }
>  
>      end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
> @@ -1118,12 +1130,19 @@ bool physical_memory_test_and_clear_dirty(ram_addr_t 
> start,
>          while (page < end) {
>              unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
>              unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
> -            unsigned long num = MIN(end - page,
> -                                    DIRTY_MEMORY_BLOCK_SIZE - offset);
>  
> -            dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx],
> -                                                  offset, num);
> -            page += num;
> +            if (bitmap_test_and_clear_atomic(blocks->blocks[idx], offset, 
> 1)) {
> +                if (dest) {
> +                    unsigned long k = page - (ramblock->offset >> 
> TARGET_PAGE_BITS);
> +                    if (!test_and_set_bit(k, dest)) {
> +                        num_dirty++;
> +                    }
> +                } else {
> +                    num_dirty++;
> +                }
> +            }
> +
> +            page++;
>          }
>  
>          mr_offset = (ram_addr_t)(start_page << TARGET_PAGE_BITS) - 
> ramblock->offset;
> @@ -1131,18 +1150,18 @@ bool physical_memory_test_and_clear_dirty(ram_addr_t 
> start,
>          memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
>      }
>  
> -    if (dirty) {
> +    if (num_dirty) {
>          physical_memory_dirty_bits_cleared(start, length);
>      }
>  
> -    return dirty;
> +    return num_dirty;
>  }
>  
>  static void physical_memory_clear_dirty_range(ram_addr_t addr, ram_addr_t 
> length)
>  {
> -    physical_memory_test_and_clear_dirty(addr, length, 
> DIRTY_MEMORY_MIGRATION);
> -    physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_VGA);
> -    physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_CODE);
> +    physical_memory_test_and_clear_dirty(addr, length, 
> DIRTY_MEMORY_MIGRATION, NULL);
> +    physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_VGA, 
> NULL);
> +    physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_CODE, 
> NULL);
>  }
>  
>  DirtyBitmapSnapshot *physical_memory_snapshot_and_clear_dirty

Reply via email to