From: Chuang Xu <[email protected]> In our long-term experience at Bytedance, we've found that under the same load, live migration of larger VMs with more devices is often more difficult to converge (requiring a larger downtime limit).
Through some testing and calculations, we conclude that bitmap sync time affects the calculation of live migration bandwidth. When the addresses processed are not aligned, a large number of clear_dirty ioctls occur (e.g. a 4MB misaligned memory region can generate 2048 clear_dirty ioctls from two different memory_listeners), which increases the time required for bitmap_sync and makes it more difficult for dirty pages to converge. For a 64C256G VM with 8 vhost-user-net (32 queues per nic) and 16 vhost-user-blk (4 queues per blk), the sync time is as high as *73ms* (tested with a 10GBps dirty rate; the sync time increases as the dirty page rate increases). Here is each part of the sync time: - sync from kvm to ram_list: 2.5ms - vhost_log_sync: 3ms - sync aligned memory from ram_list to RAMBlock: 5ms - sync misaligned memory from ram_list to RAMBlock: 61ms After merging those fragmented clear_dirty ioctls, syncing misaligned memory from ram_list to RAMBlock takes only about 1ms, and the total sync time is only *12ms*. 
Signed-off-by: Chuang Xu <[email protected]> Reviewed-by: Fabiano Rosas <[email protected]> Link: https://lore.kernel.org/r/[email protected] [peterx: drop var "offset" in physical_memory_sync_dirty_bitmap] Signed-off-by: Peter Xu <[email protected]> --- include/system/physmem.h | 7 +++--- accel/tcg/cputlb.c | 5 +++-- migration/ram.c | 19 +++++----------- system/memory.c | 2 +- system/physmem.c | 48 +++++++++++++++++++++++++++------------- 5 files changed, 46 insertions(+), 35 deletions(-) diff --git a/include/system/physmem.h b/include/system/physmem.h index 879f6eae38..a59724ef10 100644 --- a/include/system/physmem.h +++ b/include/system/physmem.h @@ -39,9 +39,10 @@ uint64_t physical_memory_set_dirty_lebitmap(unsigned long *bitmap, void physical_memory_dirty_bits_cleared(ram_addr_t start, ram_addr_t length); -bool physical_memory_test_and_clear_dirty(ram_addr_t start, - ram_addr_t length, - unsigned client); +uint64_t physical_memory_test_and_clear_dirty(ram_addr_t start, + ram_addr_t length, + unsigned client, + unsigned long *bmap); DirtyBitmapSnapshot * physical_memory_snapshot_and_clear_dirty(MemoryRegion *mr, hwaddr offset, diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index fd1606c856..c8827c8b0d 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -857,8 +857,9 @@ void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu, void tlb_protect_code(ram_addr_t ram_addr) { physical_memory_test_and_clear_dirty(ram_addr & TARGET_PAGE_MASK, - TARGET_PAGE_SIZE, - DIRTY_MEMORY_CODE); + TARGET_PAGE_SIZE, + DIRTY_MEMORY_CODE, + NULL); } /* update the TLB so that writes in physical page 'phys_addr' are no longer diff --git a/migration/ram.c b/migration/ram.c index df7e154877..c403fd73a6 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -941,7 +941,6 @@ static uint64_t physical_memory_sync_dirty_bitmap(RAMBlock *rb, ram_addr_t start, ram_addr_t length) { - ram_addr_t addr; unsigned long word = BIT_WORD((start + rb->offset) >> 
TARGET_PAGE_BITS); uint64_t num_dirty = 0; unsigned long *dest = rb->bmap; @@ -993,19 +992,11 @@ static uint64_t physical_memory_sync_dirty_bitmap(RAMBlock *rb, memory_region_clear_dirty_bitmap(rb->mr, start, length); } } else { - ram_addr_t offset = rb->offset; - - for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) { - if (physical_memory_test_and_clear_dirty( - start + addr + offset, - TARGET_PAGE_SIZE, - DIRTY_MEMORY_MIGRATION)) { - long k = (start + addr) >> TARGET_PAGE_BITS; - if (!test_and_set_bit(k, dest)) { - num_dirty++; - } - } - } + num_dirty = physical_memory_test_and_clear_dirty( + start + rb->offset, + length, + DIRTY_MEMORY_MIGRATION, + dest); } return num_dirty; diff --git a/system/memory.c b/system/memory.c index 8b84661ae3..666364392d 100644 --- a/system/memory.c +++ b/system/memory.c @@ -2424,7 +2424,7 @@ void memory_region_reset_dirty(MemoryRegion *mr, hwaddr addr, { assert(mr->ram_block); physical_memory_test_and_clear_dirty( - memory_region_get_ram_addr(mr) + addr, size, client); + memory_region_get_ram_addr(mr) + addr, size, client, NULL); } int memory_region_get_fd(MemoryRegion *mr) diff --git a/system/physmem.c b/system/physmem.c index c9869e4049..26bf30af17 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -1089,19 +1089,30 @@ void physical_memory_set_dirty_range(ram_addr_t start, ram_addr_t length, } } -/* Note: start and end must be within the same ram block. */ -bool physical_memory_test_and_clear_dirty(ram_addr_t start, +/* + * Note: start and end must be within the same ram block. + * + * @bmap usage: + * - When @bmap is provided, set bits for dirty pages, but + * only count those pages if the bit wasn't already set in @bmap. + * - When @bmap is NULL, count all dirty pages in the range. + * + * @return: + * - Number of dirty guest pages found within [start, start + length). 
+ */ +uint64_t physical_memory_test_and_clear_dirty(ram_addr_t start, ram_addr_t length, - unsigned client) + unsigned client, + unsigned long *bmap) { DirtyMemoryBlocks *blocks; unsigned long end, page, start_page; - bool dirty = false; + uint64_t num_dirty = 0; RAMBlock *ramblock; uint64_t mr_offset, mr_size; if (length == 0) { - return false; + return 0; } end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS; @@ -1118,12 +1129,19 @@ bool physical_memory_test_and_clear_dirty(ram_addr_t start, while (page < end) { unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE; unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE; - unsigned long num = MIN(end - page, - DIRTY_MEMORY_BLOCK_SIZE - offset); - dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx], - offset, num); - page += num; + if (bitmap_test_and_clear_atomic(blocks->blocks[idx], offset, 1)) { + if (bmap) { + unsigned long k = page - (ramblock->offset >> TARGET_PAGE_BITS); + if (!test_and_set_bit(k, bmap)) { + num_dirty++; + } + } else { + num_dirty++; + } + } + + page++; } mr_offset = (ram_addr_t)(start_page << TARGET_PAGE_BITS) - ramblock->offset; @@ -1131,18 +1149,18 @@ bool physical_memory_test_and_clear_dirty(ram_addr_t start, memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size); } - if (dirty) { + if (num_dirty) { physical_memory_dirty_bits_cleared(start, length); } - return dirty; + return num_dirty; } static void physical_memory_clear_dirty_range(ram_addr_t addr, ram_addr_t length) { - physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_MIGRATION); - physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_VGA); - physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_CODE); + physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_MIGRATION, NULL); + physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_VGA, NULL); + physical_memory_test_and_clear_dirty(addr, length, DIRTY_MEMORY_CODE, NULL); } DirtyBitmapSnapshot 
*physical_memory_snapshot_and_clear_dirty -- 2.50.1
