I've got a world with just patches 1..5 applied that's seeing corruption, but I've not yet found where the problem is. So far the world with only patches 1..4 applied hasn't hit that corruption, but maybe I need to test it more.
Have you tested this set with google stressapptest? Let it migrate for a
few cycles with stressapptest running, then ctrl-z the stressapptest
program to let the migration complete, then fg it to collect the results.

Dave

* arei.gong...@huawei.com (arei.gong...@huawei.com) wrote:
> From: ChenLiang <chenlian...@huawei.com>
>
> Avoid hot pages being replaced by others to remarkably decrease cache
> misses
>
> Sample results with the test program quoted from xbzrle.txt, run in the
> VM (migrate bandwidth: 1GE, xbzrle cache size: 8MB):
>
> the test program:
>
> #include <stdlib.h>
> #include <stdio.h>
> int main()
> {
>     char *buf = (char *) calloc(4096, 4096);
>     while (1) {
>         int i;
>         for (i = 0; i < 4096 * 4; i++) {
>             buf[i * 4096 / 4]++;
>         }
>         printf(".");
>     }
> }
>
> before this patch:
> virsh qemu-monitor-command test_vm '{"execute": "query-migrate"}'
> {"return":{"expected-downtime":1020,"xbzrle-cache":{"bytes":1108284,
> "cache-size":8388608,"cache-miss-rate":0.987013,"pages":18297,"overflow":8,
> "cache-miss":1228737},"status":"active","setup-time":10,"total-time":52398,
> "ram":{"total":12466991104,"remaining":1695744,"mbps":935.559472,
> "transferred":5780760580,"dirty-sync-counter":271,"duplicate":2878530,
> "dirty-pages-rate":29130,"skipped":0,"normal-bytes":5748592640,
> "normal":1403465}},"id":"libvirt-706"}
>
> 18k pages sent compressed
> cache-miss-rate is 98.7%: almost every page misses the cache.
>
> after optimizing:
> virsh qemu-monitor-command test_vm '{"execute": "query-migrate"}'
> {"return":{"expected-downtime":2054,"xbzrle-cache":{"bytes":5066763,
> "cache-size":8388608,"cache-miss-rate":0.485924,"pages":194823,"overflow":0,
> "cache-miss":210653},"status":"active","setup-time":11,"total-time":18729,
> "ram":{"total":12466991104,"remaining":3895296,"mbps":937.663549,
> "transferred":1615042219,"dirty-sync-counter":98,"duplicate":2869840,
> "dirty-pages-rate":58781,"skipped":0,"normal-bytes":1588404224,
> "normal":387794}},"id":"libvirt-266"}
>
> 194k pages sent compressed
> The cache-miss-rate decreased to 48.59%.
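For anyone who wants the idea without wading through the diff: below is a
rough, standalone sketch of the aging scheme being described. The names here
(ToyCache, toy_cache_insert, TOY_PAGE_LIFETIME) are invented for illustration
and are not the QEMU page_cache API; the real change is in the page_cache.c
hunk further down. The point is that a lookup hit refreshes an entry's age,
and a conflicting insert is refused (returns -1) until the resident entry is
at least a couple of bitmap syncs old, so hot pages stay in the cache.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_PAGE_SIZE     4096
#define TOY_PAGE_LIFETIME 2   /* a resident entry younger than this blocks inserts */

typedef struct {
    uint64_t addr;   /* guest page address cached in this slot */
    uint64_t age;    /* bitmap-sync generation of the last insert/hit */
    uint8_t *data;   /* NULL while the slot is empty */
} ToySlot;

typedef struct {
    ToySlot *slots;
    size_t   num_slots;   /* power of two, direct-mapped like the real cache */
} ToyCache;

static size_t toy_pos(const ToyCache *c, uint64_t addr)
{
    /* direct-mapped: the page frame number picks the slot */
    return (addr / TOY_PAGE_SIZE) & (c->num_slots - 1);
}

/* A hit refreshes the entry's age, so pages that keep being looked up stay hot. */
static bool toy_cache_is_cached(ToyCache *c, uint64_t addr, uint64_t current_age)
{
    ToySlot *s = &c->slots[toy_pos(c, addr)];

    if (s->data && s->addr == addr) {
        s->age = current_age;
        return true;
    }
    return false;
}

/* Same policy as the patch: if the resident entry is younger than the
 * lifetime, refuse the insert (-1) and keep the cached page instead. */
static int toy_cache_insert(ToyCache *c, uint64_t addr, const uint8_t *pdata,
                            uint64_t current_age)
{
    ToySlot *s = &c->slots[toy_pos(c, addr)];

    if (s->data && s->age + TOY_PAGE_LIFETIME > current_age) {
        return -1;
    }
    if (!s->data) {
        s->data = malloc(TOY_PAGE_SIZE);
        if (!s->data) {
            return -1;
        }
    }
    memcpy(s->data, pdata, TOY_PAGE_SIZE);
    s->addr = addr;
    s->age  = current_age;
    return 0;
}

int main(void)
{
    ToyCache c = { .num_slots = 4 };
    uint8_t page[TOY_PAGE_SIZE] = { 0 };

    c.slots = calloc(c.num_slots, sizeof(*c.slots));
    if (!c.slots) {
        return 1;
    }

    /* addr 0 and addr 4*TOY_PAGE_SIZE collide in slot 0 of a 4-slot cache */
    toy_cache_insert(&c, 0, page, 1);
    printf("conflicting insert at age 2: %d\n",
           toy_cache_insert(&c, 4 * TOY_PAGE_SIZE, page, 2));
    printf("conflicting insert at age 3: %d\n",
           toy_cache_insert(&c, 4 * TOY_PAGE_SIZE, page, 3));
    printf("addr 0 still cached afterwards: %d\n",
           toy_cache_is_cached(&c, 0, 3));
    return 0;
}

Running it prints -1 for the conflicting insert one sync after the hot page
went in, and 0 two syncs later, which is the CACHED_PAGE_LIFETIME = 2
behaviour the patch documents.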
>
> Signed-off-by: ChenLiang <chenlian...@huawei.com>
> Signed-off-by: Gonglei <arei.gong...@huawei.com>
> Reviewed-by: Eric Blake <ebl...@redhat.com>
> ---
>  arch_init.c                    |  8 +++++---
>  docs/xbzrle.txt                |  8 ++++++++
>  include/migration/page_cache.h | 10 +++++++---
>  page_cache.c                   | 23 +++++++++++++++++++----
>  4 files changed, 39 insertions(+), 10 deletions(-)
>
> diff --git a/arch_init.c b/arch_init.c
> index 15ca4c0..84a4bd3 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -343,7 +343,8 @@ static void xbzrle_cache_zero_page(ram_addr_t current_addr)
>
>      /* We don't care if this fails to allocate a new cache page
>       * as long as it updated an old one */
> -    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE);
> +    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
> +                 bitmap_sync_count);
>  }
>
>  #define ENCODING_FLAG_XBZRLE 0x1
> @@ -355,10 +356,11 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
>      int encoded_len = 0, bytes_sent = -1;
>      uint8_t *prev_cached_page;
>
> -    if (!cache_is_cached(XBZRLE.cache, current_addr)) {
> +    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
>          acct_info.xbzrle_cache_miss++;
>          if (!last_stage) {
> -            if (cache_insert(XBZRLE.cache, current_addr, *current_data) == -1) {
> +            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
> +                             bitmap_sync_count) == -1) {
>                  return -1;
>              } else {
>                  /* update *current_data when the page has been
> diff --git a/docs/xbzrle.txt b/docs/xbzrle.txt
> index cc3a26a..52c8511 100644
> --- a/docs/xbzrle.txt
> +++ b/docs/xbzrle.txt
> @@ -71,6 +71,14 @@ encoded buffer:
>  encoded length 24
>  e9 07 0f 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 03 01 67 01 01 69
>
> +Cache update strategy
> +=====================
> +Keeping the hot pages in the cache is effective for decreasing cache
> +misses. XBZRLE uses a counter as the age of each page. The counter will
> +increase after each ram dirty bitmap sync. When a cache conflict is
> +detected, XBZRLE will only evict pages in the cache that are older than
> +a threshold.
> +
>  Usage
>  ======================
>  1. Verify the destination QEMU version is able to decode the new format.
> diff --git a/include/migration/page_cache.h b/include/migration/page_cache.h
> index 2d5ce2d..10ed532 100644
> --- a/include/migration/page_cache.h
> +++ b/include/migration/page_cache.h
> @@ -43,8 +43,10 @@ void cache_fini(PageCache *cache);
>   *
>   * @cache pointer to the PageCache struct
>   * @addr: page addr
> + * @current_age: current bitmap generation
>   */
> -bool cache_is_cached(const PageCache *cache, uint64_t addr);
> +bool cache_is_cached(const PageCache *cache, uint64_t addr,
> +                     uint64_t current_age);
>
>  /**
>   * get_cached_data: Get the data cached for an addr
> @@ -60,13 +62,15 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr);
>   * cache_insert: insert the page into the cache. the page cache
>   * will dup the data on insert. the previous value will be overwritten
>   *
> - * Returns -1 on error
> + * Returns -1 when the page isn't inserted into cache
>   *
>   * @cache pointer to the PageCache struct
>   * @addr: page address
>   * @pdata: pointer to the page
> + * @current_age: current bitmap generation
>   */
> -int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata);
> +int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata,
> +                 uint64_t current_age);
>
>  /**
>   * cache_resize: resize the page cache. In case of size reduction the extra
> diff --git a/page_cache.c b/page_cache.c
> index b033681..c78157b 100644
> --- a/page_cache.c
> +++ b/page_cache.c
> @@ -33,6 +33,9 @@
>      do { } while (0)
>  #endif
>
> +/* the page in cache will not be replaced in two cycles */
> +#define CACHED_PAGE_LIFETIME 2
> +
>  typedef struct CacheItem CacheItem;
>
>  struct CacheItem {
> @@ -121,7 +124,8 @@ static size_t cache_get_cache_pos(const PageCache *cache,
>      return pos;
>  }
>
> -bool cache_is_cached(const PageCache *cache, uint64_t addr)
> +bool cache_is_cached(const PageCache *cache, uint64_t addr,
> +                     uint64_t current_age)
>  {
>      size_t pos;
>
> @@ -130,7 +134,12 @@ bool cache_is_cached(const PageCache *cache, uint64_t addr)
>
>      pos = cache_get_cache_pos(cache, addr);
>
> -    return (cache->page_cache[pos].it_addr == addr);
> +    if (cache->page_cache[pos].it_addr == addr) {
> +        /* update the it_age when the cache hit */
> +        cache->page_cache[pos].it_age = current_age;
> +        return true;
> +    }
> +    return false;
>  }
>
>  static CacheItem *cache_get_by_addr(const PageCache *cache, uint64_t addr)
> @@ -150,7 +159,8 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr)
>      return cache_get_by_addr(cache, addr)->it_data;
>  }
>
> -int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata)
> +int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata,
> +                 uint64_t current_age)
>  {
>
>      CacheItem *it = NULL;
> @@ -161,6 +171,11 @@ int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata)
>      /* actual update of entry */
>      it = cache_get_by_addr(cache, addr);
>
> +    if (it->it_data &&
> +        it->it_age + CACHED_PAGE_LIFETIME > current_age) {
> +        /* the cache page is fresh, don't replace it */
> +        return -1;
> +    }
>      /* allocate page */
>      if (!it->it_data) {
>          it->it_data = g_try_malloc(cache->page_size);
> @@ -173,7 +188,7 @@ int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata)
>
>      memcpy(it->it_data, pdata, cache->page_size);
>
> -    it->it_age = ++cache->max_item_age;
> +    it->it_age = current_age;
>      it->it_addr = addr;
>
>      return 0;
> --
> 1.7.12.4
>
>

--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK