[PATCH v7 0/2] Update AVX512 support for xbzrle and CI failure
This patch series updates the AVX512 support for the xbzrle_encode_buffer function. In this version we mainly modified the code in xbzrle-bench.c to address a CI failure. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin ling xu (2): Update AVX512 support for xbzrle_encode_buffer Unit test code and benchmark code meson.build| 16 ++ meson_options.txt | 2 + migration/ram.c| 34 ++- migration/xbzrle.c | 124 ++ migration/xbzrle.h | 4 + tests/bench/meson.build| 4 + tests/bench/xbzrle-bench.c | 469 + tests/unit/test-xbzrle.c | 39 ++- 8 files changed, 684 insertions(+), 8 deletions(-) create mode 100644 tests/bench/xbzrle-bench.c -- 2.25.1
[PATCH v7 2/2] Update bench-code for addressing CI problem
Unit test code is in test-xbzrle.c, and benchmark code is in xbzrle-bench.c for performance benchmarking. we have modified xbzrle-bench.c to address CI problem. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- tests/bench/meson.build| 4 + tests/bench/xbzrle-bench.c | 469 + tests/unit/test-xbzrle.c | 39 ++- 3 files changed, 507 insertions(+), 5 deletions(-) create mode 100644 tests/bench/xbzrle-bench.c diff --git a/tests/bench/meson.build b/tests/bench/meson.build index 279a8fcc33..daefead58d 100644 --- a/tests/bench/meson.build +++ b/tests/bench/meson.build @@ -3,6 +3,10 @@ qht_bench = executable('qht-bench', sources: 'qht-bench.c', dependencies: [qemuutil]) +xbzrle_bench = executable('xbzrle-bench', + sources: 'xbzrle-bench.c', + dependencies: [qemuutil,migration]) + executable('atomic_add-bench', sources: files('atomic_add-bench.c'), dependencies: [qemuutil], diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c new file mode 100644 index 00..8848a3a32d --- /dev/null +++ b/tests/bench/xbzrle-bench.c @@ -0,0 +1,469 @@ +/* + * Xor Based Zero Run Length Encoding unit tests. + * + * Copyright 2013 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Orit Wasserman + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "../migration/xbzrle.h" + +#if defined(CONFIG_AVX512BW_OPT) +#define XBZRLE_PAGE_SIZE 4096 +static bool is_cpu_support_avx512bw; +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +is_cpu_support_avx512bw = false; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. 
*/ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +is_cpu_support_avx512bw = true; +} +} +} +return ; +} + +struct ResTime { +float t_raw; +float t_512; +}; + + +/* Function prototypes +int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen, +uint8_t *dst, int dlen); +*/ +static void encode_decode_zero(struct ResTime *res) +{ +uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); +int i = 0; +int dlen = 0, dlen512 = 0; +int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); + +for (i = diff_len; i > 0; i--) { +buffer[1000 + i] = i; +buffer512[1000 + i] = i; +} + +buffer[1000 + diff_len + 3] = 103; +buffer[1000 + diff_len + 5] = 105; + +buffer512[1000 + diff_len + 3] = 103; +buffer512[1000 + diff_len + 5] = 105; + +/* encode zero page */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); +dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); +g_assert(dlen == 0); + +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); +float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +res->t_raw = time_val; +res->t_512 = time_val512; + +g_free(buffer); +g_free(compressed); +g_free(buffer512); +g_free(compressed512); + +} + +static void test_encode_decode_zero_avx512(void) +{ +int i; +float time_raw = 0.0, time_512 = 0.0; +struct 
ResTime res; +for (i = 0; i < 1; i++) { +encode_decode_zero(); +time_raw += res.t_raw; +time_512 += res.t_512; +} +printf("Zero test:\n"); +printf("Raw xbzrle_encode time is %f ms\n", time_raw); +printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static voi
[PATCH v7 1/2] AVX512 support for xbzrle_encode_buffer
This commit is the same with [PATCH v6 1/2], and provides avx512 support for xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Runtime check of avx512 support and benchmark for this feature are added. Compared with C version of xbzrle_encode_buffer function, avx512 version can achieve 50%-70% performance improvement on benchmarking. In addition, if dirty data is randomly located in 4K page, the avx512 version can achieve almost 140% performance gain. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- meson.build| 16 ++ meson_options.txt | 2 + migration/ram.c| 34 +++-- migration/xbzrle.c | 124 + migration/xbzrle.h | 4 ++ 5 files changed, 177 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index cf3e517e56..d0d28f5c9e 100644 --- a/meson.build +++ b/meson.build @@ -2344,6 +2344,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ int main(int argc, char *argv[]) { return bar(argv[argc - 1]); } '''), error_message: 'AVX512F not available').allowed()) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +#include +static int bar(void *a) { + + __m512i *x = a; + __m512i res= _mm512_abs_epi8(*x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512BW not available').allowed()) + have_pvrdma = get_option('pvrdma') \ .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ .require(cc.compiles(gnu_source_prefix + ''' diff --git a/meson_options.txt b/meson_options.txt index 66128178bf..96814dd211 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') option('avx512f', type: 'feature', value: 'disabled', description: 
'AVX512F optimizations') +option('avx512bw', type: 'feature', value: 'auto', + description: 'AVX512BW optimizations') option('keyring', type: 'feature', value: 'auto', description: 'Linux keyring support') diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..ff4c15c9c3 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -83,6 +83,34 @@ /* 0x80 is reserved in migration.h start with 0x100 next */ #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 +int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, + uint8_t *, int) = xbzrle_encode_buffer; +#if defined(CONFIG_AVX512BW_OPT) +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; +} +} +} +} +#endif + XBZRLECacheStats xbzrle_counters; /* struct contains XBZRLE cache and a static page @@ -802,9 +830,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ -encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, - TARGET_PAGE_SIZE, XBZRLE.encoded_buf, - TARGET_PAGE_SIZE); +encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf, +TARGET_PAGE_SIZE, XBZRLE.encoded_buf, +TARGET_PAGE_SIZE); /* * Update the cache contents, so that it corresponds to the data diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 
1ba482ded9..05366e86c0 100644 --- a/migration/xbzrle.c +++ b/migration/xbzrle.c @@ -174,3 +174,127 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen) return d; } + +#if defined(CONFIG_AVX512BW_OPT) +#pragma GCC push_options +#pragma GCC t
[PATCH v6 2/2] Unit test code and benchmark code
Unit test code is in test-xbzrle.c, and benchmark code is in xbzrle-bench.c for performance benchmarking. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- tests/bench/meson.build| 4 + tests/bench/xbzrle-bench.c | 465 + tests/unit/test-xbzrle.c | 39 +++- 3 files changed, 503 insertions(+), 5 deletions(-) create mode 100644 tests/bench/xbzrle-bench.c diff --git a/tests/bench/meson.build b/tests/bench/meson.build index 279a8fcc33..daefead58d 100644 --- a/tests/bench/meson.build +++ b/tests/bench/meson.build @@ -3,6 +3,10 @@ qht_bench = executable('qht-bench', sources: 'qht-bench.c', dependencies: [qemuutil]) +xbzrle_bench = executable('xbzrle-bench', + sources: 'xbzrle-bench.c', + dependencies: [qemuutil,migration]) + executable('atomic_add-bench', sources: files('atomic_add-bench.c'), dependencies: [qemuutil], diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c new file mode 100644 index 00..d71397e6f4 --- /dev/null +++ b/tests/bench/xbzrle-bench.c @@ -0,0 +1,465 @@ +/* + * Xor Based Zero Run Length Encoding unit tests. + * + * Copyright 2013 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Orit Wasserman + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "../migration/xbzrle.h" + +#define XBZRLE_PAGE_SIZE 4096 + +#if defined(CONFIG_AVX512BW_OPT) +static bool is_cpu_support_avx512bw; +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +is_cpu_support_avx512bw = false; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. 
*/ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +is_cpu_support_avx512bw = true; +} +} +} +return ; +} +#endif + +struct ResTime { +float t_raw; +float t_512; +}; + +static void encode_decode_zero(struct ResTime *res) +{ +uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); +int i = 0; +int dlen = 0, dlen512 = 0; +int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); + +for (i = diff_len; i > 0; i--) { +buffer[1000 + i] = i; +buffer512[1000 + i] = i; +} + +buffer[1000 + diff_len + 3] = 103; +buffer[1000 + diff_len + 5] = 105; + +buffer512[1000 + diff_len + 3] = 103; +buffer512[1000 + diff_len + 5] = 105; + +/* encode zero page */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); +dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); +g_assert(dlen == 0); + +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); +float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +res->t_raw = time_val; +res->t_512 = time_val512; + +g_free(buffer); +g_free(compressed); +g_free(buffer512); +g_free(compressed512); + +} + +static void test_encode_decode_zero_avx512(void) +{ +int i; +float time_raw = 0.0, time_512 = 0.0; +struct ResTime res; +for (i = 0; i < 1; i++) { +encode_decode_zero(); +time_raw += res.t_raw; +time_512 += res.t_512; +} 
+printf("Zero test:\n"); +printf("Raw xbzrle_encode time is %f ms\n", time_raw); +printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static void encode_decode_unchanged(struct ResTime *res) +{ +uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); +
[PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer
This commit updates code of avx512 support for xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Runtime check of avx512 support and benchmark for this feature are added. Compared with C version of xbzrle_encode_buffer function, avx512 version can achieve 50%-70% performance improvement on benchmarking. In addition, if dirty data is randomly located in 4K page, the avx512 version can achieve almost 140% performance gain. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- meson.build| 16 ++ meson_options.txt | 2 + migration/ram.c| 34 +++-- migration/xbzrle.c | 124 + migration/xbzrle.h | 4 ++ 5 files changed, 177 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 20fddbd707..5d4b82d7f3 100644 --- a/meson.build +++ b/meson.build @@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ int main(int argc, char *argv[]) { return bar(argv[0]); } '''), error_message: 'AVX512F not available').allowed()) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +#include +static int bar(void *a) { + + __m512i *x = a; + __m512i res= _mm512_abs_epi8(*x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512BW not available').allowed()) + have_pvrdma = get_option('pvrdma') \ .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ .require(cc.compiles(gnu_source_prefix + ''' diff --git a/meson_options.txt b/meson_options.txt index e58e158396..07194bf680 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F optimizations') 
+option('avx512bw', type: 'feature', value: 'auto', + description: 'AVX512BW optimizations') option('keyring', type: 'feature', value: 'auto', description: 'Linux keyring support') diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..ff4c15c9c3 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -83,6 +83,34 @@ /* 0x80 is reserved in migration.h start with 0x100 next */ #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 +int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, + uint8_t *, int) = xbzrle_encode_buffer; +#if defined(CONFIG_AVX512BW_OPT) +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; +} +} +} +} +#endif + XBZRLECacheStats xbzrle_counters; /* struct contains XBZRLE cache and a static page @@ -802,9 +830,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ -encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, - TARGET_PAGE_SIZE, XBZRLE.encoded_buf, - TARGET_PAGE_SIZE); +encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf, +TARGET_PAGE_SIZE, XBZRLE.encoded_buf, +TARGET_PAGE_SIZE); /* * Update the cache contents, so that it corresponds to the data diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 1ba482ded9..05366e86c0 100644 --- 
a/migration/xbzrle.c +++ b/migration/xbzrle.c @@ -174,3 +174,127 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen) return d; } + +#if defined(CONFIG_AVX512BW_OPT) +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +int xbzrle_enc
[PATCH v6 0/2] This patch updates AVX512 support for xbzrle
This patch series updates the AVX512 support for the xbzrle_encode_buffer function. We modified the algorithm code and the AVX512 check. In addition, we provide a benchmark in xbzrle-bench.c for performance comparison. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin ling xu (2): Update AVX512 support for xbzrle_encode_buffer Unit test code and benchmark code meson.build| 16 ++ meson_options.txt | 2 + migration/ram.c| 34 ++- migration/xbzrle.c | 124 ++ migration/xbzrle.h | 4 + tests/bench/meson.build| 4 + tests/bench/xbzrle-bench.c | 465 + tests/unit/test-xbzrle.c | 39 +++- 8 files changed, 680 insertions(+), 8 deletions(-) create mode 100644 tests/bench/xbzrle-bench.c -- 2.25.1
[PATCH v5 0/2] This patch updates AVX512 support for xbzrle
This patch series updates the AVX512 support for the xbzrle_encode_buffer function. We modified the runtime check for AVX512 and simplified the algorithm. In addition, we provide a benchmark in xbzrle-bench.c for performance comparison. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin ling xu (2): Update AVX512 support for xbzrle_encode_buffer Test code and benchmark code meson.build| 16 ++ meson_options.txt | 2 + migration/ram.c| 35 ++- migration/xbzrle.c | 130 +++ migration/xbzrle.h | 4 + tests/bench/meson.build| 4 + tests/bench/xbzrle-bench.c | 468 + tests/unit/test-xbzrle.c | 39 +++- 8 files changed, 690 insertions(+), 8 deletions(-) create mode 100644 tests/bench/xbzrle-bench.c -- 2.25.1
[PATCH v5 2/2] Test code and benchmark code
Test code in test-xbzrle.c, benchmark code in xbzrle-bench.c for performance benchmarking. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- tests/bench/meson.build| 4 + tests/bench/xbzrle-bench.c | 468 + tests/unit/test-xbzrle.c | 39 +++- 3 files changed, 506 insertions(+), 5 deletions(-) create mode 100644 tests/bench/xbzrle-bench.c diff --git a/tests/bench/meson.build b/tests/bench/meson.build index 279a8fcc33..daefead58d 100644 --- a/tests/bench/meson.build +++ b/tests/bench/meson.build @@ -3,6 +3,10 @@ qht_bench = executable('qht-bench', sources: 'qht-bench.c', dependencies: [qemuutil]) +xbzrle_bench = executable('xbzrle-bench', + sources: 'xbzrle-bench.c', + dependencies: [qemuutil,migration]) + executable('atomic_add-bench', sources: files('atomic_add-bench.c'), dependencies: [qemuutil], diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c new file mode 100644 index 00..6ffac62e15 --- /dev/null +++ b/tests/bench/xbzrle-bench.c @@ -0,0 +1,468 @@ +/* + * Xor Based Zero Run Length Encoding unit tests. + * + * Copyright 2013 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Orit Wasserman + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "../migration/xbzrle.h" + +#define XBZRLE_PAGE_SIZE 4096 + +//int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, +// uint8_t *, int) = xbzrle_encode_buffer; +#if defined(CONFIG_AVX512BW_OPT) +static bool is_cpu_support_avx512bw; +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +is_cpu_support_avx512bw = false; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. 
*/ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +//xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; +is_cpu_support_avx512bw = true; +} +} +} +return ; +} +#endif + +struct ResTime{ +float t_raw; +float t_512; +}; + +static void encode_decode_zero(struct ResTime *res) +{ +uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); +int i = 0; +int dlen = 0, dlen512 = 0; +int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); + +for (i = diff_len; i > 0; i--) { +buffer[1000 + i] = i; +buffer512[1000 + i] = i; +} + +buffer[1000 + diff_len + 3] = 103; +buffer[1000 + diff_len + 5] = 105; + +buffer512[1000 + diff_len + 3] = 103; +buffer512[1000 + diff_len + 5] = 105; + +/* encode zero page */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); +dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); +g_assert(dlen == 0); + +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); +float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +res->t_raw = time_val; +res->t_512 = time_val512; + +g_free(buffer); +g_free(compressed); +g_free(buffer512); +g_free(compressed512); + +} + +static void test_encode_decode_zero_avx512(void) +{ +int i; +float time_raw = 0.0, time_512 = 0.0; +struct ResTime res; +for (i = 0; i < 1; i++) { +encode_decode_zero(); 
+time_raw += res.t_raw; +time_512 += res.t_512; +} +printf("Zero test:\n"); +printf("Raw xbzrle_encode time is %f ms\n", time_raw); +printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static void encode_decode_unchanged(struct ResTime *re
[PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer
This commit updates code of avx512 support for xbzrle_encode_buffer function to accelerate xbzrle encoding speed. We add runtime check of avx512 and add benchmark for this feature. Compared with C version of xbzrle_encode_buffer function, avx512 version can achieve 50%-70% performance improvement on benchmarking. In addition, if dirty data is randomly located in 4K page, the avx512 version can achieve almost 140% performance gain. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- meson.build| 16 ++ meson_options.txt | 2 + migration/ram.c| 35 ++-- migration/xbzrle.c | 130 + migration/xbzrle.h | 4 ++ 5 files changed, 184 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 30a380752c..c9d90a5bff 100644 --- a/meson.build +++ b/meson.build @@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ int main(int argc, char *argv[]) { return bar(argv[0]); } '''), error_message: 'AVX512F not available').allowed()) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +#include +static int bar(void *a) { + + __m512i x = *(__m512i *)a; + __m512i res= _mm512_abs_epi8(x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512BW not available').allowed()) + have_pvrdma = get_option('pvrdma') \ .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ .require(cc.compiles(gnu_source_prefix + ''' diff --git a/meson_options.txt b/meson_options.txt index e58e158396..07194bf680 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F optimizations') 
+option('avx512bw', type: 'feature', value: 'auto', + description: 'AVX512BW optimizations') option('keyring', type: 'feature', value: 'auto', description: 'Linux keyring support') diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..387bef2675 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -83,6 +83,35 @@ /* 0x80 is reserved in migration.h start with 0x100 next */ #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 +int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, + uint8_t *, int) = xbzrle_encode_buffer; +#if defined(CONFIG_AVX512BW_OPT) +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; +} +} +} +return ; +} +#endif + XBZRLECacheStats xbzrle_counters; /* struct contains XBZRLE cache and a static page @@ -802,9 +831,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ -encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, - TARGET_PAGE_SIZE, XBZRLE.encoded_buf, - TARGET_PAGE_SIZE); +encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf, +TARGET_PAGE_SIZE, XBZRLE.encoded_buf, +TARGET_PAGE_SIZE); /* * Update the cache contents, so that it corresponds to the data diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 1ba482ded9..6da7f79625 100644 
--- a/migration/xbzrle.c +++ b/migration/xbzrle.c @@ -174,3 +174,133 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen) return d; } + +#if defined(CONFIG_AVX512BW_OPT) +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +int xbzrl
[PATCH v4 2/2] Update test code of AVX512 support for xbzrle_encode
Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- tests/unit/test-xbzrle.c | 458 ++- 1 file changed, 457 insertions(+), 1 deletion(-) diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c index ef951b6e54..2676123ce3 100644 --- a/tests/unit/test-xbzrle.c +++ b/tests/unit/test-xbzrle.c @@ -16,6 +16,35 @@ #define XBZRLE_PAGE_SIZE 4096 +#if defined(CONFIG_AVX512BW_OPT) +static bool is_cpu_support_avx512bw; +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +is_cpu_support_avx512bw = false; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +is_cpu_support_avx512bw = true; +} +} +} +return ; +} +#endif + static void test_uleb(void) { uint32_t i, val; @@ -173,11 +202,438 @@ static void test_encode_decode(void) } } +static float *encode_decode_zero(void) +{ +uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); +int i = 0; +int dlen = 0, dlen512 = 0; +int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); + +for (i = diff_len; i > 0; i--) { +buffer[1000 + i] = i; +buffer512[1000 + i] = i; +} + +buffer[1000 + diff_len + 3] = 103; +buffer[1000 + diff_len + 5] = 105; + +buffer512[1000 + diff_len + 3] = 103; +buffer512[1000 + diff_len + 5] = 105; + +/* encode zero page */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); +dlen = 
xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, + XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); +g_assert(dlen == 0); + +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); +float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +static float result_zero[2]; +result_zero[0] = time_val; +result_zero[1] = time_val512; + +g_free(buffer); +g_free(compressed); +g_free(buffer512); +g_free(compressed512); + +return result_zero; +} + +static void test_encode_decode_zero_avx512(void) +{ +int i; +float time_raw = 0.0, time_512 = 0.0; +float *res; +for (i = 0; i < 1; i++) { +res = encode_decode_zero(); +time_raw += res[0]; +time_512 += res[1]; +} +printf("Zero test:\n"); +printf("Raw xbzrle_encode time is %f ms\n", time_raw); +printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static float *encode_decode_unchanged(void) +{ +uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); +int i = 0; +int dlen = 0, dlen512 = 0; +int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); + +for (i = diff_len; i > 0; i--) { +test[1000 + i] = i + 4; +test512[1000 + i] = i + 4; +} + +test[1000 + diff_len + 3] = 107; +test[1000 + diff_len + 5] = 109; + +test512[1000 + diff_len + 3] = 107; +test512[1000 + diff_len + 5] = 109; + +/* test unchanged buffer */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); +dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed, +XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); +g_assert(dlen == 0); + +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_avx512(test512, test512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); 
+float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +static float result_unchanged[2]; +result_unchanged
[PATCH v4 0/2] This patch updates AVX512 support for xbzrle
This patch series updates the AVX512 support for the xbzrle_encode_buffer function to accelerate xbzrle encoding speed. The runtime check code in meson.build and meson_options.txt is kept unchanged. The updated AVX512 algorithm is provided in ram.c, xbzrle.c and xbzrle.h. The test code is updated in test-xbzrle.c. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin ling xu (2): Update AVX512 support for xbzrle_encode_buffer function Update test code of AVX512 support for xbzrle_encode meson.build | 16 ++ meson_options.txt| 2 + migration/ram.c | 42 +++- migration/xbzrle.c | 181 migration/xbzrle.h | 4 + tests/unit/test-xbzrle.c | 458 ++- 6 files changed, 699 insertions(+), 4 deletions(-) -- 2.25.1
[PATCH v4 1/2] Update AVX512 support for xbzrle_encode_buffer function
This commit update code of avx512 support for xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Compared with C version of xbzrle_encode_buffer function, avx512 version can achieve almost 60%-70% performance improvement on unit test provided by Qemu. In addition, we provide one more unit test called "test_encode_decode_random_avx512", in which dirty data are randomly located in 4K page, and this case can achieve almost 140% performance gain. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- meson.build| 16 meson_options.txt | 2 + migration/ram.c| 42 ++- migration/xbzrle.c | 181 + migration/xbzrle.h | 4 + 5 files changed, 242 insertions(+), 3 deletions(-) diff --git a/meson.build b/meson.build index 294e9a8f32..4222b77e9f 100644 --- a/meson.build +++ b/meson.build @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ int main(int argc, char *argv[]) { return bar(argv[0]); } '''), error_message: 'AVX512F not available').allowed()) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +#include +static int bar(void *a) { + + __m512i x = *(__m512i *)a; + __m512i res= _mm512_abs_epi8(x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512BW not available').allowed()) + have_pvrdma = get_option('pvrdma') \ .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ .require(cc.compiles(gnu_source_prefix + ''' diff --git a/meson_options.txt b/meson_options.txt index e58e158396..07194bf680 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F 
optimizations') +option('avx512bw', type: 'feature', value: 'auto', + description: 'AVX512BW optimizations') option('keyring', type: 'feature', value: 'auto', description: 'Linux keyring support') diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..bae7bef236 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -83,6 +83,35 @@ /* 0x80 is reserved in migration.h start with 0x100 next */ #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 +#if defined(CONFIG_AVX512BW_OPT) +static bool is_cpu_support_avx512bw; +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +is_cpu_support_avx512bw = false; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +is_cpu_support_avx512bw = true; +} +} +} +return ; +} +#endif + XBZRLECacheStats xbzrle_counters; /* struct contains XBZRLE cache and a static page @@ -802,9 +831,16 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ -encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, - TARGET_PAGE_SIZE, XBZRLE.encoded_buf, - TARGET_PAGE_SIZE); +int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int, + uint8_t *, int) = xbzrle_encode_buffer; +#if defined(CONFIG_AVX512BW_OPT) +if (likely(is_cpu_support_avx512bw)) { +xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512; +} +#endif +encoded_len = xbzrle_encode_buffer_func(prev_cached_page, XBZRLE.current_buf, 
+TARGET_PAGE_SIZE, XBZRLE.encoded_buf, +TARGET_PAGE_SIZE); /* * Update the cache contents, so that it corresponds to the data diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 1ba482ded9..804adc1acb 100644 --- a/migration/xbzrle.c +++
[PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer
Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- tests/unit/test-xbzrle.c | 307 --- 1 file changed, 290 insertions(+), 17 deletions(-) diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c index ef951b6e54..653016826f 100644 --- a/tests/unit/test-xbzrle.c +++ b/tests/unit/test-xbzrle.c @@ -38,111 +38,280 @@ static void test_uleb(void) g_assert(val == 0); } -static void test_encode_decode_zero(void) +static float *test_encode_decode_zero(void) { uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); int i = 0; -int dlen = 0; +int dlen = 0, dlen512 = 0; int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); for (i = diff_len; i > 0; i--) { buffer[1000 + i] = i; +buffer512[1000 + i] = i; } buffer[1000 + diff_len + 3] = 103; buffer[1000 + diff_len + 5] = 105; +buffer512[1000 + diff_len + 3] = 103; +buffer512[1000 + diff_len + 5] = 105; + /* encode zero page */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); g_assert(dlen == 0); +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); +float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +static float result_zero[2]; +result_zero[0] = time_val; +result_zero[1] = time_val512; + g_free(buffer); g_free(compressed); +g_free(buffer512); +g_free(compressed512); + +return result_zero; +} + +static void test_encode_decode_zero_range(void) +{ +int i; +float time_raw = 0.0, time_512 = 0.0; +float *res; +for (i = 0; i < 1; i++) { +res = test_encode_decode_zero(); +time_raw += res[0]; +time_512 += res[1]; +} +printf("Zero test:\n"); 
+printf("Raw xbzrle_encode time is %f ms\n", time_raw); +printf("512 xbzrle_encode time is %f ms\n", time_512); } -static void test_encode_decode_unchanged(void) +static float *test_encode_decode_unchanged(void) { uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); int i = 0; -int dlen = 0; +int dlen = 0, dlen512 = 0; int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); for (i = diff_len; i > 0; i--) { test[1000 + i] = i + 4; +test512[1000 + i] = i + 4; } test[1000 + diff_len + 3] = 107; test[1000 + diff_len + 5] = 109; +test512[1000 + diff_len + 3] = 107; +test512[1000 + diff_len + 5] = 109; + /* test unchanged buffer */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); g_assert(dlen == 0); +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_512(test512, test512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); +float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +static float result_unchanged[2]; +result_unchanged[0] = time_val; +result_unchanged[1] = time_val512; + g_free(test); g_free(compressed); +g_free(test512); +g_free(compressed512); + +return result_unchanged; } -static void test_encode_decode_1_byte(void) +static void test_encode_decode_unchanged_range(void) +{ +int i; +float time_raw = 0.0, time_512 = 0.0; +float *res; +for (i = 0; i < 1; i++) { +res = test_encode_decode_unchanged(); +time_raw += res[0]; +time_512 += res[1]; +} +printf("Unchanged test:\n"); +printf("Raw xbzrle_encode time is %f ms\n", time_raw); +printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static float *test_encode_decode_1_byte(void) { uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t 
*test = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE); -int dlen = 0, rc = 0; +uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_mal
[PATCH v3 0/2] This patch updates runtime check of AVX512
This patch updates the runtime check of AVX512 and updates the AVX512 support for the xbzrle_encode_buffer function to accelerate xbzrle encoding speed. The runtime check is updated in meson.build and meson_options.txt. The updated AVX512 algorithm is provided in ram.c, xbzrle.c and xbzrle.h. The test code is provided in test-xbzrle.c. The previous discussion is referenced below: https://www.mail-archive.com/qemu-devel@nongnu.org/msg903520.html ling xu (2): Update AVX512 support for xbzrle_encode_buffer function Test code for AVX512 support for xbzrle_encode_buffer meson.build | 16 ++ meson_options.txt| 2 + migration/ram.c | 41 ++ migration/xbzrle.c | 181 +++ migration/xbzrle.h | 4 + tests/unit/test-xbzrle.c | 307 --- 6 files changed, 534 insertions(+), 17 deletions(-) -- 2.25.1
[PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
This commit update runtime check of AVX512, and implements avx512 of xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Compared with C version of xbzrle_encode_buffer function, avx512 version can achieve almost 60%-70% performance improvement on unit test provided by Qemu. In addition, we provide one more unit test called "test_encode_decode_random", in which dirty data are randomly located in 4K page, and this case can achieve almost 140% performance gain. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- meson.build| 16 meson_options.txt | 2 + migration/ram.c| 41 ++ migration/xbzrle.c | 181 + migration/xbzrle.h | 4 + 5 files changed, 244 insertions(+) diff --git a/meson.build b/meson.build index 294e9a8f32..4222b77e9f 100644 --- a/meson.build +++ b/meson.build @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ int main(int argc, char *argv[]) { return bar(argv[0]); } '''), error_message: 'AVX512F not available').allowed()) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +#include +static int bar(void *a) { + + __m512i x = *(__m512i *)a; + __m512i res= _mm512_abs_epi8(x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512BW not available').allowed()) + have_pvrdma = get_option('pvrdma') \ .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ .require(cc.compiles(gnu_source_prefix + ''' diff --git a/meson_options.txt b/meson_options.txt index e58e158396..07194bf680 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F 
optimizations') +option('avx512bw', type: 'feature', value: 'auto', + description: 'AVX512BW optimizations') option('keyring', type: 'feature', value: 'auto', description: 'Linux keyring support') diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..d9c1ac2f7a 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -83,6 +83,35 @@ /* 0x80 is reserved in migration.h start with 0x100 next */ #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 +#if defined(CONFIG_AVX512BW_OPT) +static bool IS_CPU_SUPPORT_AVX512BW; +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +IS_CPU_SUPPORT_AVX512BW = false; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +IS_CPU_SUPPORT_AVX512BW = true; +} +} +} +return ; +} +#endif + XBZRLECacheStats xbzrle_counters; /* struct contains XBZRLE cache and a static page @@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ +#if defined(CONFIG_AVX512BW_OPT) +if (likely(IS_CPU_SUPPORT_AVX512BW)) { +encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf, + TARGET_PAGE_SIZE, XBZRLE.encoded_buf, + TARGET_PAGE_SIZE); +} else { +encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, + TARGET_PAGE_SIZE, XBZRLE.encoded_buf, + TARGET_PAGE_SIZE); +} +#else encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf, TARGET_PAGE_SIZE); +#endif /* * Update the cache contents, so that it corresponds to the data diff --git a/migration/xbzrle.c
[PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer
Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- tests/unit/test-xbzrle.c | 307 --- 1 file changed, 290 insertions(+), 17 deletions(-) diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c index ef951b6e54..653016826f 100644 --- a/tests/unit/test-xbzrle.c +++ b/tests/unit/test-xbzrle.c @@ -38,111 +38,280 @@ static void test_uleb(void) g_assert(val == 0); } -static void test_encode_decode_zero(void) +static float *test_encode_decode_zero(void) { uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); int i = 0; -int dlen = 0; +int dlen = 0, dlen512 = 0; int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); for (i = diff_len; i > 0; i--) { buffer[1000 + i] = i; +buffer512[1000 + i] = i; } buffer[1000 + diff_len + 3] = 103; buffer[1000 + diff_len + 5] = 105; +buffer512[1000 + diff_len + 3] = 103; +buffer512[1000 + diff_len + 5] = 105; + /* encode zero page */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); g_assert(dlen == 0); +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); +float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +static float result_zero[2]; +result_zero[0] = time_val; +result_zero[1] = time_val512; + g_free(buffer); g_free(compressed); +g_free(buffer512); +g_free(compressed512); + +return result_zero; +} + +static void test_encode_decode_zero_range(void) +{ +int i; +float time_raw = 0.0, time_512 = 0.0; +float *res; +for (i = 0; i < 1; i++) { +res = test_encode_decode_zero(); +time_raw += res[0]; +time_512 += res[1]; +} +printf("Zero test:\n"); 
+printf("Raw xbzrle_encode time is %f ms\n", time_raw); +printf("512 xbzrle_encode time is %f ms\n", time_512); } -static void test_encode_decode_unchanged(void) +static float *test_encode_decode_unchanged(void) { uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); int i = 0; -int dlen = 0; +int dlen = 0, dlen512 = 0; int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); for (i = diff_len; i > 0; i--) { test[1000 + i] = i + 4; +test512[1000 + i] = i + 4; } test[1000 + diff_len + 3] = 107; test[1000 + diff_len + 5] = 109; +test512[1000 + diff_len + 3] = 107; +test512[1000 + diff_len + 5] = 109; + /* test unchanged buffer */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); g_assert(dlen == 0); +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_512(test512, test512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); +float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +static float result_unchanged[2]; +result_unchanged[0] = time_val; +result_unchanged[1] = time_val512; + g_free(test); g_free(compressed); +g_free(test512); +g_free(compressed512); + +return result_unchanged; } -static void test_encode_decode_1_byte(void) +static void test_encode_decode_unchanged_range(void) +{ +int i; +float time_raw = 0.0, time_512 = 0.0; +float *res; +for (i = 0; i < 1; i++) { +res = test_encode_decode_unchanged(); +time_raw += res[0]; +time_512 += res[1]; +} +printf("Unchanged test:\n"); +printf("Raw xbzrle_encode time is %f ms\n", time_raw); +printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static float *test_encode_decode_1_byte(void) { uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t 
*test = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE); -int dlen = 0, rc = 0; +uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_mal
[PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
This commit update runtime check of AVX512, and implements avx512 of xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Compared with C version of xbzrle_encode_buffer function, avx512 version can achieve almost 60%-70% performance improvement on unit test provided by Qemu. In addition, we provide one more unit test called "test_encode_decode_random", in which dirty data are randomly located in 4K page, and this case can achieve almost 140% performance gain. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- meson.build| 16 meson_options.txt | 2 + migration/ram.c| 41 ++ migration/xbzrle.c | 181 + migration/xbzrle.h | 4 + 5 files changed, 244 insertions(+) diff --git a/meson.build b/meson.build index 294e9a8f32..4222b77e9f 100644 --- a/meson.build +++ b/meson.build @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ int main(int argc, char *argv[]) { return bar(argv[0]); } '''), error_message: 'AVX512F not available').allowed()) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +#include +static int bar(void *a) { + + __m512i x = *(__m512i *)a; + __m512i res= _mm512_abs_epi8(x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512BW not available').allowed()) + have_pvrdma = get_option('pvrdma') \ .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ .require(cc.compiles(gnu_source_prefix + ''' diff --git a/meson_options.txt b/meson_options.txt index e58e158396..07194bf680 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F 
optimizations') +option('avx512bw', type: 'feature', value: 'auto', + description: 'AVX512BW optimizations') option('keyring', type: 'feature', value: 'auto', description: 'Linux keyring support') diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..d9c1ac2f7a 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -83,6 +83,35 @@ /* 0x80 is reserved in migration.h start with 0x100 next */ #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 +#if defined(CONFIG_AVX512BW_OPT) +static bool IS_CPU_SUPPORT_AVX512BW; +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +IS_CPU_SUPPORT_AVX512BW = false; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +IS_CPU_SUPPORT_AVX512BW = true; +} +} +} +return ; +} +#endif + XBZRLECacheStats xbzrle_counters; /* struct contains XBZRLE cache and a static page @@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ +#if defined(CONFIG_AVX512BW_OPT) +if (likely(IS_CPU_SUPPORT_AVX512BW)) { +encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf, + TARGET_PAGE_SIZE, XBZRLE.encoded_buf, + TARGET_PAGE_SIZE); +} else { +encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, + TARGET_PAGE_SIZE, XBZRLE.encoded_buf, + TARGET_PAGE_SIZE); +} +#else encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf, TARGET_PAGE_SIZE); +#endif /* * Update the cache contents, so that it corresponds to the data diff --git a/migration/xbzrle.c
[PATCH v3 0/2] This patch updates runtime check of AVX512
This patch updates the runtime check of AVX512 and updates the AVX512 support for the xbzrle_encode_buffer function to accelerate xbzrle encoding speed. The runtime check is updated in meson.build and meson_options.txt. The updated AVX512 algorithm is provided in ram.c, xbzrle.c and xbzrle.h. The test code is provided in test-xbzrle.c. ling xu (2): Update AVX512 support for xbzrle_encode_buffer function Test code for AVX512 support for xbzrle_encode_buffer meson.build | 16 ++ meson_options.txt| 2 + migration/ram.c | 41 ++ migration/xbzrle.c | 181 +++ migration/xbzrle.h | 4 + tests/unit/test-xbzrle.c | 307 --- 6 files changed, 534 insertions(+), 17 deletions(-) -- 2.25.1
[PATCH v2 0/2] This patch adds runtime check of AVX512
This patch adds a runtime check of AVX512 on the running machine and updates the AVX512 support for the xbzrle_encode_buffer function to accelerate xbzrle encoding speed. The runtime check is added in meson.build and meson_options.txt. The updated AVX512 algorithm is provided in ram.c, xbzrle.h and xbzrle.c. The test code is provided in test-xbzrle.c. The previous discussion is referenced below: https://lore.kernel.org/all/ytlshitevijwe...@redhat.com/ ling xu (2): Update AVX512 support for xbzrle_encode_buffer function Test code for AVX512 support for xbzrle_encode_buffer function meson.build | 211 +++ meson_options.txt| 28 migration/ram.c | 41 ++ migration/xbzrle.c | 181 +++ migration/xbzrle.h | 4 + tests/unit/test-xbzrle.c | 307 --- 6 files changed, 755 insertions(+), 17 deletions(-) -- 2.25.1
[PATCH v2 2/2] Test code for AVX512 support for xbzrle_encode_buffer function
Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- tests/unit/test-xbzrle.c | 307 --- 1 file changed, 290 insertions(+), 17 deletions(-) diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c index ef951b6e54..653016826f 100644 --- a/tests/unit/test-xbzrle.c +++ b/tests/unit/test-xbzrle.c @@ -38,111 +38,280 @@ static void test_uleb(void) g_assert(val == 0); } -static void test_encode_decode_zero(void) +static float *test_encode_decode_zero(void) { uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); int i = 0; -int dlen = 0; +int dlen = 0, dlen512 = 0; int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); for (i = diff_len; i > 0; i--) { buffer[1000 + i] = i; +buffer512[1000 + i] = i; } buffer[1000 + diff_len + 3] = 103; buffer[1000 + diff_len + 5] = 105; +buffer512[1000 + diff_len + 3] = 103; +buffer512[1000 + diff_len + 5] = 105; + /* encode zero page */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); g_assert(dlen == 0); +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); +float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +static float result_zero[2]; +result_zero[0] = time_val; +result_zero[1] = time_val512; + g_free(buffer); g_free(compressed); +g_free(buffer512); +g_free(compressed512); + +return result_zero; +} + +static void test_encode_decode_zero_range(void) +{ +int i; +float time_raw = 0.0, time_512 = 0.0; +float *res; +for (i = 0; i < 1; i++) { +res = test_encode_decode_zero(); +time_raw += res[0]; +time_512 += res[1]; +} +printf("Zero test:\n"); 
+printf("Raw xbzrle_encode time is %f ms\n", time_raw); +printf("512 xbzrle_encode time is %f ms\n", time_512); } -static void test_encode_decode_unchanged(void) +static float *test_encode_decode_unchanged(void) { uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); int i = 0; -int dlen = 0; +int dlen = 0, dlen512 = 0; int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006); for (i = diff_len; i > 0; i--) { test[1000 + i] = i + 4; +test512[1000 + i] = i + 4; } test[1000 + diff_len + 3] = 107; test[1000 + diff_len + 5] = 109; +test512[1000 + diff_len + 3] = 107; +test512[1000 + diff_len + 5] = 109; + /* test unchanged buffer */ +time_t t_start, t_end, t_start512, t_end512; +t_start = clock(); dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed, XBZRLE_PAGE_SIZE); +t_end = clock(); +float time_val = difftime(t_end, t_start); g_assert(dlen == 0); +t_start512 = clock(); +dlen512 = xbzrle_encode_buffer_512(test512, test512, XBZRLE_PAGE_SIZE, + compressed512, XBZRLE_PAGE_SIZE); +t_end512 = clock(); +float time_val512 = difftime(t_end512, t_start512); +g_assert(dlen512 == 0); + +static float result_unchanged[2]; +result_unchanged[0] = time_val; +result_unchanged[1] = time_val512; + g_free(test); g_free(compressed); +g_free(test512); +g_free(compressed512); + +return result_unchanged; } -static void test_encode_decode_1_byte(void) +static void test_encode_decode_unchanged_range(void) +{ +int i; +float time_raw = 0.0, time_512 = 0.0; +float *res; +for (i = 0; i < 1; i++) { +res = test_encode_decode_unchanged(); +time_raw += res[0]; +time_512 += res[1]; +} +printf("Unchanged test:\n"); +printf("Raw xbzrle_encode time is %f ms\n", time_raw); +printf("512 xbzrle_encode time is %f ms\n", time_512); +} + +static float *test_encode_decode_1_byte(void) { uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t 
*test = g_malloc0(XBZRLE_PAGE_SIZE); uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE); -int dlen = 0, rc = 0; +uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE); +uint8_t *compressed512 = g_mal
[PATCH v2 1/2] Update AVX512 support for xbzrle_encode_buffer function
This commit adds runtime check of AVX512 on running machine, and implements AVX512 of xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Compared with C version of xbzrle_encode_buffer function, AVX512 version can achieve almost 60%-70% performance improvement on unit test provided by qemu. In addition, we provide one more unit test called "test_encode_decode_random", in which dirty data are randomly located in 4K page, and this case can achieve almost 140% performance gain. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- meson.build| 211 + meson_options.txt | 28 ++ migration/ram.c| 41 + migration/xbzrle.c | 181 ++ migration/xbzrle.h | 4 + 5 files changed, 465 insertions(+) diff --git a/meson.build b/meson.build index 294e9a8f32..9228df2442 100644 --- a/meson.build +++ b/meson.build @@ -2262,6 +2262,217 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ int main(int argc, char *argv[]) { return bar(argv[0]); } '''), error_message: 'AVX512F not available').allowed()) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +#include +static int bar(void *a) { + + __m512i x = *(__m512i *)a; + __m512i res= _mm512_abs_epi8(x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512BW not available').allowed()) + +config_host_data.set('CONFIG_AVX512CD_OPT', get_option('avx512cd') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512CD') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512cd") +#include +#include +static int bar(void *a) { + + __m512i x = *(__m512i *)a; + __mmask16 k; + __m512i res= _mm512_maskz_lzcnt_epi32 (k, x); + return res[1]; +} +int main(int argc, char *argv[]) { return 
bar(argv[0]); } + '''), error_message: 'AVX512CD not available').allowed()) + +config_host_data.set('CONFIG_AVX512DQ_OPT', get_option('avx512dq') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512D') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512dq") +#include +#include +static int bar(void *a) { + + __mmask x = *(__mmask *)a; + __mmask8 b; + return _kxor_mask8(x,b); +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512DQ not available').allowed()) + +config_host_data.set('CONFIG_AVX512ER_OPT', get_option('avx512er') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512ER') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512er") +#include +#include +static int bar(void *a) { + + __m512d x = *(__m512d *)a; + __m512d res=_mm512_rsqrt28_pd(x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512ER not available').allowed()) + + +config_host_data.set('CONFIG_AVX512IFMA52_OPT', get_option('avx512ifma52') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512ER') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512ifma") +#include +#include +static int bar(void *a) { + + __m512i x = *(__m512i *)a; + __m512i b,c; + __m512i res= _mm512_madd52lo_epu64 (x, b, c); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512IFMA52 not available').allowed()) + + +config_host_data.set('CONFIG_AVX512PF_OPT', get_option('avx512pf') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512PF') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512pf") +#include +#include +static void bar(void *a) { + char* base_addr; + __mmask8 k; + __m512i vindex = *(__m512i *)a; + _mm512_mask_prefetch_i64scatter_pd 
(base_addr, k, vindex, 1, 2); +} +int main(int argc, char *argv[]) { bar(argv[0]); return 0;} + '''), error_message: 'AVX512PF not available').allowed()) + + +config_host_data.set('CONFIG_AVX512VPOPCNTDQ_OPT', get_option('avx512vpopcntdq') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512VPOPCNTDQ') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512vpopcntdq") +#include +#include +static int ba
[PATCH 0/1] This patch provides AVX512 support for xbzrle_encode_buffer function
This patch adds AVX512 support for the xbzrle_encode_buffer function to accelerate xbzrle encoding speed. The specific AVX512 implementation is provided in qemu/migration/xbzrle.c. We provide AVX512 intrinsic support in the qemu/configure file to enable AVX512 in the compiler. An added unit test called "test_encode_decode_random" is provided in qemu/tests/unit/test-xbzrle.c. ling xu (1): Add AVX512 support for xbzrle_encode_buffer function configure| 434 ++- migration/ram.c | 6 + migration/xbzrle.c | 177 migration/xbzrle.h | 4 + tests/unit/test-xbzrle.c | 307 +-- 5 files changed, 908 insertions(+), 20 deletions(-) -- 2.25.1
[PATCH 1/1] Add AVX512 support for xbzrle_encode_buffer function
This commit adds AVX512 implementation of xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Compared with C version of xbzrle_encode_buffer function, AVX512 version can achieve almost 60%-70% performance improvement on unit test provided by qemu. In addition, we provide one more unit test called "test_encode_decode_random", in which dirty data are randomly located in 4K page, and this case can achieve almost 140% performance gain. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- configure| 434 ++- migration/ram.c | 6 + migration/xbzrle.c | 177 migration/xbzrle.h | 4 + tests/unit/test-xbzrle.c | 307 +-- 5 files changed, 908 insertions(+), 20 deletions(-) diff --git a/configure b/configure index 4f12481765..7685479297 100755 --- a/configure +++ b/configure @@ -587,6 +587,431 @@ else cpu=$(uname -m) fi +# cpu flag for x86 +x86_cpu_flags="" + +get_x86_cpu_flags(){ +# check sse flag +cat > $TMPC << EOF +#include +int main(int argc, char *argv[]) +{ +__m128 a,b; +int res= _mm_ucomieq_ss(a,b); +return 0; +} +EOF +local_cpu_flag="-msse" +rm -f $TMPE +do_cc $local_cpu_flag -o $TMPE $TMPC +if [ -e $TMPE ] +then +x86_cpu_flags="$x86_cpu_flags -msse" +fi + +# check sse2 flag +cat > $TMPC << EOF +#include +int main(int argc, char *argv[]) +{ +__m128d a,b; +int res= _mm_ucomineq_sd (a, b); +return 0; +} +EOF +local_cpu_flag="-msse2" +rm -f $TMPE +do_cc $local_cpu_flag -o $TMPE $TMPC +if [ -e $TMPE ] +then +x86_cpu_flags="$x86_cpu_flags -msse2" +fi + +# check sse3 flag +cat > $TMPC << EOF +#include +int main(int argc, char *argv[]) +{ +__m128 a; +__m128 res= _mm_moveldup_ps (a); +return 0; +} +EOF +local_cpu_flag="-msse3" +rm -f $TMPE +do_cc $local_cpu_flag -o $TMPE $TMPC +if [ -e $TMPE ] +then +x86_cpu_flags="$x86_cpu_flags -msse3" +fi + +# check ssse3 flag +cat > $TMPC << EOF +#include +int main(int argc, char *argv[]) +{ +__m128i a; +__m12i8 res= _mm_abs_epi32 (a); +return 0; +} +EOF +local_cpu_flag="-mssse3" +rm -f $TMPE +do_cc 
$local_cpu_flag -o $TMPE $TMPC +if [ -e $TMPE ] +then +x86_cpu_flags="$x86_cpu_flags -mssse3" +fi + +# check sse4.1 flag +cat > $TMPC << EOF +#include +int main(int argc, char *argv[]) +{ +__m128 a; +__m128 res=_mm_ceil_ps (a); +return 0; +} +EOF +local_cpu_flag="-msse4.1" +rm -f $TMPE +do_cc $local_cpu_flag -o $TMPE $TMPC +if [ -e $TMPE ] +then +x86_cpu_flags="$x86_cpu_flags -msse4.1" +fi + +# check sse4.2 flag +cat > $TMPC << EOF +#include +int main(int argc, char *argv[]) +{ +__m128i a,b; +__m128i res=_mm_cmpgt_epi64 (a,b); +return 0; +} +EOF +local_cpu_flag="-msse4.2" +rm -f $TMPE +do_cc $local_cpu_flag -o $TMPE $TMPC +if [ -e $TMPE ] +then +x86_cpu_flags="$x86_cpu_flags -msse4.2" +fi + +# check avx flag +cat > $TMPC << EOF +#include +int main(int argc, char *argv[]) +{ +__m256 a; +__m256 res= _mm256_ceil_ps(a); +return 0; +} +EOF +local_cpu_flag="-mavx" +rm -f $TMPE +do_cc $local_cpu_flag -o $TMPE $TMPC +if [ -e $TMPE ] +then +x86_cpu_flags="$x86_cpu_flags -mavx" +fi + +# check avx2 flag +cat > $TMPC << EOF +#include +int main(int argc, char *argv[]) +{ +__m256i a; +__m256i res= _mm256_abs_epi32(a); +return 0; +} +EOF +local_cpu_flag="-mavx2" +rm -f $TMPE +do_cc $local_cpu_flag -o $TMPE $TMPC +if [ -e $TMPE ] +then +x86_cpu_flags="$x86_cpu_flags -mavx2" +fi + +# check AVX_VNNI flag +cat > $TMPC << EOF +#include +int main(int argc, char *argv[]) +{ +__m128i a,b,c; +__m128i res= _mm_dpbusd_epi32(a,b,c); +return 0; +} +EOF +local_cpu_flag="-mavxvnni" +rm -f $TMPE +do_cc $local_cpu_flag -o $TMPE $TMPC +if [ -e $TMPE ] +then +x86_cpu_flags="$x86_cpu_flags -mavxvnni" +fi + +# check AVX512f flag +cat > $TMPC << EOF +#include +int main(int argc, char *argv[]) +{ +__m128d a; +__m512d res= _mm512_broadcastsd_pd(a); +return 0; +} +EOF +local_cpu_flag="-mavx512f" +rm -f $TMPE +do_cc $local_cpu_flag -o $TMPE $TMPC +if [ -e $TMPE ] +then +x86_cpu_flags="$x86_cpu_flags -mavx512f" +fi + +# check AVX512bw flag +cat > $TMPC << EOF +#include +int mai