[PATCH v7 0/2] Update AVX512 support for xbzrle and CI failure

2022-11-16 Thread ling xu
This patch series updates the AVX512 support for the xbzrle_encode_buffer function.
We mainly modified the code in xbzrle-bench.c to address the CI failure.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 

ling xu (2):
  Update AVX512 support for xbzrle_encode_buffer
  Unit test code and benchmark code

 meson.build|  16 ++
 meson_options.txt  |   2 +
 migration/ram.c|  34 ++-
 migration/xbzrle.c | 124 ++
 migration/xbzrle.h |   4 +
 tests/bench/meson.build|   4 +
 tests/bench/xbzrle-bench.c | 469 +
 tests/unit/test-xbzrle.c   |  39 ++-
 8 files changed, 684 insertions(+), 8 deletions(-)
 create mode 100644 tests/bench/xbzrle-bench.c

-- 
2.25.1




[PATCH v7 2/2] Update bench-code for addressing CI problem

2022-11-16 Thread ling xu
Unit test code is in test-xbzrle.c, and benchmark code is in xbzrle-bench.c
for performance benchmarking. We have modified xbzrle-bench.c to address
the CI problem.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 tests/bench/meson.build|   4 +
 tests/bench/xbzrle-bench.c | 469 +
 tests/unit/test-xbzrle.c   |  39 ++-
 3 files changed, 507 insertions(+), 5 deletions(-)
 create mode 100644 tests/bench/xbzrle-bench.c

diff --git a/tests/bench/meson.build b/tests/bench/meson.build
index 279a8fcc33..daefead58d 100644
--- a/tests/bench/meson.build
+++ b/tests/bench/meson.build
@@ -3,6 +3,10 @@ qht_bench = executable('qht-bench',
sources: 'qht-bench.c',
dependencies: [qemuutil])
 
+xbzrle_bench = executable('xbzrle-bench',
+   sources: 'xbzrle-bench.c',
+   dependencies: [qemuutil,migration])
+
 executable('atomic_add-bench',
sources: files('atomic_add-bench.c'),
dependencies: [qemuutil],
diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c
new file mode 100644
index 00..8848a3a32d
--- /dev/null
+++ b/tests/bench/xbzrle-bench.c
@@ -0,0 +1,469 @@
+/*
+ * Xor Based Zero Run Length Encoding unit tests.
+ *
+ * Copyright 2013 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *  Orit Wasserman  
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "../migration/xbzrle.h"
+
+#if defined(CONFIG_AVX512BW_OPT)
+#define XBZRLE_PAGE_SIZE 4096
+static bool is_cpu_support_avx512bw;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+is_cpu_support_avx512bw = false;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+is_cpu_support_avx512bw = true;
+}
+}
+}
+return ;
+}
+
+struct ResTime {
+float t_raw;
+float t_512;
+};
+
+
+/* Function prototypes
+int xbzrle_encode_buffer_avx512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+uint8_t *dst, int dlen);
+*/
+static void encode_decode_zero(struct ResTime *res)
+{
+uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+int i = 0;
+int dlen = 0, dlen512 = 0;
+int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
+
+for (i = diff_len; i > 0; i--) {
+buffer[1000 + i] = i;
+buffer512[1000 + i] = i;
+}
+
+buffer[1000 + diff_len + 3] = 103;
+buffer[1000 + diff_len + 5] = 105;
+
+buffer512[1000 + diff_len + 3] = 103;
+buffer512[1000 + diff_len + 5] = 105;
+
+/* encode zero page */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
+dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
+   XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
+g_assert(dlen == 0);
+
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, 
XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+res->t_raw = time_val;
+res->t_512 = time_val512;
+
+g_free(buffer);
+g_free(compressed);
+g_free(buffer512);
+g_free(compressed512);
+
+}
+
+static void test_encode_decode_zero_avx512(void)
+{
+int i;
+float time_raw = 0.0, time_512 = 0.0;
+struct ResTime res;
+for (i = 0; i < 1; i++) {
+encode_decode_zero();
+time_raw += res.t_raw;
+time_512 += res.t_512;
+}
+printf("Zero test:\n");
+printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static voi

[PATCH v7 1/2] AVX512 support for xbzrle_encode_buffer

2022-11-16 Thread ling xu
This commit is the same as [PATCH v6 1/2]. It provides AVX512 support for the
xbzrle_encode_buffer function to accelerate xbzrle encoding speed. A runtime
check of AVX512 support and a benchmark for this feature are added. Compared
with the C version of the xbzrle_encode_buffer function, the AVX512 version can
achieve a 50%-70% performance improvement in benchmarking. In addition, if dirty
data is randomly located in a 4K page, the AVX512 version can achieve
almost a 140% performance gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 meson.build|  16 ++
 meson_options.txt  |   2 +
 migration/ram.c|  34 +++--
 migration/xbzrle.c | 124 +
 migration/xbzrle.h |   4 ++
 5 files changed, 177 insertions(+), 3 deletions(-)

diff --git a/meson.build b/meson.build
index cf3e517e56..d0d28f5c9e 100644
--- a/meson.build
+++ b/meson.build
@@ -2344,6 +2344,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
get_option('avx512f') \
 int main(int argc, char *argv[]) { return bar(argv[argc - 1]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512BW') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i *x = a;
+  __m512i res= _mm512_abs_epi8(*x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt
index 66128178bf..96814dd211 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
description: 'AVX2 optimizations')
 option('avx512f', type: 'feature', value: 'disabled',
description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+   description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..ff4c15c9c3 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,34 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
 
+int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+ uint8_t *, int) = xbzrle_encode_buffer;
+#if defined(CONFIG_AVX512BW_OPT)
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+}
+}
+}
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page
@@ -802,9 +830,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
 /* XBZRLE encoding (if there is no overflow) */
-encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
-   TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
-   TARGET_PAGE_SIZE);
+encoded_len = xbzrle_encode_buffer_func(prev_cached_page, 
XBZRLE.current_buf,
+TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+TARGET_PAGE_SIZE);
 
 /*
  * Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index 1ba482ded9..05366e86c0 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -174,3 +174,127 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t 
*dst, int dlen)
 
 return d;
 }
+
+#if defined(CONFIG_AVX512BW_OPT)
+#pragma GCC push_options
+#pragma GCC t

[PATCH v6 2/2] Unit test code and benchmark code

2022-08-26 Thread ling xu
Unit test code is in test-xbzrle.c, and benchmark code is in xbzrle-bench.c
for performance benchmarking.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 tests/bench/meson.build|   4 +
 tests/bench/xbzrle-bench.c | 465 +
 tests/unit/test-xbzrle.c   |  39 +++-
 3 files changed, 503 insertions(+), 5 deletions(-)
 create mode 100644 tests/bench/xbzrle-bench.c

diff --git a/tests/bench/meson.build b/tests/bench/meson.build
index 279a8fcc33..daefead58d 100644
--- a/tests/bench/meson.build
+++ b/tests/bench/meson.build
@@ -3,6 +3,10 @@ qht_bench = executable('qht-bench',
sources: 'qht-bench.c',
dependencies: [qemuutil])
 
+xbzrle_bench = executable('xbzrle-bench',
+   sources: 'xbzrle-bench.c',
+   dependencies: [qemuutil,migration])
+
 executable('atomic_add-bench',
sources: files('atomic_add-bench.c'),
dependencies: [qemuutil],
diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c
new file mode 100644
index 00..d71397e6f4
--- /dev/null
+++ b/tests/bench/xbzrle-bench.c
@@ -0,0 +1,465 @@
+/*
+ * Xor Based Zero Run Length Encoding unit tests.
+ *
+ * Copyright 2013 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *  Orit Wasserman  
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "../migration/xbzrle.h"
+
+#define XBZRLE_PAGE_SIZE 4096
+
+#if defined(CONFIG_AVX512BW_OPT)
+static bool is_cpu_support_avx512bw;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+is_cpu_support_avx512bw = false;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+is_cpu_support_avx512bw = true;
+}
+}
+}
+return ;
+}
+#endif
+
+struct ResTime {
+float t_raw;
+float t_512;
+};
+
+static void encode_decode_zero(struct ResTime *res)
+{
+uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+int i = 0;
+int dlen = 0, dlen512 = 0;
+int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
+
+for (i = diff_len; i > 0; i--) {
+buffer[1000 + i] = i;
+buffer512[1000 + i] = i;
+}
+
+buffer[1000 + diff_len + 3] = 103;
+buffer[1000 + diff_len + 5] = 105;
+
+buffer512[1000 + diff_len + 3] = 103;
+buffer512[1000 + diff_len + 5] = 105;
+
+/* encode zero page */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
+dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
+   XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
+g_assert(dlen == 0);
+
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, 
XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+res->t_raw = time_val;
+res->t_512 = time_val512;
+
+g_free(buffer);
+g_free(compressed);
+g_free(buffer512);
+g_free(compressed512);
+
+}
+
+static void test_encode_decode_zero_avx512(void)
+{
+int i;
+float time_raw = 0.0, time_512 = 0.0;
+struct ResTime res;
+for (i = 0; i < 1; i++) {
+encode_decode_zero();
+time_raw += res.t_raw;
+time_512 += res.t_512;
+}
+printf("Zero test:\n");
+printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static void encode_decode_unchanged(struct ResTime *res)
+{
+uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+

[PATCH v6 1/2] Update AVX512 support for xbzrle_encode_buffer

2022-08-26 Thread ling xu
This commit updates the AVX512 support for the xbzrle_encode_buffer
function to accelerate xbzrle encoding speed. A runtime check of AVX512
support and a benchmark for this feature are added. Compared with the C
version of the xbzrle_encode_buffer function, the AVX512 version can achieve
a 50%-70% performance improvement in benchmarking. In addition, if dirty
data is randomly located in a 4K page, the AVX512 version can achieve
almost a 140% performance gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 meson.build|  16 ++
 meson_options.txt  |   2 +
 migration/ram.c|  34 +++--
 migration/xbzrle.c | 124 +
 migration/xbzrle.h |   4 ++
 5 files changed, 177 insertions(+), 3 deletions(-)

diff --git a/meson.build b/meson.build
index 20fddbd707..5d4b82d7f3 100644
--- a/meson.build
+++ b/meson.build
@@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
get_option('avx512f') \
 int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512BW') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i *x = a;
+  __m512i res= _mm512_abs_epi8(*x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt
index e58e158396..07194bf680 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
description: 'AVX2 optimizations')
 option('avx512f', type: 'feature', value: 'disabled',
description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+   description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..ff4c15c9c3 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,34 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
 
+int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+ uint8_t *, int) = xbzrle_encode_buffer;
+#if defined(CONFIG_AVX512BW_OPT)
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+}
+}
+}
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page
@@ -802,9 +830,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
 /* XBZRLE encoding (if there is no overflow) */
-encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
-   TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
-   TARGET_PAGE_SIZE);
+encoded_len = xbzrle_encode_buffer_func(prev_cached_page, 
XBZRLE.current_buf,
+TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+TARGET_PAGE_SIZE);
 
 /*
  * Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index 1ba482ded9..05366e86c0 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -174,3 +174,127 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t 
*dst, int dlen)
 
 return d;
 }
+
+#if defined(CONFIG_AVX512BW_OPT)
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+int xbzrle_enc

[PATCH v6 0/2] This patch updates AVX512 support for xbzrle

2022-08-26 Thread ling xu
This patch series updates the AVX512 support for the xbzrle_encode_buffer
function. We modified the algorithm and the AVX512 check. In addition, we
provide a benchmark in xbzrle-bench.c for performance comparison.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 

ling xu (2):
  Update AVX512 support for xbzrle_encode_buffer
  Unit test code and benchmark code

 meson.build|  16 ++
 meson_options.txt  |   2 +
 migration/ram.c|  34 ++-
 migration/xbzrle.c | 124 ++
 migration/xbzrle.h |   4 +
 tests/bench/meson.build|   4 +
 tests/bench/xbzrle-bench.c | 465 +
 tests/unit/test-xbzrle.c   |  39 +++-
 8 files changed, 680 insertions(+), 8 deletions(-)
 create mode 100644 tests/bench/xbzrle-bench.c

-- 
2.25.1




[PATCH v5 0/2] This patch updates AVX512 support for xbzrle

2022-08-18 Thread ling xu
This patch series updates the AVX512 support for the xbzrle_encode_buffer
function. We modified the runtime check of AVX512 and simplified the algorithm.
In addition, we provide a benchmark in xbzrle-bench.c for performance
comparison.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 

ling xu (2):
  Update AVX512 support for xbzrle_encode_buffer
  Test code and benchmark code

 meson.build|  16 ++
 meson_options.txt  |   2 +
 migration/ram.c|  35 ++-
 migration/xbzrle.c | 130 +++
 migration/xbzrle.h |   4 +
 tests/bench/meson.build|   4 +
 tests/bench/xbzrle-bench.c | 468 +
 tests/unit/test-xbzrle.c   |  39 +++-
 8 files changed, 690 insertions(+), 8 deletions(-)
 create mode 100644 tests/bench/xbzrle-bench.c

-- 
2.25.1




[PATCH v5 2/2] Test code and benchmark code

2022-08-18 Thread ling xu
Test code is in test-xbzrle.c, and benchmark code is in xbzrle-bench.c for
performance benchmarking.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 tests/bench/meson.build|   4 +
 tests/bench/xbzrle-bench.c | 468 +
 tests/unit/test-xbzrle.c   |  39 +++-
 3 files changed, 506 insertions(+), 5 deletions(-)
 create mode 100644 tests/bench/xbzrle-bench.c

diff --git a/tests/bench/meson.build b/tests/bench/meson.build
index 279a8fcc33..daefead58d 100644
--- a/tests/bench/meson.build
+++ b/tests/bench/meson.build
@@ -3,6 +3,10 @@ qht_bench = executable('qht-bench',
sources: 'qht-bench.c',
dependencies: [qemuutil])
 
+xbzrle_bench = executable('xbzrle-bench',
+   sources: 'xbzrle-bench.c',
+   dependencies: [qemuutil,migration])
+
 executable('atomic_add-bench',
sources: files('atomic_add-bench.c'),
dependencies: [qemuutil],
diff --git a/tests/bench/xbzrle-bench.c b/tests/bench/xbzrle-bench.c
new file mode 100644
index 00..6ffac62e15
--- /dev/null
+++ b/tests/bench/xbzrle-bench.c
@@ -0,0 +1,468 @@
+/*
+ * Xor Based Zero Run Length Encoding unit tests.
+ *
+ * Copyright 2013 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *  Orit Wasserman  
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "../migration/xbzrle.h"
+
+#define XBZRLE_PAGE_SIZE 4096
+
+//int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+// uint8_t *, int) = xbzrle_encode_buffer;
+#if defined(CONFIG_AVX512BW_OPT)
+static bool is_cpu_support_avx512bw;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+is_cpu_support_avx512bw = false;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+//xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+is_cpu_support_avx512bw = true;
+}
+}
+}
+return ;
+}
+#endif
+
+struct ResTime{
+float t_raw;
+float t_512;
+};
+
+static void encode_decode_zero(struct ResTime *res)
+{
+uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+int i = 0;
+int dlen = 0, dlen512 = 0;
+int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
+
+for (i = diff_len; i > 0; i--) {
+buffer[1000 + i] = i;
+buffer512[1000 + i] = i;
+}
+
+buffer[1000 + diff_len + 3] = 103;
+buffer[1000 + diff_len + 5] = 105;
+
+buffer512[1000 + diff_len + 3] = 103;
+buffer512[1000 + diff_len + 5] = 105;
+
+/* encode zero page */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
+dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
+   XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
+g_assert(dlen == 0);
+
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, 
XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+res->t_raw = time_val;
+res->t_512 = time_val512;
+
+g_free(buffer);
+g_free(compressed);
+g_free(buffer512);
+g_free(compressed512);
+
+}
+
+static void test_encode_decode_zero_avx512(void)
+{
+int i;
+float time_raw = 0.0, time_512 = 0.0;
+struct ResTime res;
+for (i = 0; i < 1; i++) {
+encode_decode_zero();
+time_raw += res.t_raw;
+time_512 += res.t_512;
+}
+printf("Zero test:\n");
+printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static void encode_decode_unchanged(struct ResTime *re

[PATCH v5 1/2] Update AVX512 support for xbzrle_encode_buffer

2022-08-18 Thread ling xu
This commit updates the AVX512 support for the xbzrle_encode_buffer function to
accelerate xbzrle encoding speed. We add a runtime check of AVX512 and a
benchmark for this feature. Compared with the C version of the
xbzrle_encode_buffer function, the AVX512 version can achieve a 50%-70%
performance improvement in benchmarking. In addition, if dirty data is
randomly located in a 4K page, the AVX512 version can achieve almost a 140%
performance gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 meson.build|  16 ++
 meson_options.txt  |   2 +
 migration/ram.c|  35 ++--
 migration/xbzrle.c | 130 +
 migration/xbzrle.h |   4 ++
 5 files changed, 184 insertions(+), 3 deletions(-)

diff --git a/meson.build b/meson.build
index 30a380752c..c9d90a5bff 100644
--- a/meson.build
+++ b/meson.build
@@ -2264,6 +2264,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
get_option('avx512f') \
 int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512BW') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i x = *(__m512i *)a;
+  __m512i res= _mm512_abs_epi8(x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt
index e58e158396..07194bf680 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
description: 'AVX2 optimizations')
 option('avx512f', type: 'feature', value: 'disabled',
description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+   description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..387bef2675 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,35 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
 
+int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+ uint8_t *, int) = xbzrle_encode_buffer;
+#if defined(CONFIG_AVX512BW_OPT)
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+}
+}
+}
+return ;
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page
@@ -802,9 +831,9 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
 /* XBZRLE encoding (if there is no overflow) */
-encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
-   TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
-   TARGET_PAGE_SIZE);
+encoded_len = xbzrle_encode_buffer_func(prev_cached_page, 
XBZRLE.current_buf,
+TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+TARGET_PAGE_SIZE);
 
 /*
  * Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index 1ba482ded9..6da7f79625 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -174,3 +174,133 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t 
*dst, int dlen)
 
 return d;
 }
+
+#if defined(CONFIG_AVX512BW_OPT)
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+int xbzrl

[PATCH v4 2/2] Update test code of AVX512 support for xbzrle_encode

2022-08-09 Thread ling xu
Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 tests/unit/test-xbzrle.c | 458 ++-
 1 file changed, 457 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
index ef951b6e54..2676123ce3 100644
--- a/tests/unit/test-xbzrle.c
+++ b/tests/unit/test-xbzrle.c
@@ -16,6 +16,35 @@
 
 #define XBZRLE_PAGE_SIZE 4096
 
+#if defined(CONFIG_AVX512BW_OPT)
+static bool is_cpu_support_avx512bw;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+is_cpu_support_avx512bw = false;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+is_cpu_support_avx512bw = true;
+}
+}
+}
+return ;
+}
+#endif
+
 static void test_uleb(void)
 {
 uint32_t i, val;
@@ -173,11 +202,438 @@ static void test_encode_decode(void)
 }
 }
 
+static float *encode_decode_zero(void)
+{
+uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+int i = 0;
+int dlen = 0, dlen512 = 0;
+int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
+
+for (i = diff_len; i > 0; i--) {
+buffer[1000 + i] = i;
+buffer512[1000 + i] = i;
+}
+
+buffer[1000 + diff_len + 3] = 103;
+buffer[1000 + diff_len + 5] = 105;
+
+buffer512[1000 + diff_len + 3] = 103;
+buffer512[1000 + diff_len + 5] = 105;
+
+/* encode zero page */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
+dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
+   XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
+g_assert(dlen == 0);
+
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_avx512(buffer512, buffer512, 
XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+static float result_zero[2];
+result_zero[0] = time_val;
+result_zero[1] = time_val512;
+
+g_free(buffer);
+g_free(compressed);
+g_free(buffer512);
+g_free(compressed512);
+
+return result_zero;
+}
+
+static void test_encode_decode_zero_avx512(void)
+{
+int i;
+float time_raw = 0.0, time_512 = 0.0;
+float *res;
+for (i = 0; i < 1; i++) {
+res = encode_decode_zero();
+time_raw += res[0];
+time_512 += res[1];
+}
+printf("Zero test:\n");
+printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static float *encode_decode_unchanged(void)
+{
+uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+int i = 0;
+int dlen = 0, dlen512 = 0;
+int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
+
+for (i = diff_len; i > 0; i--) {
+test[1000 + i] = i + 4;
+test512[1000 + i] = i + 4;
+}
+
+test[1000 + diff_len + 3] = 107;
+test[1000 + diff_len + 5] = 109;
+
+test512[1000 + diff_len + 3] = 107;
+test512[1000 + diff_len + 5] = 109;
+
+/* test unchanged buffer */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
+dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed,
+XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
+g_assert(dlen == 0);
+
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_avx512(test512, test512, XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+static float result_unchanged[2];
+result_unchanged

[PATCH v4 0/2] This patch updates AVX512 support for xbzrle

2022-08-09 Thread ling xu


This patch updates the AVX512 support for the xbzrle_encode_buffer
function to accelerate xbzrle encoding speed.

The runtime check code in meson.build and meson_options.txt is kept
unchanged.

The updated AVX512 algorithm is provided in ram.c, xbzrle.c and
xbzrle.h.

The test code is updated in test-xbzrle.c.
 

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao  
Co-authored-by: Jun Jin 

ling xu (2):
  Update AVX512 support for xbzrle_encode_buffer function
  Update test code of AVX512 support for xbzrle_encode

 meson.build  |  16 ++
 meson_options.txt|   2 +
 migration/ram.c  |  42 +++-
 migration/xbzrle.c   | 181 
 migration/xbzrle.h   |   4 +
 tests/unit/test-xbzrle.c | 458 ++-
 6 files changed, 699 insertions(+), 4 deletions(-)

-- 
2.25.1




[PATCH v4 1/2] Update AVX512 support for xbzrle_encode_buffer function

2022-08-09 Thread ling xu
This commit updates the AVX512 support for the xbzrle_encode_buffer
function to accelerate xbzrle encoding speed.
Compared with the C version of the xbzrle_encode_buffer function, the AVX512
version can achieve an almost 60%-70% performance improvement on the unit tests
provided by QEMU. In addition, we provide one more unit test called
"test_encode_decode_random_avx512", in which dirty data is randomly
located in a 4K page; this case can achieve an almost 140% performance
gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 meson.build|  16 
 meson_options.txt  |   2 +
 migration/ram.c|  42 ++-
 migration/xbzrle.c | 181 +
 migration/xbzrle.h |   4 +
 5 files changed, 242 insertions(+), 3 deletions(-)

diff --git a/meson.build b/meson.build
index 294e9a8f32..4222b77e9f 100644
--- a/meson.build
+++ b/meson.build
@@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
get_option('avx512f') \
 int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512BW') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i x = *(__m512i *)a;
+  __m512i res= _mm512_abs_epi8(x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt
index e58e158396..07194bf680 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
description: 'AVX2 optimizations')
 option('avx512f', type: 'feature', value: 'disabled',
description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+   description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..bae7bef236 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,35 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
 
+#if defined(CONFIG_AVX512BW_OPT)
+static bool is_cpu_support_avx512bw;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+is_cpu_support_avx512bw = false;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+is_cpu_support_avx512bw = true;
+}
+}
+}
+return ;
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page
@@ -802,9 +831,16 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
 /* XBZRLE encoding (if there is no overflow) */
-encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
-   TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
-   TARGET_PAGE_SIZE);
+int (*xbzrle_encode_buffer_func)(uint8_t *, uint8_t *, int,
+ uint8_t *, int) = xbzrle_encode_buffer;
+#if defined(CONFIG_AVX512BW_OPT)
+if (likely(is_cpu_support_avx512bw)) {
+xbzrle_encode_buffer_func = xbzrle_encode_buffer_avx512;
+}
+#endif
+encoded_len = xbzrle_encode_buffer_func(prev_cached_page, 
XBZRLE.current_buf,
+TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+TARGET_PAGE_SIZE);
 
 /*
  * Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index 1ba482ded9..804adc1acb 100644
--- a/migration/xbzrle.c
+++

[PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer

2022-08-08 Thread ling xu
Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 tests/unit/test-xbzrle.c | 307 ---
 1 file changed, 290 insertions(+), 17 deletions(-)

diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
index ef951b6e54..653016826f 100644
--- a/tests/unit/test-xbzrle.c
+++ b/tests/unit/test-xbzrle.c
@@ -38,111 +38,280 @@ static void test_uleb(void)
 g_assert(val == 0);
 }
 
-static void test_encode_decode_zero(void)
+static float *test_encode_decode_zero(void)
 {
 uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
 int i = 0;
-int dlen = 0;
+int dlen = 0, dlen512 = 0;
 int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
 for (i = diff_len; i > 0; i--) {
 buffer[1000 + i] = i;
+buffer512[1000 + i] = i;
 }
 
 buffer[1000 + diff_len + 3] = 103;
 buffer[1000 + diff_len + 5] = 105;
 
+buffer512[1000 + diff_len + 3] = 103;
+buffer512[1000 + diff_len + 5] = 105;
+
 /* encode zero page */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
 dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
 g_assert(dlen == 0);
 
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+static float result_zero[2];
+result_zero[0] = time_val;
+result_zero[1] = time_val512;
+
 g_free(buffer);
 g_free(compressed);
+g_free(buffer512);
+g_free(compressed512);
+
+return result_zero;
+}
+
+static void test_encode_decode_zero_range(void)
+{
+int i;
+float time_raw = 0.0, time_512 = 0.0;
+float *res;
+for (i = 0; i < 1; i++) {
+res = test_encode_decode_zero();
+time_raw += res[0];
+time_512 += res[1];
+}
+printf("Zero test:\n");
+printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+printf("512 xbzrle_encode time is %f ms\n", time_512);
 }
 
-static void test_encode_decode_unchanged(void)
+static float *test_encode_decode_unchanged(void)
 {
 uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
 int i = 0;
-int dlen = 0;
+int dlen = 0, dlen512 = 0;
 int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
 for (i = diff_len; i > 0; i--) {
 test[1000 + i] = i + 4;
+test512[1000 + i] = i + 4;
 }
 
 test[1000 + diff_len + 3] = 107;
 test[1000 + diff_len + 5] = 109;
 
+test512[1000 + diff_len + 3] = 107;
+test512[1000 + diff_len + 5] = 109;
+
 /* test unchanged buffer */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
 dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed,
 XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
 g_assert(dlen == 0);
 
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_512(test512, test512, XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+static float result_unchanged[2];
+result_unchanged[0] = time_val;
+result_unchanged[1] = time_val512;
+
 g_free(test);
 g_free(compressed);
+g_free(test512);
+g_free(compressed512);
+
+return result_unchanged;
 }
 
-static void test_encode_decode_1_byte(void)
+static void test_encode_decode_unchanged_range(void)
+{
+int i;
+float time_raw = 0.0, time_512 = 0.0;
+float *res;
+for (i = 0; i < 1; i++) {
+res = test_encode_decode_unchanged();
+time_raw += res[0];
+time_512 += res[1];
+}
+printf("Unchanged test:\n");
+printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static float *test_encode_decode_1_byte(void)
 {
 uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
-int dlen = 0, rc = 0;
+uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_mal

[PATCH v3 0/2] This patch updates runtime check of AVX512

2022-08-08 Thread ling xu
This patch updates the runtime check of AVX512 and updates avx512 support for
the xbzrle_encode_buffer function to accelerate xbzrle encoding speed.

The runtime check is updated in meson.build and meson_options.txt.

The updated AVX512 algorithm is provided in ram.c, xbzrle.c and
xbzrle.h.

The test code is provided in test-xbzrle.c.

Previous discussion is referenced below:
https://www.mail-archive.com/qemu-devel@nongnu.org/msg903520.html

ling xu (2):
  Update AVX512 support for xbzrle_encode_buffer function
  Test code for AVX512 support for xbzrle_encode_buffer

 meson.build  |  16 ++
 meson_options.txt|   2 +
 migration/ram.c  |  41 ++
 migration/xbzrle.c   | 181 +++
 migration/xbzrle.h   |   4 +
 tests/unit/test-xbzrle.c | 307 ---
 6 files changed, 534 insertions(+), 17 deletions(-)

-- 
2.25.1




[PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

2022-08-08 Thread ling xu
This commit updates the runtime check of AVX512, and implements an avx512
version of the xbzrle_encode_buffer function to accelerate xbzrle encoding
speed.
Compared with C version of xbzrle_encode_buffer function, avx512 version
can achieve almost 60%-70% performance improvement on unit test provided
by Qemu. In addition, we provide one more unit test called
"test_encode_decode_random", in which dirty data are randomly located in
4K page, and this case can achieve almost 140% performance gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 meson.build|  16 
 meson_options.txt  |   2 +
 migration/ram.c|  41 ++
 migration/xbzrle.c | 181 +
 migration/xbzrle.h |   4 +
 5 files changed, 244 insertions(+)

diff --git a/meson.build b/meson.build
index 294e9a8f32..4222b77e9f 100644
--- a/meson.build
+++ b/meson.build
@@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
get_option('avx512f') \
 int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512BW') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i x = *(__m512i *)a;
+  __m512i res= _mm512_abs_epi8(x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt
index e58e158396..07194bf680 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
description: 'AVX2 optimizations')
 option('avx512f', type: 'feature', value: 'disabled',
description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+   description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..d9c1ac2f7a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,35 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
 
+#if defined(CONFIG_AVX512BW_OPT)
+static bool IS_CPU_SUPPORT_AVX512BW;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+IS_CPU_SUPPORT_AVX512BW = false;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+IS_CPU_SUPPORT_AVX512BW = true;
+}
+}
+}
+return ;
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page
@@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
 /* XBZRLE encoding (if there is no overflow) */
+#if defined(CONFIG_AVX512BW_OPT)
+if (likely(IS_CPU_SUPPORT_AVX512BW)) {
+encoded_len = xbzrle_encode_buffer_512(prev_cached_page, 
XBZRLE.current_buf,
+   TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+   TARGET_PAGE_SIZE);
+} else {
+encoded_len = xbzrle_encode_buffer(prev_cached_page, 
XBZRLE.current_buf,
+   TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+   TARGET_PAGE_SIZE);
+}
+#else
 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
TARGET_PAGE_SIZE);
+#endif
 
 /*
  * Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c 

[PATCH v3 2/2] Test code for AVX512 support for xbzrle_encode_buffer

2022-08-08 Thread ling xu
Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 tests/unit/test-xbzrle.c | 307 ---
 1 file changed, 290 insertions(+), 17 deletions(-)

diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
index ef951b6e54..653016826f 100644
--- a/tests/unit/test-xbzrle.c
+++ b/tests/unit/test-xbzrle.c
@@ -38,111 +38,280 @@ static void test_uleb(void)
 g_assert(val == 0);
 }
 
-static void test_encode_decode_zero(void)
+static float *test_encode_decode_zero(void)
 {
 uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
 int i = 0;
-int dlen = 0;
+int dlen = 0, dlen512 = 0;
 int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
 for (i = diff_len; i > 0; i--) {
 buffer[1000 + i] = i;
+buffer512[1000 + i] = i;
 }
 
 buffer[1000 + diff_len + 3] = 103;
 buffer[1000 + diff_len + 5] = 105;
 
+buffer512[1000 + diff_len + 3] = 103;
+buffer512[1000 + diff_len + 5] = 105;
+
 /* encode zero page */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
 dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
 g_assert(dlen == 0);
 
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+static float result_zero[2];
+result_zero[0] = time_val;
+result_zero[1] = time_val512;
+
 g_free(buffer);
 g_free(compressed);
+g_free(buffer512);
+g_free(compressed512);
+
+return result_zero;
+}
+
+static void test_encode_decode_zero_range(void)
+{
+int i;
+float time_raw = 0.0, time_512 = 0.0;
+float *res;
+for (i = 0; i < 1; i++) {
+res = test_encode_decode_zero();
+time_raw += res[0];
+time_512 += res[1];
+}
+printf("Zero test:\n");
+printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+printf("512 xbzrle_encode time is %f ms\n", time_512);
 }
 
-static void test_encode_decode_unchanged(void)
+static float *test_encode_decode_unchanged(void)
 {
 uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
 int i = 0;
-int dlen = 0;
+int dlen = 0, dlen512 = 0;
 int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
 for (i = diff_len; i > 0; i--) {
 test[1000 + i] = i + 4;
+test512[1000 + i] = i + 4;
 }
 
 test[1000 + diff_len + 3] = 107;
 test[1000 + diff_len + 5] = 109;
 
+test512[1000 + diff_len + 3] = 107;
+test512[1000 + diff_len + 5] = 109;
+
 /* test unchanged buffer */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
 dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed,
 XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
 g_assert(dlen == 0);
 
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_512(test512, test512, XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+static float result_unchanged[2];
+result_unchanged[0] = time_val;
+result_unchanged[1] = time_val512;
+
 g_free(test);
 g_free(compressed);
+g_free(test512);
+g_free(compressed512);
+
+return result_unchanged;
 }
 
-static void test_encode_decode_1_byte(void)
+static void test_encode_decode_unchanged_range(void)
+{
+int i;
+float time_raw = 0.0, time_512 = 0.0;
+float *res;
+for (i = 0; i < 1; i++) {
+res = test_encode_decode_unchanged();
+time_raw += res[0];
+time_512 += res[1];
+}
+printf("Unchanged test:\n");
+printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static float *test_encode_decode_1_byte(void)
 {
 uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
-int dlen = 0, rc = 0;
+uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_mal

[PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

2022-08-08 Thread ling xu
This commit updates the runtime check of AVX512, and implements an avx512
version of the xbzrle_encode_buffer function to accelerate xbzrle encoding
speed.
Compared with C version of xbzrle_encode_buffer function, avx512 version
can achieve almost 60%-70% performance improvement on unit test provided
by Qemu. In addition, we provide one more unit test called
"test_encode_decode_random", in which dirty data are randomly located in
4K page, and this case can achieve almost 140% performance gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 meson.build|  16 
 meson_options.txt  |   2 +
 migration/ram.c|  41 ++
 migration/xbzrle.c | 181 +
 migration/xbzrle.h |   4 +
 5 files changed, 244 insertions(+)

diff --git a/meson.build b/meson.build
index 294e9a8f32..4222b77e9f 100644
--- a/meson.build
+++ b/meson.build
@@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
get_option('avx512f') \
 int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512BW') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i x = *(__m512i *)a;
+  __m512i res= _mm512_abs_epi8(x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt
index e58e158396..07194bf680 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
description: 'AVX2 optimizations')
 option('avx512f', type: 'feature', value: 'disabled',
description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+   description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..d9c1ac2f7a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,35 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
 
+#if defined(CONFIG_AVX512BW_OPT)
+static bool IS_CPU_SUPPORT_AVX512BW;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+IS_CPU_SUPPORT_AVX512BW = false;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+IS_CPU_SUPPORT_AVX512BW = true;
+}
+}
+}
+return ;
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page
@@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
 /* XBZRLE encoding (if there is no overflow) */
+#if defined(CONFIG_AVX512BW_OPT)
+if (likely(IS_CPU_SUPPORT_AVX512BW)) {
+encoded_len = xbzrle_encode_buffer_512(prev_cached_page, 
XBZRLE.current_buf,
+   TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+   TARGET_PAGE_SIZE);
+} else {
+encoded_len = xbzrle_encode_buffer(prev_cached_page, 
XBZRLE.current_buf,
+   TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+   TARGET_PAGE_SIZE);
+}
+#else
 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
TARGET_PAGE_SIZE);
+#endif
 
 /*
  * Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c 

[PATCH v3 0/2] This patch updates runtime check of AVX512

2022-08-08 Thread ling xu
This patch updates the runtime check of AVX512 and updates avx512 support for
the xbzrle_encode_buffer function to accelerate xbzrle encoding speed.

The runtime check is updated in meson.build and meson_options.txt.

The updated AVX512 algorithm is provided in ram.c, xbzrle.c and
xbzrle.h.

The test code is provided in test-xbzrle.c.

ling xu (2):
  Update AVX512 support for xbzrle_encode_buffer function
  Test code for AVX512 support for xbzrle_encode_buffer

 meson.build  |  16 ++
 meson_options.txt|   2 +
 migration/ram.c  |  41 ++
 migration/xbzrle.c   | 181 +++
 migration/xbzrle.h   |   4 +
 tests/unit/test-xbzrle.c | 307 ---
 6 files changed, 534 insertions(+), 17 deletions(-)

-- 
2.25.1




[PATCH v2 0/2] This patch adds runtime check of AVX512

2022-08-04 Thread ling xu
This patch adds a runtime check of AVX512 on the running machine and updates
avx512 support for the xbzrle_encode_buffer function to accelerate xbzrle
encoding speed.

The runtime check is added in meson.build and meson_options.txt.

The updated AVX512 algorithm is provided in ram.c, xbzrle.h and
xbzrle.c.

The test code is provided in test-xbzrle.c.

Previous discussion is referenced below:
https://lore.kernel.org/all/ytlshitevijwe...@redhat.com/

ling xu (2):
  Update AVX512 support for xbzrle_encode_buffer function
  Test code for AVX512 support for xbzrle_encode_buffer function

 meson.build  | 211 +++
 meson_options.txt|  28 
 migration/ram.c  |  41 ++
 migration/xbzrle.c   | 181 +++
 migration/xbzrle.h   |   4 +
 tests/unit/test-xbzrle.c | 307 ---
 6 files changed, 755 insertions(+), 17 deletions(-)

-- 
2.25.1




[PATCH v2 2/2] Test code for AVX512 support for xbzrle_encode_buffer function

2022-08-04 Thread ling xu
Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 tests/unit/test-xbzrle.c | 307 ---
 1 file changed, 290 insertions(+), 17 deletions(-)

diff --git a/tests/unit/test-xbzrle.c b/tests/unit/test-xbzrle.c
index ef951b6e54..653016826f 100644
--- a/tests/unit/test-xbzrle.c
+++ b/tests/unit/test-xbzrle.c
@@ -38,111 +38,280 @@ static void test_uleb(void)
 g_assert(val == 0);
 }
 
-static void test_encode_decode_zero(void)
+static float *test_encode_decode_zero(void)
 {
 uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
 int i = 0;
-int dlen = 0;
+int dlen = 0, dlen512 = 0;
 int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
 for (i = diff_len; i > 0; i--) {
 buffer[1000 + i] = i;
+buffer512[1000 + i] = i;
 }
 
 buffer[1000 + diff_len + 3] = 103;
 buffer[1000 + diff_len + 5] = 105;
 
+buffer512[1000 + diff_len + 3] = 103;
+buffer512[1000 + diff_len + 5] = 105;
+
 /* encode zero page */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
 dlen = xbzrle_encode_buffer(buffer, buffer, XBZRLE_PAGE_SIZE, compressed,
XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
 g_assert(dlen == 0);
 
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_512(buffer512, buffer512, XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+static float result_zero[2];
+result_zero[0] = time_val;
+result_zero[1] = time_val512;
+
 g_free(buffer);
 g_free(compressed);
+g_free(buffer512);
+g_free(compressed512);
+
+return result_zero;
+}
+
+static void test_encode_decode_zero_range(void)
+{
+int i;
+float time_raw = 0.0, time_512 = 0.0;
+float *res;
+for (i = 0; i < 1; i++) {
+res = test_encode_decode_zero();
+time_raw += res[0];
+time_512 += res[1];
+}
+printf("Zero test:\n");
+printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+printf("512 xbzrle_encode time is %f ms\n", time_512);
 }
 
-static void test_encode_decode_unchanged(void)
+static float *test_encode_decode_unchanged(void)
 {
 uint8_t *compressed = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
 int i = 0;
-int dlen = 0;
+int dlen = 0, dlen512 = 0;
 int diff_len = g_test_rand_int_range(0, XBZRLE_PAGE_SIZE - 1006);
 
 for (i = diff_len; i > 0; i--) {
 test[1000 + i] = i + 4;
+test512[1000 + i] = i + 4;
 }
 
 test[1000 + diff_len + 3] = 107;
 test[1000 + diff_len + 5] = 109;
 
+test512[1000 + diff_len + 3] = 107;
+test512[1000 + diff_len + 5] = 109;
+
 /* test unchanged buffer */
+time_t t_start, t_end, t_start512, t_end512;
+t_start = clock();
 dlen = xbzrle_encode_buffer(test, test, XBZRLE_PAGE_SIZE, compressed,
 XBZRLE_PAGE_SIZE);
+t_end = clock();
+float time_val = difftime(t_end, t_start);
 g_assert(dlen == 0);
 
+t_start512 = clock();
+dlen512 = xbzrle_encode_buffer_512(test512, test512, XBZRLE_PAGE_SIZE,
+   compressed512, XBZRLE_PAGE_SIZE);
+t_end512 = clock();
+float time_val512 = difftime(t_end512, t_start512);
+g_assert(dlen512 == 0);
+
+static float result_unchanged[2];
+result_unchanged[0] = time_val;
+result_unchanged[1] = time_val512;
+
 g_free(test);
 g_free(compressed);
+g_free(test512);
+g_free(compressed512);
+
+return result_unchanged;
 }
 
-static void test_encode_decode_1_byte(void)
+static void test_encode_decode_unchanged_range(void)
+{
+int i;
+float time_raw = 0.0, time_512 = 0.0;
+float *res;
+for (i = 0; i < 1; i++) {
+res = test_encode_decode_unchanged();
+time_raw += res[0];
+time_512 += res[1];
+}
+printf("Unchanged test:\n");
+printf("Raw xbzrle_encode time is %f ms\n", time_raw);
+printf("512 xbzrle_encode time is %f ms\n", time_512);
+}
+
+static float *test_encode_decode_1_byte(void)
 {
 uint8_t *buffer = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *test = g_malloc0(XBZRLE_PAGE_SIZE);
 uint8_t *compressed = g_malloc(XBZRLE_PAGE_SIZE);
-int dlen = 0, rc = 0;
+uint8_t *buffer512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *test512 = g_malloc0(XBZRLE_PAGE_SIZE);
+uint8_t *compressed512 = g_mal

[PATCH v2 1/2] Update AVX512 support for xbzrle_encode_buffer function

2022-08-04 Thread ling xu
This commit adds a runtime check of AVX512 on the running machine, and
implements an AVX512 version of the xbzrle_encode_buffer function to
accelerate xbzrle encoding speed.
Compared with C version of xbzrle_encode_buffer function, AVX512 version
can achieve almost 60%-70% performance improvement on unit test provided
by qemu. In addition, we provide one more unit test called
"test_encode_decode_random", in which dirty data are randomly located in
4K page, and this case can achieve almost 140% performance gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 meson.build| 211 +
 meson_options.txt  |  28 ++
 migration/ram.c|  41 +
 migration/xbzrle.c | 181 ++
 migration/xbzrle.h |   4 +
 5 files changed, 465 insertions(+)

diff --git a/meson.build b/meson.build
index 294e9a8f32..9228df2442 100644
--- a/meson.build
+++ b/meson.build
@@ -2262,6 +2262,217 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
get_option('avx512f') \
 int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512BW') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i x = *(__m512i *)a;
+  __m512i res= _mm512_abs_epi8(x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
+config_host_data.set('CONFIG_AVX512CD_OPT', get_option('avx512cd') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512CD') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512cd")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i x = *(__m512i *)a;
+  __mmask16 k;
+  __m512i res= _mm512_maskz_lzcnt_epi32 (k, x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512CD not available').allowed())
+
+config_host_data.set('CONFIG_AVX512DQ_OPT', get_option('avx512dq') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512D') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512dq")
+#include 
+#include 
+static int bar(void *a) {
+
+  __mmask x = *(__mmask *)a;
+  __mmask8 b;
+  return _kxor_mask8(x,b);
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512DQ not available').allowed())
+
+config_host_data.set('CONFIG_AVX512ER_OPT', get_option('avx512er') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512ER') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512er")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512d x = *(__m512d *)a;
+  __m512d res=_mm512_rsqrt28_pd(x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512ER not available').allowed())
+
+
+config_host_data.set('CONFIG_AVX512IFMA52_OPT', get_option('avx512ifma52') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512ER') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512ifma")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i x = *(__m512i *)a;
+  __m512i b,c;
+  __m512i res= _mm512_madd52lo_epu64 (x, b, c);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512IFMA52 not available').allowed())
+
+
+config_host_data.set('CONFIG_AVX512PF_OPT', get_option('avx512pf') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512PF') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512pf")
+#include 
+#include 
+static void bar(void *a) {
+  char* base_addr;
+  __mmask8 k;
+  __m512i vindex = *(__m512i *)a;
+  _mm512_mask_prefetch_i64scatter_pd (base_addr, k, vindex, 1, 2);
+}
+int main(int argc, char *argv[]) { bar(argv[0]); return 0;}
+  '''), error_message: 'AVX512PF not available').allowed())
+
+
+config_host_data.set('CONFIG_AVX512VPOPCNTDQ_OPT', 
get_option('avx512vpopcntdq') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512VPOPCNTDQ') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512vpopcntdq")
+#include 
+#include 
+static int ba

[PATCH 0/1] This patch provides AVX512 support for xbzrle_encode_buffer function

2022-07-21 Thread ling xu
This patch adds AVX512 support for the xbzrle_encode_buffer function to
accelerate xbzrle encoding speed.
The specific AVX512 implementation is provided in qemu/migration/xbzrle.c.
We provide AVX512 intrinsic support in the qemu/configure file to enable
AVX512 in the compiler.
An additional unit test called "test_encode_decode_random" is provided in
qemu/tests/unit/test-xbzrle.c.

ling xu (1):
  Add AVX512 support for xbzrle_encode_buffer function

 configure| 434 ++-
 migration/ram.c  |   6 +
 migration/xbzrle.c   | 177 
 migration/xbzrle.h   |   4 +
 tests/unit/test-xbzrle.c | 307 +--
 5 files changed, 908 insertions(+), 20 deletions(-)

-- 
2.25.1




[PATCH 1/1] Add AVX512 support for xbzrle_encode_buffer function

2022-07-21 Thread ling xu
This commit adds an AVX512 implementation of the xbzrle_encode_buffer function
to accelerate xbzrle encoding speed. Compared with the C version of the
xbzrle_encode_buffer function, the AVX512 version can achieve almost 60%-70%
performance improvement on the unit tests provided by QEMU.
In addition, we provide one more unit test called "test_encode_decode_random",
in which dirty data is randomly located in a 4K page; this case can achieve
almost 140% performance gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
 configure| 434 ++-
 migration/ram.c  |   6 +
 migration/xbzrle.c   | 177 
 migration/xbzrle.h   |   4 +
 tests/unit/test-xbzrle.c | 307 +--
 5 files changed, 908 insertions(+), 20 deletions(-)

diff --git a/configure b/configure
index 4f12481765..7685479297 100755
--- a/configure
+++ b/configure
@@ -587,6 +587,431 @@ else
   cpu=$(uname -m)
 fi
 
+# cpu flag for x86
+x86_cpu_flags=""
+
+get_x86_cpu_flags(){
+# check sse flag
+cat > $TMPC << EOF
+#include 
+int main(int argc, char *argv[])
+{
+__m128 a,b;
+int res= _mm_ucomieq_ss(a,b);
+return 0;
+}
+EOF
+local_cpu_flag="-msse"
+rm -f  $TMPE
+do_cc $local_cpu_flag  -o $TMPE $TMPC
+if [ -e  $TMPE ]
+then
+x86_cpu_flags="$x86_cpu_flags -msse"
+fi
+
+# check sse2 flag
+cat > $TMPC << EOF
+#include 
+int main(int argc, char *argv[])
+{
+__m128d a,b;
+int res= _mm_ucomineq_sd (a, b);
+return 0;
+}
+EOF
+local_cpu_flag="-msse2"
+rm -f  $TMPE
+do_cc $local_cpu_flag  -o $TMPE $TMPC
+if [ -e  $TMPE ]
+then
+x86_cpu_flags="$x86_cpu_flags -msse2"
+fi
+
+# check sse3 flag
+cat > $TMPC << EOF
+#include 
+int main(int argc, char *argv[])
+{
+__m128 a;
+__m128  res= _mm_moveldup_ps (a);
+return 0;
+}
+EOF
+local_cpu_flag="-msse3"
+rm -f  $TMPE
+do_cc $local_cpu_flag  -o $TMPE $TMPC
+if [ -e  $TMPE ]
+then
+x86_cpu_flags="$x86_cpu_flags -msse3"
+fi
+
+# check ssse3 flag
+cat > $TMPC << EOF
+#include 
+int main(int argc, char *argv[])
+{
+__m128i a;
+__m12i8  res= _mm_abs_epi32 (a);
+return 0;
+}
+EOF
+local_cpu_flag="-mssse3"
+rm -f  $TMPE
+do_cc $local_cpu_flag  -o $TMPE $TMPC
+if [ -e  $TMPE ]
+then
+x86_cpu_flags="$x86_cpu_flags -mssse3"
+fi
+
+# check sse4.1 flag
+cat > $TMPC << EOF
+#include 
+int main(int argc, char *argv[])
+{
+__m128 a;
+__m128 res=_mm_ceil_ps (a);
+return 0;
+}
+EOF
+local_cpu_flag="-msse4.1"
+rm -f  $TMPE
+do_cc $local_cpu_flag  -o $TMPE $TMPC
+if [ -e  $TMPE ]
+then
+x86_cpu_flags="$x86_cpu_flags -msse4.1"
+fi
+
+# check sse4.2 flag
+cat > $TMPC << EOF
+#include 
+int main(int argc, char *argv[])
+{
+__m128i a,b;
+__m128i res=_mm_cmpgt_epi64 (a,b);
+return 0;
+}
+EOF
+local_cpu_flag="-msse4.2"
+rm -f  $TMPE
+do_cc $local_cpu_flag  -o $TMPE $TMPC
+if [ -e  $TMPE ]
+then
+x86_cpu_flags="$x86_cpu_flags -msse4.2"
+fi
+
+# check avx flag
+cat > $TMPC << EOF
+#include 
+int main(int argc, char *argv[])
+{
+__m256 a;
+__m256 res= _mm256_ceil_ps(a);
+return 0;
+}
+EOF
+local_cpu_flag="-mavx"
+rm -f  $TMPE
+do_cc $local_cpu_flag  -o $TMPE $TMPC
+if [ -e  $TMPE ]
+then
+x86_cpu_flags="$x86_cpu_flags -mavx"
+fi
+
+# check avx2 flag
+cat > $TMPC << EOF
+#include 
+int main(int argc, char *argv[])
+{
+__m256i a;
+__m256i res= _mm256_abs_epi32(a);
+return 0;
+}
+EOF
+local_cpu_flag="-mavx2"
+rm -f  $TMPE
+do_cc $local_cpu_flag  -o $TMPE $TMPC
+if [ -e  $TMPE ]
+then
+x86_cpu_flags="$x86_cpu_flags -mavx2"
+fi
+
+# check AVX_VNNI flag
+cat > $TMPC << EOF
+#include 
+int main(int argc, char *argv[])
+{
+__m128i a,b,c;
+__m128i res= _mm_dpbusd_epi32(a,b,c);
+return 0;
+}
+EOF
+local_cpu_flag="-mavxvnni"
+rm -f  $TMPE
+do_cc $local_cpu_flag  -o $TMPE $TMPC
+if [ -e  $TMPE ]
+then
+x86_cpu_flags="$x86_cpu_flags -mavxvnni"
+fi
+
+# check AVX512f flag
+cat > $TMPC << EOF
+#include 
+int main(int argc, char *argv[])
+{
+__m128d a;
+__m512d res= _mm512_broadcastsd_pd(a);
+return 0;
+}
+EOF
+local_cpu_flag="-mavx512f"
+rm -f  $TMPE
+do_cc $local_cpu_flag  -o $TMPE $TMPC
+if [ -e  $TMPE ]
+then
+x86_cpu_flags="$x86_cpu_flags -mavx512f"
+fi
+
+# check AVX512bw flag
+cat > $TMPC << EOF
+#include 
+int mai