RE: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

2022-08-11 Thread Xu, Ling1
Hi, Richard,
  Thanks for your nice comments! Your suggestions are very helpful. We have 
revised code in ram.c according to your comments. As for "unroll residual from 
main loop" problem in algorithm, we will fix this later. Thanks for your time 
and patience~

Best Regards,
Ling

-Original Message-
From: Richard Henderson  
Sent: Wednesday, August 10, 2022 2:25 AM
To: Xu, Ling1 ; quint...@redhat.com
Cc: qemu-devel@nongnu.org; dgilb...@redhat.com; Zhao, Zhou 
; Jin, Jun I 
Subject: Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer 
function

On 8/9/22 00:51, Xu, Ling1 wrote:
> Hi, Juan,
>Thanks for your advice. We have revised our code including: 1) change 
> "IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that 
> variable isn't global variable;

You can remove this variable entirely...

> 2) use a function pointer to simplify code in ram.c;

... because it's redundant with the function pointer.


r~


Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

2022-08-09 Thread Richard Henderson

On 8/8/22 00:48, ling xu wrote:

This commit update runtime check of AVX512, and implements avx512 of
xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
Compared with C version of xbzrle_encode_buffer function, avx512 version
can achieve almost 60%-70% performance improvement on unit test provided
by Qemu. In addition, we provide one more unit test called
"test_encode_decode_random", in which dirty data are randomly located in
4K page, and this case can achieve almost 140% performance gain.

Signed-off-by: ling xu 
Co-authored-by: Zhou Zhao 
Co-authored-by: Jun Jin 
---
  meson.build|  16 
  meson_options.txt  |   2 +
  migration/ram.c|  41 ++
  migration/xbzrle.c | 181 +
  migration/xbzrle.h |   4 +
  5 files changed, 244 insertions(+)

diff --git a/meson.build b/meson.build
index 294e9a8f32..4222b77e9f 100644
--- a/meson.build
+++ b/meson.build
@@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
get_option('avx512f') \
  int main(int argc, char *argv[]) { return bar(argv[0]); }
'''), error_message: 'AVX512F not available').allowed())
  
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \

+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable 
AVX512BW') \
+  .require(cc.links('''
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#include 
+#include 
+static int bar(void *a) {
+
+  __m512i x = *(__m512i *)a;
+  __m512i res= _mm512_abs_epi8(x);
+  return res[1];
+}
+int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
  have_pvrdma = get_option('pvrdma') \
.require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
libraries') \
.require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt
index e58e158396..07194bf680 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
 description: 'AVX2 optimizations')
  option('avx512f', type: 'feature', value: 'disabled',
 description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+   description: 'AVX512BW optimizations')
  option('keyring', type: 'feature', value: 'auto',
 description: 'Linux keyring support')
  
diff --git a/migration/ram.c b/migration/ram.c

index dc1de9ddbc..d9c1ac2f7a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,35 @@
  /* 0x80 is reserved in migration.h start with 0x100 next */
  #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
  
+#if defined(CONFIG_AVX512BW_OPT)

+static bool IS_CPU_SUPPORT_AVX512BW;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+unsigned max = __get_cpuid_max(0, NULL);
+int a, b, c, d;
+IS_CPU_SUPPORT_AVX512BW = false;
+if (max >= 1) {
+__cpuid(1, a, b, c, d);
+ /* We must check that AVX is not just available, but usable.  */
+if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+int bv;
+__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+__cpuid_count(7, 0, a, b, c, d);
+   /* 0xe6:
+*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+*and ZMM16-ZMM31 state are enabled by OS)
+*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+*/
+if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+IS_CPU_SUPPORT_AVX512BW = true;
+}
+}
+}
+return ;
+}
+#endif
+
  XBZRLECacheStats xbzrle_counters;
  
  /* struct contains XBZRLE cache and a static page

@@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
**current_data,
  memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
  
  /* XBZRLE encoding (if there is no overflow) */

+#if defined(CONFIG_AVX512BW_OPT)
+if (likely(IS_CPU_SUPPORT_AVX512BW)) {
+encoded_len = xbzrle_encode_buffer_512(prev_cached_page, 
XBZRLE.current_buf,
+   TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+   TARGET_PAGE_SIZE);
+} else {
+encoded_len = xbzrle_encode_buffer(prev_cached_page, 
XBZRLE.current_buf,
+   TARGET_PAGE_SIZE, 
XBZRLE.encoded_buf,
+   TARGET_PAGE_SIZE);
+}
+#else
  encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 TARGET_PAGE_SIZE);
+#endif
  
  /*

   * Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index 1ba482ded9..4db09fdbdb 

Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

2022-08-09 Thread Richard Henderson

On 8/9/22 00:51, Xu, Ling1 wrote:

Hi, Juan,
   Thanks for your advice. We have revised our code including: 1) change 
"IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that 
variable isn't global variable;


You can remove this variable entirely...


2) use a function pointer to simplify code in ram.c;


... because it's redundant with the function pointer.


r~



RE: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

2022-08-09 Thread Xu, Ling1
Hi, Juan, 
  Thanks for your advice. We have revised our code including: 1) change 
"IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that 
variable isn't global variable; 2) use a function pointer to simplify code in 
ram.c; 3) change function name "xbzrle_encode_buffer_512" to 
"xbzrle_encode_buffer_avx512", change variable "res" to "countResidual" for 
better understanding, and replace "unsigned long long" with "uint64_t". 
   We will submit patch v4 to fix all issues mentioned in comments. 

Best Regard,
Ling

-Original Message-
From: Juan Quintela  
Sent: Monday, August 8, 2022 9:12 PM
To: Xu, Ling1 
Cc: qemu-devel@nongnu.org; dgilb...@redhat.com; Zhao, Zhou 
; Jin, Jun I 
Subject: Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer 
function

ling xu  wrote:
> This commit update runtime check of AVX512, and implements avx512 of 
> xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
> Compared with C version of xbzrle_encode_buffer function, avx512 
> version can achieve almost 60%-70% performance improvement on unit 
> test provided by Qemu. In addition, we provide one more unit test 
> called "test_encode_decode_random", in which dirty data are randomly 
> located in 4K page, and this case can achieve almost 140% performance gain.
>
> Signed-off-by: ling xu 
> Co-authored-by: Zhou Zhao 
> Co-authored-by: Jun Jin 
> ---
>  meson.build|  16 
>  meson_options.txt  |   2 +
>  migration/ram.c|  41 ++
>  migration/xbzrle.c | 181 +
>  migration/xbzrle.h |   4 +
>  5 files changed, 244 insertions(+)
>
> diff --git a/meson.build b/meson.build index 294e9a8f32..4222b77e9f 
> 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
> get_option('avx512f') \
>  int main(int argc, char *argv[]) { return bar(argv[0]); }
>'''), error_message: 'AVX512F not available').allowed())
>  
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, 
> +cannot enable AVX512BW') \
> +  .require(cc.links('''
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +#include 
> +#include 
> +static int bar(void *a) {
> +
> +  __m512i x = *(__m512i *)a;
> +  __m512i res= _mm512_abs_epi8(x);
> +  return res[1];
> +}
> +int main(int argc, char *argv[]) { return bar(argv[0]); }  '''), 
> + error_message: 'AVX512BW not available').allowed())
> +
>  have_pvrdma = get_option('pvrdma') \
>.require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
> libraries') \
>.require(cc.compiles(gnu_source_prefix + '''
> diff --git a/meson_options.txt b/meson_options.txt index 
> e58e158396..07194bf680 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
> description: 'AVX2 optimizations')  option('avx512f', type: 
> 'feature', value: 'disabled',
> description: 'AVX512F optimizations')
> +option('avx512bw', type: 'feature', value: 'auto',
> +   description: 'AVX512BW optimizations')
>  option('keyring', type: 'feature', value: 'auto',
> description: 'Linux keyring support')
>  

[no clue about meson, it looks ok]

> diff --git a/migration/ram.c b/migration/ram.c index 
> dc1de9ddbc..d9c1ac2f7a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -83,6 +83,35 @@
>  /* 0x80 is reserved in migration.h start with 0x100 next */
>  #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
>  
> +#if defined(CONFIG_AVX512BW_OPT)
> +static bool IS_CPU_SUPPORT_AVX512BW;

An all caps global variable?

> +#include "qemu/cpuid.h"
> +static void __attribute__((constructor)) init_cpu_flag(void) {
> +unsigned max = __get_cpuid_max(0, NULL);
> +int a, b, c, d;
> +IS_CPU_SUPPORT_AVX512BW = false;
> +if (max >= 1) {
> +__cpuid(1, a, b, c, d);
> + /* We must check that AVX is not just available, but usable.  */
> +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +int bv;
> +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +__cpuid_count(7, 0, a, b, c, d);
> +   /* 0xe6:
> +*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +*and ZMM16-ZMM31 state are enabled by OS)
> +*  XCR0[2:1] = 11b (XMM state and YMM state are enabled b

Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

2022-08-08 Thread Juan Quintela
ling xu  wrote:
> This commit update runtime check of AVX512, and implements avx512 of
> xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
> Compared with C version of xbzrle_encode_buffer function, avx512 version
> can achieve almost 60%-70% performance improvement on unit test provided
> by Qemu. In addition, we provide one more unit test called
> "test_encode_decode_random", in which dirty data are randomly located in
> 4K page, and this case can achieve almost 140% performance gain.
>
> Signed-off-by: ling xu 
> Co-authored-by: Zhou Zhao 
> Co-authored-by: Jun Jin 
> ---
>  meson.build|  16 
>  meson_options.txt  |   2 +
>  migration/ram.c|  41 ++
>  migration/xbzrle.c | 181 +
>  migration/xbzrle.h |   4 +
>  5 files changed, 244 insertions(+)
>
> diff --git a/meson.build b/meson.build
> index 294e9a8f32..4222b77e9f 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', 
> get_option('avx512f') \
>  int main(int argc, char *argv[]) { return bar(argv[0]); }
>'''), error_message: 'AVX512F not available').allowed())
>  
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot 
> enable AVX512BW') \
> +  .require(cc.links('''
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +#include 
> +#include 
> +static int bar(void *a) {
> +
> +  __m512i x = *(__m512i *)a;
> +  __m512i res= _mm512_abs_epi8(x);
> +  return res[1];
> +}
> +int main(int argc, char *argv[]) { return bar(argv[0]); }
> +  '''), error_message: 'AVX512BW not available').allowed())
> +
>  have_pvrdma = get_option('pvrdma') \
>.require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics 
> libraries') \
>.require(cc.compiles(gnu_source_prefix + '''
> diff --git a/meson_options.txt b/meson_options.txt
> index e58e158396..07194bf680 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
> description: 'AVX2 optimizations')
>  option('avx512f', type: 'feature', value: 'disabled',
> description: 'AVX512F optimizations')
> +option('avx512bw', type: 'feature', value: 'auto',
> +   description: 'AVX512BW optimizations')
>  option('keyring', type: 'feature', value: 'auto',
> description: 'Linux keyring support')
>  

[no clue about meson, it looks ok]

> diff --git a/migration/ram.c b/migration/ram.c
> index dc1de9ddbc..d9c1ac2f7a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -83,6 +83,35 @@
>  /* 0x80 is reserved in migration.h start with 0x100 next */
>  #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100
>  
> +#if defined(CONFIG_AVX512BW_OPT)
> +static bool IS_CPU_SUPPORT_AVX512BW;

An all caps global variable?

> +#include "qemu/cpuid.h"
> +static void __attribute__((constructor)) init_cpu_flag(void)
> +{
> +unsigned max = __get_cpuid_max(0, NULL);
> +int a, b, c, d;
> +IS_CPU_SUPPORT_AVX512BW = false;
> +if (max >= 1) {
> +__cpuid(1, a, b, c, d);
> + /* We must check that AVX is not just available, but usable.  */
> +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +int bv;
> +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +__cpuid_count(7, 0, a, b, c, d);
> +   /* 0xe6:
> +*  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +*and ZMM16-ZMM31 state are enabled by OS)
> +*  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> +*/
> +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> +IS_CPU_SUPPORT_AVX512BW = true;
> +}
> +}
> +}
> +return ;
> +}
> +#endif
> +
>  XBZRLECacheStats xbzrle_counters;
>  
>  /* struct contains XBZRLE cache and a static page
> @@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t 
> **current_data,
>  memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>  
>  /* XBZRLE encoding (if there is no overflow) */
> +#if defined(CONFIG_AVX512BW_OPT)
> +if (likely(IS_CPU_SUPPORT_AVX512BW)) {

All distributions are go to have compile time support for AVX, but I am
not sure the percentage of machines that support avx

> +encoded_len = xbzrle_encode_buffer_512(prev_cached_page, 
> XBZRLE.current_buf,
> +   TARGET_PAGE_SIZE, 
> XBZRLE.encoded_buf,
> +   TARGET_PAGE_SIZE);
> +} else {
> +encoded_len = xbzrle_encode_buffer(prev_cached_page, 
> XBZRLE.current_buf,
> +   TARGET_PAGE_SIZE, 
> XBZRLE.encoded_buf,
> +   TARGET_PAGE_SIZE);
> +}

the else part is the