RE: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
Hi, Richard, Thanks for your nice comments! Your suggestions are very helpful. We have revised code in ram.c according to your comments. As for "unroll residual from main loop" problem in algorithm, we will fix this later. Thanks for your time and patience~ Best Regards, Ling -Original Message- From: Richard Henderson Sent: Wednesday, August 10, 2022 2:25 AM To: Xu, Ling1 ; quint...@redhat.com Cc: qemu-devel@nongnu.org; dgilb...@redhat.com; Zhao, Zhou ; Jin, Jun I Subject: Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function On 8/9/22 00:51, Xu, Ling1 wrote: > Hi, Juan, >Thanks for your advice. We have revised our code including: 1) change > "IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that > variable isn't global variable; You can remove this variable entirely... > 2) use a function pointer to simplify code in ram.c; ... because it's redundant with the function pointer. r~
Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
On 8/8/22 00:48, ling xu wrote: This commit update runtime check of AVX512, and implements avx512 of xbzrle_encode_buffer function to accelerate xbzrle encoding speed. Compared with C version of xbzrle_encode_buffer function, avx512 version can achieve almost 60%-70% performance improvement on unit test provided by Qemu. In addition, we provide one more unit test called "test_encode_decode_random", in which dirty data are randomly located in 4K page, and this case can achieve almost 140% performance gain. Signed-off-by: ling xu Co-authored-by: Zhou Zhao Co-authored-by: Jun Jin --- meson.build| 16 meson_options.txt | 2 + migration/ram.c| 41 ++ migration/xbzrle.c | 181 + migration/xbzrle.h | 4 + 5 files changed, 244 insertions(+) diff --git a/meson.build b/meson.build index 294e9a8f32..4222b77e9f 100644 --- a/meson.build +++ b/meson.build @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \ int main(int argc, char *argv[]) { return bar(argv[0]); } '''), error_message: 'AVX512F not available').allowed()) +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \ + .require(cc.links(''' +#pragma GCC push_options +#pragma GCC target("avx512bw") +#include +#include +static int bar(void *a) { + + __m512i x = *(__m512i *)a; + __m512i res= _mm512_abs_epi8(x); + return res[1]; +} +int main(int argc, char *argv[]) { return bar(argv[0]); } + '''), error_message: 'AVX512BW not available').allowed()) + have_pvrdma = get_option('pvrdma') \ .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \ .require(cc.compiles(gnu_source_prefix + ''' diff --git a/meson_options.txt b/meson_options.txt index e58e158396..07194bf680 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', description: 'AVX2 optimizations') option('avx512f', type: 'feature', value: 'disabled', description: 'AVX512F optimizations') +option('avx512bw', type: 'feature', value: 'auto', + description: 'AVX512BW optimizations') option('keyring', type: 'feature', value: 'auto', description: 'Linux keyring support') diff --git a/migration/ram.c b/migration/ram.c index dc1de9ddbc..d9c1ac2f7a 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -83,6 +83,35 @@ /* 0x80 is reserved in migration.h start with 0x100 next */ #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 +#if defined(CONFIG_AVX512BW_OPT) +static bool IS_CPU_SUPPORT_AVX512BW; +#include "qemu/cpuid.h" +static void __attribute__((constructor)) init_cpu_flag(void) +{ +unsigned max = __get_cpuid_max(0, NULL); +int a, b, c, d; +IS_CPU_SUPPORT_AVX512BW = false; +if (max >= 1) { +__cpuid(1, a, b, c, d); + /* We must check that AVX is not just available, but usable. */ +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { +int bv; +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); +__cpuid_count(7, 0, a, b, c, d); + /* 0xe6: +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 +*and ZMM16-ZMM31 state are enabled by OS) +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) +*/ +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { +IS_CPU_SUPPORT_AVX512BW = true; +} +} +} +return ; +} +#endif + XBZRLECacheStats xbzrle_counters; /* struct contains XBZRLE cache and a static page @@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data, memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ +#if defined(CONFIG_AVX512BW_OPT) +if (likely(IS_CPU_SUPPORT_AVX512BW)) { +encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf, + TARGET_PAGE_SIZE, XBZRLE.encoded_buf, + TARGET_PAGE_SIZE); +} else { +encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, + TARGET_PAGE_SIZE, XBZRLE.encoded_buf, + TARGET_PAGE_SIZE); +} +#else encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE, XBZRLE.encoded_buf, TARGET_PAGE_SIZE); +#endif /* * Update the cache contents, so that it corresponds to the data diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 1ba482ded9..4db09fdbdb
Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
On 8/9/22 00:51, Xu, Ling1 wrote: Hi, Juan, Thanks for your advice. We have revised our code including: 1) change "IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that variable isn't global variable; You can remove this variable entirely... 2) use a function pointer to simplify code in ram.c; ... because it's redundant with the function pointer. r~
RE: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
Hi, Juan, Thanks for your advice. We have revised our code including: 1) change "IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that variable isn't global variable; 2) use a function pointer to simplify code in ram.c; 3) change function name "xbzrle_encode_buffer_512" to "xbzrle_encode_buffer_avx512", change variable "res" to "countResidual" for better understanding, and replace "unsigned long long" with "uint64_t". We will submit patch v4 to fix all issues mentioned in comments. Best Regard, Ling -Original Message- From: Juan Quintela Sent: Monday, August 8, 2022 9:12 PM To: Xu, Ling1 Cc: qemu-devel@nongnu.org; dgilb...@redhat.com; Zhao, Zhou ; Jin, Jun I Subject: Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function ling xu wrote: > This commit update runtime check of AVX512, and implements avx512 of > xbzrle_encode_buffer function to accelerate xbzrle encoding speed. > Compared with C version of xbzrle_encode_buffer function, avx512 > version can achieve almost 60%-70% performance improvement on unit > test provided by Qemu. In addition, we provide one more unit test > called "test_encode_decode_random", in which dirty data are randomly > located in 4K page, and this case can achieve almost 140% performance gain. > > Signed-off-by: ling xu > Co-authored-by: Zhou Zhao > Co-authored-by: Jun Jin > --- > meson.build| 16 > meson_options.txt | 2 + > migration/ram.c| 41 ++ > migration/xbzrle.c | 181 + > migration/xbzrle.h | 4 + > 5 files changed, 244 insertions(+) > > diff --git a/meson.build b/meson.build index 294e9a8f32..4222b77e9f > 100644 > --- a/meson.build > +++ b/meson.build > @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', > get_option('avx512f') \ > int main(int argc, char *argv[]) { return bar(argv[0]); } >'''), error_message: 'AVX512F not available').allowed()) > > +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ > + .require(have_cpuid_h, error_message: 'cpuid.h not available, > +cannot enable AVX512BW') \ > + .require(cc.links(''' > +#pragma GCC push_options > +#pragma GCC target("avx512bw") > +#include > +#include > +static int bar(void *a) { > + > + __m512i x = *(__m512i *)a; > + __m512i res= _mm512_abs_epi8(x); > + return res[1]; > +} > +int main(int argc, char *argv[]) { return bar(argv[0]); } '''), > + error_message: 'AVX512BW not available').allowed()) > + > have_pvrdma = get_option('pvrdma') \ >.require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics > libraries') \ >.require(cc.compiles(gnu_source_prefix + ''' > diff --git a/meson_options.txt b/meson_options.txt index > e58e158396..07194bf680 100644 > --- a/meson_options.txt > +++ b/meson_options.txt > @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', > description: 'AVX2 optimizations') option('avx512f', type: > 'feature', value: 'disabled', > description: 'AVX512F optimizations') > +option('avx512bw', type: 'feature', value: 'auto', > + description: 'AVX512BW optimizations') > option('keyring', type: 'feature', value: 'auto', > description: 'Linux keyring support') > [no clue about meson, it looks ok] > diff --git a/migration/ram.c b/migration/ram.c index > dc1de9ddbc..d9c1ac2f7a 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -83,6 +83,35 @@ > /* 0x80 is reserved in migration.h start with 0x100 next */ > #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 > > +#if defined(CONFIG_AVX512BW_OPT) > +static bool IS_CPU_SUPPORT_AVX512BW; An all caps global variable? > +#include "qemu/cpuid.h" > +static void __attribute__((constructor)) init_cpu_flag(void) { > +unsigned max = __get_cpuid_max(0, NULL); > +int a, b, c, d; > +IS_CPU_SUPPORT_AVX512BW = false; > +if (max >= 1) { > +__cpuid(1, a, b, c, d); > + /* We must check that AVX is not just available, but usable. */ > +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { > +int bv; > +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); > +__cpuid_count(7, 0, a, b, c, d); > + /* 0xe6: > +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 > +*and ZMM16-ZMM31 state are enabled by OS) > +* XCR0[2:1] = 11b (XMM state and YMM state are enabled b
Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
ling xu wrote: > This commit update runtime check of AVX512, and implements avx512 of > xbzrle_encode_buffer function to accelerate xbzrle encoding speed. > Compared with C version of xbzrle_encode_buffer function, avx512 version > can achieve almost 60%-70% performance improvement on unit test provided > by Qemu. In addition, we provide one more unit test called > "test_encode_decode_random", in which dirty data are randomly located in > 4K page, and this case can achieve almost 140% performance gain. > > Signed-off-by: ling xu > Co-authored-by: Zhou Zhao > Co-authored-by: Jun Jin > --- > meson.build| 16 > meson_options.txt | 2 + > migration/ram.c| 41 ++ > migration/xbzrle.c | 181 + > migration/xbzrle.h | 4 + > 5 files changed, 244 insertions(+) > > diff --git a/meson.build b/meson.build > index 294e9a8f32..4222b77e9f 100644 > --- a/meson.build > +++ b/meson.build > @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', > get_option('avx512f') \ > int main(int argc, char *argv[]) { return bar(argv[0]); } >'''), error_message: 'AVX512F not available').allowed()) > > +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \ > + .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot > enable AVX512BW') \ > + .require(cc.links(''' > +#pragma GCC push_options > +#pragma GCC target("avx512bw") > +#include > +#include > +static int bar(void *a) { > + > + __m512i x = *(__m512i *)a; > + __m512i res= _mm512_abs_epi8(x); > + return res[1]; > +} > +int main(int argc, char *argv[]) { return bar(argv[0]); } > + '''), error_message: 'AVX512BW not available').allowed()) > + > have_pvrdma = get_option('pvrdma') \ >.require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics > libraries') \ >.require(cc.compiles(gnu_source_prefix + ''' > diff --git a/meson_options.txt b/meson_options.txt > index e58e158396..07194bf680 100644 > --- a/meson_options.txt > +++ b/meson_options.txt > @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto', > description: 'AVX2 optimizations') > option('avx512f', type: 'feature', value: 'disabled', > description: 'AVX512F optimizations') > +option('avx512bw', type: 'feature', value: 'auto', > + description: 'AVX512BW optimizations') > option('keyring', type: 'feature', value: 'auto', > description: 'Linux keyring support') > [no clue about meson, it looks ok] > diff --git a/migration/ram.c b/migration/ram.c > index dc1de9ddbc..d9c1ac2f7a 100644 > --- a/migration/ram.c > +++ b/migration/ram.c > @@ -83,6 +83,35 @@ > /* 0x80 is reserved in migration.h start with 0x100 next */ > #define RAM_SAVE_FLAG_COMPRESS_PAGE0x100 > > +#if defined(CONFIG_AVX512BW_OPT) > +static bool IS_CPU_SUPPORT_AVX512BW; An all caps global variable? > +#include "qemu/cpuid.h" > +static void __attribute__((constructor)) init_cpu_flag(void) > +{ > +unsigned max = __get_cpuid_max(0, NULL); > +int a, b, c, d; > +IS_CPU_SUPPORT_AVX512BW = false; > +if (max >= 1) { > +__cpuid(1, a, b, c, d); > + /* We must check that AVX is not just available, but usable. */ > +if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { > +int bv; > +__asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); > +__cpuid_count(7, 0, a, b, c, d); > + /* 0xe6: > +* XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 > +*and ZMM16-ZMM31 state are enabled by OS) > +* XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) > +*/ > +if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) { > +IS_CPU_SUPPORT_AVX512BW = true; > +} > +} > +} > +return ; > +} > +#endif > + > XBZRLECacheStats xbzrle_counters; > > /* struct contains XBZRLE cache and a static page > @@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t > **current_data, > memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); > > /* XBZRLE encoding (if there is no overflow) */ > +#if defined(CONFIG_AVX512BW_OPT) > +if (likely(IS_CPU_SUPPORT_AVX512BW)) { All distributions are go to have compile time support for AVX, but I am not sure the percentage of machines that support avx > +encoded_len = xbzrle_encode_buffer_512(prev_cached_page, > XBZRLE.current_buf, > + TARGET_PAGE_SIZE, > XBZRLE.encoded_buf, > + TARGET_PAGE_SIZE); > +} else { > +encoded_len = xbzrle_encode_buffer(prev_cached_page, > XBZRLE.current_buf, > + TARGET_PAGE_SIZE, > XBZRLE.encoded_buf, > + TARGET_PAGE_SIZE); > +} the else part is the