On 20/01/2016 10:05, Liang Li wrote: > buffer_find_nonzero_offset() is a hot function during live migration. > It currently uses SSE2 instructions for optimization. On platforms that > support AVX2 instructions, using AVX2 for this optimization can > improve performance by about 30% compared to SSE2. > > The zero page check is faster with this optimization; test results > show that for an idle guest with 8GiB of RAM that has just booted, this patch > shortens the total live migration time by about 6%. > > This patch uses the ifunc mechanism to select the proper function at > run time: on platforms that support AVX2, the AVX2 instructions are executed; > otherwise, the original instructions are executed. > > Signed-off-by: Liang Li <liang.z...@intel.com>
Reviewed-by: Paolo Bonzini <pbonz...@redhat.com> > --- > include/qemu-common.h | 8 +--- > util/cutils.c | 118 > ++++++++++++++++++++++++++++++++++++++++++++++++-- > 2 files changed, 115 insertions(+), 11 deletions(-) > > diff --git a/include/qemu-common.h b/include/qemu-common.h > index 22b010c..f4c8c24 100644 > --- a/include/qemu-common.h > +++ b/include/qemu-common.h > @@ -483,13 +483,7 @@ void qemu_hexdump(const char *buf, FILE *fp, const char > *prefix, size_t size); > #endif > > #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 > -static inline bool > -can_use_buffer_find_nonzero_offset(const void *buf, size_t len) > -{ > - return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR > - * sizeof(VECTYPE)) == 0 > - && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); > -} > +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len); > size_t buffer_find_nonzero_offset(const void *buf, size_t len); > > /* > diff --git a/util/cutils.c b/util/cutils.c > index cfeb848..5c8ee5c 100644 > --- a/util/cutils.c > +++ b/util/cutils.c > @@ -161,6 +161,14 @@ int qemu_fdatasync(int fd) > #endif > } > > +static bool > +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len) > +{ > + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR > + * sizeof(VECTYPE)) == 0 > + && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); > +} > + > /* > * Searches for an area with non-zero content in a buffer > * > @@ -169,8 +177,8 @@ int qemu_fdatasync(int fd) > * and addr must be a multiple of sizeof(VECTYPE) due to > * restriction of optimizations in this function. > * > - * can_use_buffer_find_nonzero_offset() can be used to check > - * these requirements. > + * can_use_buffer_find_nonzero_offset_inner() can be used to > + * check these requirements. 
> * > * The return value is the offset of the non-zero area rounded > * down to a multiple of sizeof(VECTYPE) for the first > @@ -181,13 +189,13 @@ int qemu_fdatasync(int fd) > * If the buffer is all zero the return value is equal to len. > */ > > -size_t buffer_find_nonzero_offset(const void *buf, size_t len) > +static size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len) > { > const VECTYPE *p = buf; > const VECTYPE zero = (VECTYPE){0}; > size_t i; > > - assert(can_use_buffer_find_nonzero_offset(buf, len)); > + assert(can_use_buffer_find_nonzero_offset_inner(buf, len)); > > if (!len) { > return 0; > @@ -216,6 +224,108 @@ size_t buffer_find_nonzero_offset(const void *buf, > size_t len) > return i * sizeof(VECTYPE); > } > > +#ifdef CONFIG_AVX2_OPT > +#pragma GCC push_options > +#pragma GCC target("avx2") > +#include <cpuid.h> > +#include <immintrin.h> > + > +#define AVX2_VECTYPE __m256i > +#define AVX2_SPLAT(p) _mm256_set1_epi8(*(p)) > +#define AVX2_ALL_EQ(v1, v2) \ > + (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF) > +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2)) > + > +static bool > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len) > +{ > + return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR > + * sizeof(AVX2_VECTYPE)) == 0 > + && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); > +} > + > +static size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) > +{ > + const AVX2_VECTYPE *p = buf; > + const AVX2_VECTYPE zero = (AVX2_VECTYPE){0}; > + size_t i; > + > + assert(can_use_buffer_find_nonzero_offset_avx2(buf, len)); > + > + if (!len) { > + return 0; > + } > + > + for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { > + if (!AVX2_ALL_EQ(p[i], zero)) { > + return i * sizeof(AVX2_VECTYPE); > + } > + } > + > + for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; > + i < len / sizeof(AVX2_VECTYPE); > + i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { > + AVX2_VECTYPE tmp0 = 
AVX2_VEC_OR(p[i + 0], p[i + 1]); > + AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]); > + AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]); > + AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]); > + AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1); > + AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3); > + if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) { > + break; > + } > + } > + > + return i * sizeof(AVX2_VECTYPE); > +} > + > +static bool avx2_support(void) > +{ > + int a, b, c, d; > + > + if (__get_cpuid_max(0, NULL) < 7) { > + return false; > + } > + > + __cpuid_count(7, 0, a, b, c, d); > + > + return b & bit_AVX2; > +} > + > +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) \ > + __attribute__ ((ifunc("can_use_buffer_find_nonzero_offset_ifunc"))); > +size_t buffer_find_nonzero_offset(const void *buf, size_t len) \ > + __attribute__ ((ifunc("buffer_find_nonzero_offset_ifunc"))); > + > +static void *buffer_find_nonzero_offset_ifunc(void) > +{ > + typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ? > + buffer_find_nonzero_offset_avx2 : buffer_find_nonzero_offset_inner; > + > + return func; > +} > + > +static void *can_use_buffer_find_nonzero_offset_ifunc(void) > +{ > + typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ? > + can_use_buffer_find_nonzero_offset_avx2 : > + can_use_buffer_find_nonzero_offset_inner; > + > + return func; > +} > +#pragma GCC pop_options > +#else > +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len) > +{ > + return can_use_buffer_find_nonzero_offset_inner(buf, len); > +} > + > +size_t buffer_find_nonzero_offset(const void *buf, size_t len) > +{ > + return buffer_find_nonzero_offset_inner(buf, len); > +} > +#endif > + > /* > * Checks if a buffer is all zeroes > * >