On Wed, Jan 16, 2019 at 4:57 PM Raghuveer Devulapalli <raghuveer.devulapa...@intel.com> wrote:
>
> From: raghuveer devulapalli <raghuveer.devulapa...@intel.com>
>
> These were found to be up to 1.8 times faster (depending on the array
> size) than the corresponding SSE2 version. The AVX2 and SSE2 versions
> were benchmarked on an Intel(R) Core(TM) i5-6260U CPU @ 1.80GHz, by
> measuring how many TSC cycles each of the avx2_combine_over_u and
> sse2_combine_over_u functions took to run for various array sizes. For
> the purpose of benchmarking, turbo was disabled and the intel_pstate
> governor was set to performance to avoid variance in CPU frequencies
> across multiple runs.
>
> | Array size | #cycles SSE2 | #cycles AVX2 |
> --------------------------------------------
> | 400        | 53966        | 32800        |
> | 800        | 107595       | 62595        |
> | 1600       | 214810       | 122482       |
> | 3200       | 429748       | 241971       |
> | 6400       | 859070       | 481076       |
>
> Also ran lowlevel-blt-bench for the OVER_8888_8888 operation, which
> also shows a 1.55x-1.79x improvement over SSE2. Here are the details:
>
> AVX2: OVER_8888_8888 = L1:2136.35 L2:2109.46 M:1751.99 ( 60.90%)
> SSE2: OVER_8888_8888 = L1:1188.91 L2:1190.63 M:1128.32 ( 40.31%)
>
> The AVX2 implementation uses the SSE2 version for manipulating pixels
> that are not 32-byte aligned. The helper functions from pixman-sse2.h
> are re-used for this purpose.
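
For context, I assume the cycle counts above came from a tight harness
roughly along these lines (my reconstruction, not code from the patch;
combine_stub is a made-up stand-in for the sse2/avx2 combiners, which
are static inside pixman and would really be called through the
pixman_implementation_t):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>   /* __rdtsc */

/* stand-in for sse2_combine_over_u / avx2_combine_over_u */
static void
combine_stub (uint32_t *dst, const uint32_t *src, int w)
{
    int i;

    for (i = 0; i < w; i++)
        dst[i] += src[i];
}

int
main (void)
{
    static const int sizes[] = { 400, 800, 1600, 3200, 6400 };
    unsigned i;

    for (i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++)
    {
        int w = sizes[i];
        uint32_t *src = malloc (w * sizeof (uint32_t));
        uint32_t *dst = calloc (w, sizeof (uint32_t));
        uint64_t start, end;
        int j;

        for (j = 0; j < w; j++)
            src[j] = 0x80ff00ff;

        /* time a single call to the combiner with the TSC */
        start = __rdtsc ();
        combine_stub (dst, src, w);
        end = __rdtsc ();

        printf ("%6d pixels: %llu cycles\n",
                w, (unsigned long long) (end - start));

        free (src);
        free (dst);
    }

    return 0;
}

A loop like that keeps source and destination hot in cache and times
nothing but the combiner, which is probably part of why it can show a
1.8x win that doesn't translate into anything measurable in complete
traces.
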
I still cannot measure any performance improvement with cairo-traces.
If we're not improving performance in any real-world application, then
I don't think it's worth adding a significant amount of code.

As I told you in person and in private mail, I suspect that you're more
likely to see real performance improvements in operations that are more
compute-heavy, like bilinear filtering. You could probably use AVX2's
gather instructions in the bilinear code as well. Filling out the
avx2_iters array would also be a good place to start, since the
iterators are what run when we do not have a specific fast path for an
operation (which will usually be the case for AVX2, given how few fast
paths it has).

I sense that you want to check this off your todo list and move on. If
that's the case, we can include the avx2_composite_over_reverse_n_8888
function I wrote (and will send as a separate patch) to confirm that
using AVX2 is capable of giving a performance improvement in some cairo
traces.
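
To make the gather suggestion concrete, this is roughly the kind of
helper I have in mind (untested sketch, not part of this patch; the
name bilinear_gather_8 and the top/bottom/offsets parameters are made
up for illustration -- the offsets and the interpolation weights would
come from the existing bilinear iterator, and the actual weighted blend
is left out):

/* Fetch the four neighbouring texels for 8 output pixels at once with
 * VPGATHERDD.  'offsets' holds the x index of the left texel for each
 * of the 8 pixels; top and bottom point at the two source scanlines. */
static force_inline void
bilinear_gather_8 (const uint32_t *top,
                   const uint32_t *bottom,
                   __m256i         offsets,
                   __m256i        *tl, __m256i *tr,
                   __m256i        *bl, __m256i *br)
{
    __m256i one = _mm256_set1_epi32 (1);

    *tl = _mm256_i32gather_epi32 ((const int *)top,    offsets, 4);
    *tr = _mm256_i32gather_epi32 ((const int *)top,
                                  _mm256_add_epi32 (offsets, one), 4);
    *bl = _mm256_i32gather_epi32 ((const int *)bottom, offsets, 4);
    *br = _mm256_i32gather_epi32 ((const int *)bottom,
                                  _mm256_add_epi32 (offsets, one), 4);
}

Whether the gathers beat eight scalar loads depends on the
microarchitecture, but they keep the surrounding arithmetic fully
vectorized, which is where I'd expect AVX2 to actually pay off.
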
> ---
>  pixman/pixman-avx2.c | 431 ++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 430 insertions(+), 1 deletion(-)
>
> diff --git a/pixman/pixman-avx2.c b/pixman/pixman-avx2.c
> index d860d67..faef552 100644
> --- a/pixman/pixman-avx2.c
> +++ b/pixman/pixman-avx2.c
> @@ -6,13 +6,439 @@
>  #include "pixman-private.h"
>  #include "pixman-combine32.h"
>  #include "pixman-inlines.h"
> +#include "pixman-sse2.h"
>
> +#define MASK_0080_AVX2 _mm256_set1_epi16(0x0080)
> +#define MASK_00FF_AVX2 _mm256_set1_epi16(0x00ff)
> +#define MASK_0101_AVX2 _mm256_set1_epi16(0x0101)
> +
> +static force_inline __m256i
> +load_256_aligned (__m256i* src)
> +{
> +    return _mm256_load_si256(src);
> +}
> +
> +static force_inline void
> +negate_2x256 (__m256i  data_lo,
> +              __m256i  data_hi,
> +              __m256i* neg_lo,
> +              __m256i* neg_hi)
> +{
> +    *neg_lo = _mm256_xor_si256 (data_lo, MASK_00FF_AVX2);
> +    *neg_hi = _mm256_xor_si256 (data_hi, MASK_00FF_AVX2);
> +}
> +
> +static force_inline __m256i
> +pack_2x256_256 (__m256i lo, __m256i hi)
> +{
> +    return _mm256_packus_epi16 (lo, hi);
> +}
> +

Stray space

> +static force_inline void
> +pix_multiply_2x256 (__m256i* data_lo,
> +                    __m256i* data_hi,
> +                    __m256i* alpha_lo,
> +                    __m256i* alpha_hi,
> +                    __m256i* ret_lo,
> +                    __m256i* ret_hi)
> +{
> +    __m256i lo, hi;
> +
> +    lo = _mm256_mullo_epi16 (*data_lo, *alpha_lo);
> +    hi = _mm256_mullo_epi16 (*data_hi, *alpha_hi);
> +    lo = _mm256_adds_epu16 (lo, MASK_0080_AVX2);
> +    hi = _mm256_adds_epu16 (hi, MASK_0080_AVX2);
> +    *ret_lo = _mm256_mulhi_epu16 (lo, MASK_0101_AVX2);
> +    *ret_hi = _mm256_mulhi_epu16 (hi, MASK_0101_AVX2);
> +}
> +

Stray space

> +static force_inline void
> +over_2x256 (__m256i* src_lo,
> +            __m256i* src_hi,
> +            __m256i* alpha_lo,
> +            __m256i* alpha_hi,
> +            __m256i* dst_lo,
> +            __m256i* dst_hi)
> +{
> +    __m256i t1, t2;
> +
> +    negate_2x256 (*alpha_lo, *alpha_hi, &t1, &t2);
> +
> +    pix_multiply_2x256 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
> +
> +    *dst_lo = _mm256_adds_epu8 (*src_lo, *dst_lo);
> +    *dst_hi = _mm256_adds_epu8 (*src_hi, *dst_hi);
> +}
> +
> +static force_inline void
> +expand_alpha_2x256 (__m256i  data_lo,
> +                    __m256i  data_hi,
> +                    __m256i* alpha_lo,
> +                    __m256i* alpha_hi)
> +{
> +    __m256i lo, hi;
> +
> +    lo = _mm256_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
> +    hi = _mm256_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
> +
> +    *alpha_lo = _mm256_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
> +    *alpha_hi = _mm256_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
> +}
> +
> +static force_inline void
> +unpack_256_2x256 (__m256i data, __m256i* data_lo, __m256i* data_hi)
> +{
> +    *data_lo = _mm256_unpacklo_epi8 (data, _mm256_setzero_si256 ());
> +    *data_hi = _mm256_unpackhi_epi8 (data, _mm256_setzero_si256 ());
> +}
> +
> +/* save 4 pixels on a 16-byte boundary aligned address */
> +static force_inline void
> +save_256_aligned (__m256i* dst,
> +                  __m256i  data)
> +{
> +    _mm256_store_si256 (dst, data);
> +}
> +
> +static force_inline int
> +is_opaque_256 (__m256i x)
> +{
> +    __m256i ffs = _mm256_cmpeq_epi8 (x, x);
> +
> +    return (_mm256_movemask_epi8
> +            (_mm256_cmpeq_epi8 (x, ffs)) & 0x88888888) == 0x88888888;
> +}
> +
> +static force_inline int
> +is_zero_256 (__m256i x)
> +{
> +    return _mm256_movemask_epi8 (
> +        _mm256_cmpeq_epi8 (x, _mm256_setzero_si256 ())) == 0xffffffff;
> +}
> +
> +static force_inline int
> +is_transparent_256 (__m256i x)
> +{
> +    return (_mm256_movemask_epi8 (
> +        _mm256_cmpeq_epi8 (x, _mm256_setzero_si256 ())) & 0x88888888)
> +        == 0x88888888;
> +}
> +
> +

Extra newline

> +/* load 4 pixels from a unaligned address */
> +static force_inline __m256i
> +load_256_unaligned (const __m256i* src)
> +{
> +    return _mm256_loadu_si256 (src);
> +}
> +
> +static force_inline __m256i
> +combine8 (const __m256i *ps, const __m256i *pm)
> +{
> +    __m256i ymm_src_lo, ymm_src_hi;
> +    __m256i ymm_msk_lo, ymm_msk_hi;
> +    __m256i s;
> +
> +    if (pm)
> +    {
> +        ymm_msk_lo = load_256_unaligned (pm);
> +
> +        if (is_transparent_256 (ymm_msk_lo))
> +            return _mm256_setzero_si256 ();
> +    }
> +
> +    s = load_256_unaligned (ps);
> +
> +    if (pm)
> +    {
> +        unpack_256_2x256 (s, &ymm_src_lo, &ymm_src_hi);
> +        unpack_256_2x256 (ymm_msk_lo, &ymm_msk_lo, &ymm_msk_hi);
> +
> +        expand_alpha_2x256 (ymm_msk_lo, ymm_msk_hi, &ymm_msk_lo, &ymm_msk_hi);
> +
> +        pix_multiply_2x256 (&ymm_src_lo, &ymm_src_hi,
> +                            &ymm_msk_lo, &ymm_msk_hi,
> +                            &ymm_src_lo, &ymm_src_hi);
> +
> +        s = pack_2x256_256 (ymm_src_lo, ymm_src_hi);
> +    }
> +
> +    return s;
> +}
> +
> +static force_inline void
> +core_combine_over_u_avx2_mask (uint32_t *      pd,
> +                               const uint32_t* ps,
> +                               const uint32_t* pm,
> +                               int             w)
> +{
> +    uint32_t s, d;
> +    while (w && ((uintptr_t)pd & 31))
> +    {
> +        d = *pd;
> +        s = combine1 (ps, pm);
> +
> +        if (s)
> +            *pd = core_combine_over_u_pixel_sse2 (s, d);
> +        pd++;
> +        ps++;
> +        pm++;
> +        w--;
> +    }
> +

Is the alignment loop here actually important for performance? As far
as I know unaligned loads are cheap on all CPUs with AVX2. We might be
able to do unaligned loads/stores in the loop below and get into the
vectorized code sooner.
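
I.e., something along these lines -- untested, the function name is
made up, and I'm showing the simpler no-mask variant for brevity; it
reuses the helpers from this patch and only differs in dropping the
scalar alignment prologue and storing through _mm256_storeu_si256:

static force_inline void
core_combine_over_u_avx2_no_mask_unaligned (uint32_t       *pd,
                                            const uint32_t *ps,
                                            int             w)
{
    /* no alignment prologue: go straight to 8 pixels per iteration */
    while (w >= 8)
    {
        __m256i src = load_256_unaligned ((__m256i *)ps);

        if (!is_zero_256 (src))
        {
            if (is_opaque_256 (src))
            {
                _mm256_storeu_si256 ((__m256i *)pd, src);
            }
            else
            {
                __m256i dst = load_256_unaligned ((__m256i *)pd);
                __m256i src_lo, src_hi, dst_lo, dst_hi;
                __m256i alpha_lo, alpha_hi;

                unpack_256_2x256 (src, &src_lo, &src_hi);
                unpack_256_2x256 (dst, &dst_lo, &dst_hi);
                expand_alpha_2x256 (src_lo, src_hi, &alpha_lo, &alpha_hi);
                over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                /* unaligned store instead of save_256_aligned */
                _mm256_storeu_si256 ((__m256i *)pd,
                                     pack_2x256_256 (dst_lo, dst_hi));
            }
        }

        pd += 8;
        ps += 8;
        w -= 8;
    }

    /* only the last w % 8 pixels go through the scalar path */
    while (w)
    {
        uint32_t s = *ps, d = *pd;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }
}

Whether the unaligned stores cost anything in practice is worth
measuring, but if they don't, both this masked loop and the unmasked
one below could drop their prologues.
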
> +    /*
> +     * dst is 32 byte aligned, and w >=8 means the next 256 bits
> +     * contain relevant data
> +     */
> +

Stray whitespace, and */ is not aligned

> +    while (w >= 8)
> +    {
> +        __m256i mask = load_256_unaligned ((__m256i *)pm);
> +
> +        if (!is_zero_256 (mask))
> +        {
> +            __m256i src;
> +            __m256i src_hi, src_lo;
> +            __m256i mask_hi, mask_lo;
> +            __m256i alpha_hi, alpha_lo;
> +
> +            src = load_256_unaligned ((__m256i *)ps);
> +
> +            if (is_opaque_256 (_mm256_and_si256 (src, mask)))
> +            {
> +                save_256_aligned ((__m256i *)pd, src);
> +            }
> +            else
> +            {
> +                __m256i dst = load_256_aligned ((__m256i *)pd);
> +                __m256i dst_hi, dst_lo;
> +
> +                unpack_256_2x256 (mask, &mask_lo, &mask_hi);
> +                unpack_256_2x256 (src, &src_lo, &src_hi);
> +
> +                expand_alpha_2x256 (mask_lo, mask_hi, &mask_lo, &mask_hi);
> +                pix_multiply_2x256 (&src_lo, &src_hi,
> +                                    &mask_lo, &mask_hi,
> +                                    &src_lo, &src_hi);
> +

Stray spaces again

> +                unpack_256_2x256 (dst, &dst_lo, &dst_hi);
> +                expand_alpha_2x256 (src_lo, src_hi,
> +                                    &alpha_lo, &alpha_hi);
> +
> +                over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
> +                            &dst_lo, &dst_hi);
> +
> +                save_256_aligned (
> +                    (__m256i *)pd,
> +                    pack_2x256_256 (dst_lo, dst_hi));
> +            }
> +        }
> +        pm += 8;
> +        ps += 8;
> +        pd += 8;
> +        w -= 8;
> +    }
> +
> +    while (w)
> +    {
> +        d = *pd;
> +        s = combine1 (ps, pm);
> +
> +        if (s)
> +            *pd = core_combine_over_u_pixel_sse2 (s, d);
> +        pd++;
> +        ps++;
> +        pm++;
> +        w--;
> +    }
> +}
> +
> +static force_inline void
> +core_combine_over_u_avx2_no_mask (uint32_t *      pd,
> +                                  const uint32_t* ps,
> +                                  int             w)
> +{
> +    uint32_t s, d;
> +
> +    /* Align dst on a 16-byte boundary */
> +    while (w && ((uintptr_t)pd & 31))
> +    {
> +        d = *pd;
> +        s = *ps;
> +
> +        if (s)
> +            *pd = core_combine_over_u_pixel_sse2 (s, d);
> +        pd++;
> +        ps++;
> +        w--;
> +    }
> +
> +    while (w >= 8)
> +    {
> +        __m256i src;
> +        __m256i src_hi, src_lo, dst_hi, dst_lo;
> +        __m256i alpha_hi, alpha_lo;
> +
> +        src = load_256_unaligned ((__m256i *)ps);
> +
> +        if (!is_zero_256 (src))
> +        {
> +            if (is_opaque_256 (src))
> +            {
> +                save_256_aligned ((__m256i *)pd, src);
> +            }
> +            else
> +            {
> +                __m256i dst = load_256_aligned ((__m256i *)pd);
> +
> +                unpack_256_2x256 (src, &src_lo, &src_hi);
> +                unpack_256_2x256 (dst, &dst_lo, &dst_hi);
> +
> +                expand_alpha_2x256 (src_lo, src_hi,
> +                                    &alpha_lo, &alpha_hi);
> +                over_2x256 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
> +                            &dst_lo, &dst_hi);
> +
> +                save_256_aligned (
> +                    (__m256i *)pd,
> +                    pack_2x256_256 (dst_lo, dst_hi));
> +            }
> +        }
> +
> +        ps += 8;
> +        pd += 8;
> +        w -= 8;
> +    }

Should have a blank line here

> +    while (w)
> +    {
> +        d = *pd;
> +        s = *ps;
> +
> +        if (s)
> +            *pd = core_combine_over_u_pixel_sse2 (s, d);
> +        pd++;
> +        ps++;
> +        w--;
> +    }
> +}
> +
> +static force_inline void
> +avx2_combine_over_u (pixman_implementation_t *imp,
> +                     pixman_op_t              op,
> +                     uint32_t *               pd,
> +                     const uint32_t *         ps,
> +                     const uint32_t *         pm,
> +                     int                      w)
> +{
> +    if (pm)
> +        core_combine_over_u_avx2_mask (pd, ps, pm, w);
> +    else
> +        core_combine_over_u_avx2_no_mask (pd, ps, w);
> +}
> +
> +static void
> +avx2_combine_over_reverse_u (pixman_implementation_t *imp,
> +                             pixman_op_t              op,
> +                             uint32_t *               pd,
> +                             const uint32_t *         ps,
> +                             const uint32_t *         pm,
> +                             int                      w)
> +{
> +    uint32_t s, d;
> +
> +    __m256i ymm_dst_lo, ymm_dst_hi;
> +    __m256i ymm_src_lo, ymm_src_hi;
> +    __m256i ymm_alpha_lo, ymm_alpha_hi;
> +
> +    /* Align dst on a 16-byte boundary */
> +    while (w &&
> +           ((uintptr_t)pd & 31))
> +    {
> +        d = *pd;
> +        s = combine1 (ps, pm);
> +
> +        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
> +        w--;
> +        ps++;
> +        if (pm)
> +            pm++;
> +    }
> +
> +    while (w >= 8)
> +    {
> +        ymm_src_hi = combine8 ((__m256i*)ps, (__m256i*)pm);
> +        ymm_dst_hi = load_256_aligned ((__m256i*) pd);
> +
> +        unpack_256_2x256 (ymm_src_hi, &ymm_src_lo, &ymm_src_hi);
> +        unpack_256_2x256 (ymm_dst_hi, &ymm_dst_lo, &ymm_dst_hi);
> +
> +        expand_alpha_2x256 (ymm_dst_lo, ymm_dst_hi,
> +                            &ymm_alpha_lo, &ymm_alpha_hi);
> +
> +        over_2x256 (&ymm_dst_lo, &ymm_dst_hi,
> +                    &ymm_alpha_lo, &ymm_alpha_hi,
> +                    &ymm_src_lo, &ymm_src_hi);
> +
> +        /* rebuid the 4 pixel data and save*/
> +        save_256_aligned ((__m256i*)pd,
> +                          pack_2x256_256 (ymm_src_lo, ymm_src_hi));
> +
> +        w -= 8;
> +        ps += 8;
> +        pd += 8;
> +
> +        if (pm)
> +            pm += 8;
> +    }
> +
> +    while (w)
> +    {
> +        d = *pd;
> +        s = combine1 (ps, pm);
> +
> +        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
> +        ps++;
> +        w--;
> +        if (pm)
> +            pm++;
> +    }
> +}
> +
> +static void
> +avx2_composite_over_8888_8888 (pixman_implementation_t *imp,
> +                               pixman_composite_info_t *info)
> +{
> +    PIXMAN_COMPOSITE_ARGS (info);
> +    int dst_stride, src_stride;
> +    uint32_t *dst_line, *dst;
> +    uint32_t *src_line, *src;
> +
> +    PIXMAN_IMAGE_GET_LINE (
> +        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
> +    PIXMAN_IMAGE_GET_LINE (
> +        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
> +
> +    dst = dst_line;
> +    src = src_line;
> +
> +    while (height--)
> +    {
> +        avx2_combine_over_u (imp, op, dst, src, NULL, width);
> +
> +        dst += dst_stride;
> +        src += src_stride;
> +    }
> +}

Should have a blank line here

>  static const pixman_fast_path_t avx2_fast_paths[] =
>  {
> +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8,
> avx2_composite_over_8888_8888),
> +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8,
> avx2_composite_over_8888_8888),
> +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8,
> avx2_composite_over_8888_8888),
> +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8,
> avx2_composite_over_8888_8888),
>      { PIXMAN_OP_NONE },
>  };
>
> -static const pixman_iter_info_t avx2_iters[] = 
> +static const pixman_iter_info_t avx2_iters[] =

There was stray trailing whitespace added on this line in the earlier
commit, and this hunk removes it again. Please just remove it from the
original commit.

>  {
>      { PIXMAN_null },
>  };
> @@ -26,6 +452,9 @@ _pixman_implementation_create_avx2
> (pixman_implementation_t *fallback)
>      pixman_implementation_t *imp = _pixman_implementation_create (fallback,
> avx2_fast_paths);
>
>      /* Set up function pointers */
> +    imp->combine_32[PIXMAN_OP_OVER] = avx2_combine_over_u;
> +    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = avx2_combine_over_reverse_u;
> +

More stray spaces

>      imp->iter_info = avx2_iters;
>
>      return imp;
> --
> 2.17.1
>
_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/pixman