POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le. reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)
Before After Change
---------------------------------------------
L1 90.21 133.21 +47.67%
L2 94.91 132.95 +40.08%
M 95.49 132.53 +38.79%
HT 88.07 100.43 +14.03%
VT 86.65 112.45 +29.77%
R 82.77 96.25 +16.29%
RT 65.64 55.14 -16.00%
Kops/s 673 580 -13.82%

cairo trimmed benchmarks :

Speedups
========
t-firefox-asteroids 533.92 -> 495.51 : 1.08x

Slowdowns
=========
t-poppler 364.99 -> 393.72 : 1.08x
t-firefox-canvas-alpha 984.55 -> 1197.85 : 1.22x

Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
---
 pixman/pixman-vmx.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 966219f..5c74a47 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2557,6 +2557,155 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
     }
 }
 
+/*
+ * Composite a solid source through an a8 mask onto a 32bpp destination;
+ * registered below for the OVER operator.  The per-pixel arithmetic is
+ * delegated to in_over () / in_over_2x128 ().
+ *
+ * Each scanline is handled in three phases: single pixels until dst is
+ * 16-byte aligned, four pixels per iteration with aligned vector
+ * loads/stores, then the remaining (fewer than four) pixels.
+ */
+static void
+vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    vector unsigned int vsrc, valpha, vmask;
+
+    vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi;
+    vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    /* An all-zero source pixel: return without touching dest. */
+    if (src == 0)
+        return;
+
+    srca = src >> 24;
+
+    PIXMAN_IMAGE_GET_LINE (
+        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vmask = create_mask_1x32_128 (&src);
+    vsrc = expand_pixel_32_1x128 (src);
+    valpha = expand_alpha_1x128 (vsrc);
+
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+        w = width;
+
+        /* Head: one pixel at a time until dst is 16-byte aligned. */
+        while (w && ((uintptr_t)dst & 15))
+        {
+            uint8_t m = *mask++;
+
+            if (m)
+            {
+                uint32_t d = *dst;
+
+                vmx_mask = expand_pixel_8_1x128 (m);
+                vmx_dst = unpack_32_1x128 (d);
+
+                *dst = pack_1x128_32 (in_over (vsrc,
+                                               valpha,
+                                               vmx_mask,
+                                               vmx_dst));
+            }
+
+            w--;
+            dst++;
+        }
+
+        /* Body: four pixels per iteration, aligned vector access. */
+        while (w >= 4)
+        {
+            uint32_t m;
+            uint8_t *m_bytes = (uint8_t *) &m;
+
+            /*
+             * Only dst was aligned above; mask may sit at any address, so
+             * gather its four bytes individually (in memory order) rather
+             * than dereferencing it through a uint32_t pointer.
+             */
+            m_bytes[0] = mask[0];
+            m_bytes[1] = mask[1];
+            m_bytes[2] = mask[2];
+            m_bytes[3] = mask[3];
+
+            if (srca == 0xff && m == 0xffffffff)
+            {
+                /* Opaque source and fully set mask: store vmask. */
+                save_128_aligned (dst, vmask);
+            }
+            else if (m)
+            {
+                vmx_dst = load_128_aligned (dst);
+
+                vmx_mask = unpack_32_1x128 (m);
+                vmx_mask = unpacklo_128_16x8 (vmx_mask,
+                                              (vector unsigned int) AVV(0));
+
+                /* Unpacking */
+                unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0),
+                                  &vmx_dst_lo, &vmx_dst_hi);
+
+                unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0),
+                                  &vmx_mask_lo, &vmx_mask_hi);
+
+                expand_alpha_rev_2x128 (vmx_mask_lo, vmx_mask_hi,
+                                        &vmx_mask_lo, &vmx_mask_hi);
+
+                in_over_2x128 (&vsrc, &vsrc,
+                               &valpha, &valpha,
+                               &vmx_mask_lo, &vmx_mask_hi,
+                               &vmx_dst_lo, &vmx_dst_hi);
+
+                save_128_aligned (dst,
+                                  pack_2x128_128 (vmx_dst_lo, vmx_dst_hi));
+            }
+
+            w -= 4;
+            dst += 4;
+            mask += 4;
+        }
+
+        /* Tail: fewer than four pixels remain. */
+        while (w)
+        {
+            uint8_t m = *mask++;
+
+            if (m)
+            {
+                uint32_t d = *dst;
+
+                vmx_mask = expand_pixel_8_1x128 (m);
+                vmx_dst = unpack_32_1x128 (d);
+
+                *dst = pack_1x128_32 (in_over (vsrc,
+                                               valpha,
+                                               vmx_mask,
+                                               vmx_dst));
+            }
+
+            w--;
+            dst++;
+        }
+    }
+}
+
 static pixman_bool_t
 vmx_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
@@ -3061,6 +3210,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
-- 
2.4.3

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman