POWER8, 8 cores, 3.4GHz, RHEL 7.1 ppc64le.

reference memcpy speed = 24764.8MB/s (6191.2MP/s for 32bpp fills)

                Before           After           Change
              ---------------------------------------------
L1              90.21           133.21           +47.67%
L2              94.91           132.95           +40.08%
M               95.49           132.53           +38.79%
HT              88.07           100.43           +14.03%
VT              86.65           112.45           +29.77%
R               82.77           96.25            +16.29%
RT              65.64           55.14            -16.00%
Kops/s          673             580              -13.82%

cairo trimmed benchmarks:

Speedups
========
t-firefox-asteroids     533.92  -> 495.51  :  1.08x

Slowdowns
=========
t-poppler               364.99  -> 393.72  :  1.08x
t-firefox-canvas-alpha  984.55  -> 1197.85 :  1.22x

Signed-off-by: Oded Gabbay <oded.gab...@gmail.com>
---
 pixman/pixman-vmx.c | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 966219f..5c74a47 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2557,6 +2557,128 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
     }
 }
 
+static void /* VMX fast path: solid source OVER a 32-bit destination through an 8-bit mask */
+vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info); /* presumably declares the image/coordinate/size locals used below -- confirm */
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m, d; /* m: four mask bytes read at once; d: one destination pixel */
+
+    vector unsigned int vsrc, valpha, vmask;
+
+    vector unsigned int vmx_dst, vmx_dst_lo, vmx_dst_hi;
+    vector unsigned int vmx_mask, vmx_mask_lo, vmx_mask_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); /* presumably the solid colour in the destination's format */
+
+    srca = src >> 24; /* top 8 bits of src; compared against 0xff below as the source alpha */
+    if (src == 0) /* all-zero source: return without touching the destination */
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); /* presumably sets dst_line/dst_stride for the first row -- confirm */
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); /* same for the mask, addressed as bytes */
+
+    vmask = create_mask_1x32_128 (&src); /* stored straight to dst in the fully opaque case below; presumably src in every lane */
+    vsrc = expand_pixel_32_1x128 (src); /* presumably src widened to the unpacked form in_over expects */
+    valpha = expand_alpha_1x128 (vsrc); /* presumably the alpha of vsrc replicated across channels */
+
+    while (height--) /* one iteration per row */
+    {
+       dst = dst_line;
+       dst_line += dst_stride;
+       mask = mask_line;
+       mask_line += mask_stride;
+       w = width;
+
+       while (w && (uintptr_t)dst & 15) /* one pixel at a time until dst is 16-byte aligned or the row ends */
+       {
+           uint8_t m = *mask++; /* shadows the outer uint32_t m */
+
+           if (m) /* a zero mask byte leaves the destination pixel unchanged */
+           {
+               d = *dst;
+               vmx_mask = expand_pixel_8_1x128 (m);
+               vmx_dst = unpack_32_1x128 (d);
+
+               *dst = pack_1x128_32 (in_over (vsrc,
+                                              valpha,
+                                              vmx_mask,
+                                              vmx_dst));
+           }
+
+           w--;
+           dst++;
+       }
+
+       while (w >= 4) /* four pixels per iteration; dst is 16-byte aligned whenever this loop runs */
+       {
+           m = *((uint32_t*)mask); /* four mask bytes in one 32-bit read; NOTE(review): only dst was aligned above, mask may be unaligned */
+
+           if (srca == 0xff && m == 0xffffffff) /* opaque source and all four mask bytes are 0xff */
+           {
+               save_128_aligned(dst, vmask); /* overwrite four pixels without reading dst */
+           }
+           else if (m) /* at least one non-zero mask byte; an all-zero m skips the store entirely */
+           {
+               vmx_dst = load_128_aligned (dst);
+
+               vmx_mask = unpack_32_1x128 (m);
+               vmx_mask = unpacklo_128_16x8 (vmx_mask,
+                                           (vector unsigned int) AVV(0)); /* presumably interleaves the mask with zeros to widen it -- confirm */
+
+               /* Unpacking */
+               unpack_128_2x128 (vmx_dst, (vector unsigned int) AVV(0),
+                                   &vmx_dst_lo, &vmx_dst_hi);
+
+               unpack_128_2x128 (vmx_mask, (vector unsigned int) AVV(0),
+                                   &vmx_mask_lo, &vmx_mask_hi);
+
+               expand_alpha_rev_2x128 (vmx_mask_lo, vmx_mask_hi,
+                                       &vmx_mask_lo, &vmx_mask_hi); /* in-place: outputs overwrite the inputs */
+
+               in_over_2x128 (&vsrc, &vsrc,
+                              &valpha, &valpha,
+                              &vmx_mask_lo, &vmx_mask_hi,
+                              &vmx_dst_lo, &vmx_dst_hi); /* same vsrc/valpha for both halves; result is read back from vmx_dst_lo/hi */
+
+               save_128_aligned(dst, pack_2x128_128 (vmx_dst_lo, vmx_dst_hi)); /* repack both halves and store four pixels */
+           }
+
+           w -= 4;
+           dst += 4;
+           mask += 4;
+       }
+
+       while (w) /* remaining 0-3 pixels of the row, same per-pixel path as the head loop */
+       {
+           uint8_t m = *mask++; /* shadows the outer uint32_t m */
+
+           if (m)
+           {
+               d = *dst;
+               vmx_mask = expand_pixel_8_1x128 (m);
+               vmx_dst = unpack_32_1x128 (d);
+
+               *dst = pack_1x128_32 (in_over (vsrc,
+                                              valpha,
+                                              vmx_mask,
+                                              vmx_dst));
+           }
+
+           w--;
+           dst++;
+       }
+    }
+
+}
+
 static pixman_bool_t
 vmx_fill (pixman_implementation_t *imp,
            uint32_t *               bits,
@@ -3061,6 +3183,10 @@ static const pixman_fast_path_t vmx_fast_paths[] =
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
-- 
2.4.3

_______________________________________________
Pixman mailing list
Pixman@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/pixman

Reply via email to