On 03.02.15 07:03, Dan Dennedy wrote:
On Thu, Jan 29, 2015 at 8:14 AM, Maksym Veremeyenko <ve...@m1stereo.tv> wrote:

    Hi,

    the provided patchset is a second attempt to introduce the feature from the subject line.

    this patchset does two things:

    1. disable creating an alpha channel for the frame if it does not already exist

    2. implement 8 variants of the blending/compositing function:

    0 | dest_a == NULL | src_a == NULL | weight == 256 | blit
    1 | dest_a == NULL | src_a == NULL | weight != 256 | blend: with given alpha
    2 | dest_a == NULL | src_a != NULL | weight == 256 | blend: only src alpha
    3 | dest_a == NULL | src_a != NULL | weight != 256 | blend: premultiply src alpha
    4 | dest_a != NULL | src_a == NULL | weight == 256 | blit: blit and set dst alpha to FF
    5 | dest_a != NULL | src_a == NULL | weight != 256 | blend: with given alpha
    6 | dest_a != NULL | src_a != NULL | weight == 256 | blend: full blend without src alpha premultiply
    7 | dest_a != NULL | src_a != NULL | weight != 256 | blend: full (original version)
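
For quick reference, the three conditions above collapse into a 3-bit case index. A minimal sketch of the selection logic, matching the switch added in patch 3 below (blend_case is just an illustrative helper name, not a function from the patch):

    #include <stdint.h>

    /* weight is the value after the ">> 8" shift done in the wrapper */
    static int blend_case( const uint8_t *dest_a, const uint8_t *src_a, int weight )
    {
        return ( dest_a != NULL ? 4 : 0 )   /* bit 2: dest alpha present */
             + ( src_a  != NULL ? 2 : 0 )   /* bit 1: src alpha present  */
             + ( weight != 256  ? 1 : 0 );  /* bit 0: partial weight     */
    }

Cases 0 and 4 reduce to a plain memcpy() (case 4 additionally fills the dst alpha with 0xFF); the remaining values dispatch to the matching blend_caseN().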

    I found a significant performance boost after applying this patchset.

    Please review.


With the last patch applied, a couple of things in demo/ are segfaulting
in composite_line_yuv_sse2_simple.c:blend_case7(). See demo/mlt_news and
mlt_title_over_gfx.
I had a typo in the jnz instruction that made blend_case6 jump into the blend_case7 function.

Also, I want to change the name of mlt_frame_get_alpha_mask_nc() to
simply mlt_frame_get_alpha().
Done.

Please review the updated patches.

PS

There are other places where the returned value of mlt_frame_get_alpha_mask() is checked for NULL, for example:


filter_rescale.c:

    uint8_t *input = mlt_frame_get_alpha_mask( frame );

    if ( input != NULL )
    {
        ...


filter_avcolour_space.c:

    uint8_t *alpha = mlt_frame_get_alpha_mask( frame );
    mlt_properties_get_data( properties, "alpha", &alpha_size );

    if ( alpha && alpha_size >= len )

Should we use mlt_frame_get_alpha() there instead of mlt_frame_get_alpha_mask() in the next rework steps?
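
If so, the call sites would end up looking roughly like the following hypothetical rework of the filter_rescale.c fragment above (a sketch only, not part of this patchset):

    uint8_t *alpha = mlt_frame_get_alpha( frame );

    if ( alpha != NULL )
    {
        /* an alpha plane already exists: process it as before */
    }
    else
    {
        /* no alpha plane: skip the alpha path instead of forcing
           one to be created, which is the point of the new getter */
    }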

--
________________________________________
Maksym Veremeyenko
From af70771df0c1bccb98a7701772a4cbf3dde85c6f Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Tue, 3 Feb 2015 12:26:17 +0200
Subject: [PATCH 1/5] rename arguments indexes to literal names

---
 src/modules/core/composite_line_yuv_sse2_simple.c |   32 ++++++++++----------
 1 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
index ee9b31b..297e8ec 100644
--- a/src/modules/core/composite_line_yuv_sse2_simple.c
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -18,7 +18,7 @@
  */
 
 #include <inttypes.h>
-void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight)
+void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a, int weight)
 {
     const static unsigned char const1[] =
     {
@@ -32,9 +32,9 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
     __asm__ volatile
     (
         "pxor           %%xmm0, %%xmm0          \n\t"   /* clear zero register */
-        "movdqu         (%4), %%xmm9            \n\t"   /* load const1 */
-        "movdqu         (%7), %%xmm10           \n\t"   /* load const2 */
-        "movd           %0, %%xmm1              \n\t"   /* load weight and decompose */
+        "movdqu         (%[const1]), %%xmm9     \n\t"   /* load const1 */
+        "movdqu         (%[const2]), %%xmm10    \n\t"   /* load const2 */
+        "movd           %[weight], %%xmm1       \n\t"   /* load weight and decompose */
         "movlhps        %%xmm1, %%xmm1          \n\t"
         "pshuflw        $0, %%xmm1, %%xmm1      \n\t"
         "pshufhw        $0, %%xmm1, %%xmm1      \n\t"
@@ -45,7 +45,7 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
                     00  W 00  W 00  W 00  W 00  W 00  W 00  W 00  W
         */
         "loop_start:                            \n\t"
-        "movq           (%1), %%xmm2            \n\t"   /* load source alpha */
+        "movq           (%[src_a]), %%xmm2      \n\t"   /* load source alpha */
         "punpcklbw      %%xmm0, %%xmm2          \n\t"   /* unpack alpha 8 8-bits alphas to 8 16-bits values */
 
         /*
@@ -67,7 +67,7 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
         /*
             DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8
         */
-        "movq           (%5), %%xmm3            \n\t"   /* load dst alpha */
+        "movq           (%[dest_a]), %%xmm3     \n\t"   /* load dst alpha */
         "punpcklbw      %%xmm0, %%xmm3          \n\t"   /* unpack dst 8 8-bits alphas to 8 16-bits values */
         "movdqa         %%xmm9, %%xmm4          \n\t"
         "psubw          %%xmm3, %%xmm4          \n\t"
@@ -79,10 +79,10 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
         "psrlw          $8, %%xmm4              \n\t"
         "paddw          %%xmm4, %%xmm3          \n\t"
         "packuswb       %%xmm0, %%xmm3          \n\t"
-        "movq           %%xmm3, (%5)            \n\t"   /* save dst alpha */
+        "movq           %%xmm3, (%[dest_a])     \n\t"   /* save dst alpha */
 
-        "movdqu         (%2), %%xmm3            \n\t"   /* load src */
-        "movdqu         (%3), %%xmm4            \n\t"   /* load dst */
+        "movdqu         (%[src]), %%xmm3        \n\t"   /* load src */
+        "movdqu         (%[dest]), %%xmm4       \n\t"   /* load dst */
         "movdqa         %%xmm3, %%xmm5          \n\t"   /* dub src */
         "movdqa         %%xmm4, %%xmm6          \n\t"   /* dub dst */
 
@@ -184,21 +184,21 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
 
                     U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
         */
-        "movdqu         %%xmm6, (%3)            \n\t"   /* store dst */
+        "movdqu         %%xmm6, (%[dest])       \n\t"   /* store dst */
 
         /*
             increment pointers
         */
-        "add            $0x08, %1               \n\t"
-        "add            $0x08, %5               \n\t"
-        "add            $0x10, %2               \n\t"
-        "add            $0x10, %3               \n\t"
+        "add            $0x08, %[src_a]         \n\t"
+        "add            $0x08, %[dest_a]        \n\t"
+        "add            $0x10, %[src]           \n\t"
+        "add            $0x10, %[dest]          \n\t"
 
-        "dec            %6                      \n\t"
+        "dec            %[width]                \n\t"
         "jnz            loop_start              \n\t"
 
         :
-        : "r" (weight >> 8), "r" (alpha_b), "r" (src), "r" (dest), "r" (const1) , "r" (alpha_a), "r" (width / 8), "r" (const2)
+        : [weight] "r" (weight >> 8), [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
         //: "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9", "memory"
     );
 };
-- 
1.7.7.6

From 9b2fc879ebf6068088ae2809170554b90f3f18f1 Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Tue, 3 Feb 2015 12:33:23 +0200
Subject: [PATCH 2/5] refactor composite_line_yuv_sse2_simple

---
 src/modules/core/composite_line_yuv_sse2_simple.c |  252 ++++++++-------------
 1 files changed, 89 insertions(+), 163 deletions(-)

diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
index 297e8ec..9a097d6 100644
--- a/src/modules/core/composite_line_yuv_sse2_simple.c
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -18,185 +18,111 @@
  */
 
 #include <inttypes.h>
-void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a, int weight)
+#include <stdio.h>
+#include <string.h>
+
+const static unsigned char const1[] =
+{
+    0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00
+};
+const static unsigned char const2[] =
 {
-    const static unsigned char const1[] =
-    {
-        0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00
-    };
-    const static unsigned char const2[] =
-    {
-        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00
-    };
+    0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00
+};
 
-    __asm__ volatile
-    (
-        "pxor           %%xmm0, %%xmm0          \n\t"   /* clear zero register */
-        "movdqu         (%[const1]), %%xmm9     \n\t"   /* load const1 */
+#define LOAD_CONSTS     \
+        "pxor           %%xmm0, %%xmm0          \n\t"   /* clear zero register */       \
+        "movdqu         (%[const1]), %%xmm9     \n\t"   /* load const1 */               \
         "movdqu         (%[const2]), %%xmm10    \n\t"   /* load const2 */
-        "movd           %[weight], %%xmm1       \n\t"   /* load weight and decompose */
-        "movlhps        %%xmm1, %%xmm1          \n\t"
-        "pshuflw        $0, %%xmm1, %%xmm1      \n\t"
-        "pshufhw        $0, %%xmm1, %%xmm1      \n\t"
 
-        /*
-            xmm1 (weight)
+#define LOAD_WEIGHT     \
+        "movd           %[weight], %%xmm1       \n\t"   /* load weight and decompose */ \
+        "movlhps        %%xmm1, %%xmm1          \n\t"                                   \
+        "pshuflw        $0, %%xmm1, %%xmm1      \n\t"                                   \
+        "pshufhw        $0, %%xmm1, %%xmm1      \n\t"
 
-                    00  W 00  W 00  W 00  W 00  W 00  W 00  W 00  W
-        */
-        "loop_start:                            \n\t"
-        "movq           (%[src_a]), %%xmm2      \n\t"   /* load source alpha */
+#define LOAD_SRC_A      \
+        "movq           (%[src_a]), %%xmm2      \n\t"   /* load source alpha */         \
         "punpcklbw      %%xmm0, %%xmm2          \n\t"   /* unpack alpha 8 8-bits alphas to 8 16-bits values */
 
-        /*
-            xmm2 (src alpha)
-            xmm3 (dst alpha)
-
-                    00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
-        */
-        "pmullw         %%xmm1, %%xmm2          \n\t"   /* premultiply source alpha */
+#define SRC_A_PREMUL    \
+        "pmullw         %%xmm1, %%xmm2          \n\t"   /* premultiply source alpha */  \
         "psrlw          $8, %%xmm2              \n\t"
 
-        /*
-            xmm2 (premultiplied)
-
-                    00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
-        */
-
-
-        /*
-            DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8
-        */
-        "movq           (%[dest_a]), %%xmm3     \n\t"   /* load dst alpha */
-        "punpcklbw      %%xmm0, %%xmm3          \n\t"   /* unpack dst 8 8-bits alphas to 8 16-bits values */
-        "movdqa         %%xmm9, %%xmm4          \n\t"
-        "psubw          %%xmm3, %%xmm4          \n\t"
-        "pmullw         %%xmm2, %%xmm4          \n\t"
-        "movdqa         %%xmm4, %%xmm5          \n\t"
-        "psrlw          $8, %%xmm4              \n\t"
-        "paddw          %%xmm5, %%xmm4          \n\t"
-        "paddw          %%xmm10, %%xmm4         \n\t"
-        "psrlw          $8, %%xmm4              \n\t"
-        "paddw          %%xmm4, %%xmm3          \n\t"
-        "packuswb       %%xmm0, %%xmm3          \n\t"
+# define DST_A_CALC \
+        /* DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8  */ \
+        "movq           (%[dest_a]), %%xmm3     \n\t"   /* load dst alpha */    \
+        "punpcklbw      %%xmm0, %%xmm3          \n\t"   /* unpack dst 8 8-bits alphas to 8 16-bits values */    \
+        "movdqa         %%xmm9, %%xmm4          \n\t"   \
+        "psubw          %%xmm3, %%xmm4          \n\t"   \
+        "pmullw         %%xmm2, %%xmm4          \n\t"   \
+        "movdqa         %%xmm4, %%xmm5          \n\t"   \
+        "psrlw          $8, %%xmm4              \n\t"   \
+        "paddw          %%xmm5, %%xmm4          \n\t"   \
+        "paddw          %%xmm10, %%xmm4         \n\t"   \
+        "psrlw          $8, %%xmm4              \n\t"   \
+        "paddw          %%xmm4, %%xmm3          \n\t"   \
+        "packuswb       %%xmm0, %%xmm3          \n\t"   \
         "movq           %%xmm3, (%[dest_a])     \n\t"   /* save dst alpha */
 
-        "movdqu         (%[src]), %%xmm3        \n\t"   /* load src */
-        "movdqu         (%[dest]), %%xmm4       \n\t"   /* load dst */
-        "movdqa         %%xmm3, %%xmm5          \n\t"   /* dub src */
-        "movdqa         %%xmm4, %%xmm6          \n\t"   /* dub dst */
-
-        /*
-            xmm3 (src)
-            xmm4 (dst)
-            xmm5 (src)
-            xmm6 (dst)
-
-                    U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
-        */
-
-        "punpcklbw      %%xmm0, %%xmm5          \n\t"   /* unpack src low */
-        "punpcklbw      %%xmm0, %%xmm6          \n\t"   /* unpack dst low */
-        "punpckhbw      %%xmm0, %%xmm3          \n\t"   /* unpack src high */
-        "punpckhbw      %%xmm0, %%xmm4          \n\t"   /* unpack dst high */
-
-        /*
-            xmm5 (src_l)
-            xmm6 (dst_l)
-
-                    00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
-
-            xmm3 (src_u)
-            xmm4 (dst_u)
-
-                    00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
-        */
-
-        "movdqa         %%xmm2, %%xmm7          \n\t"   /* dub alpha */
-        "movdqa         %%xmm2, %%xmm8          \n\t"   /* dub alpha */
-        "movlhps        %%xmm7, %%xmm7          \n\t"   /* dub low */
-        "movhlps        %%xmm8, %%xmm8          \n\t"   /* dub high */
-
-        /*
-            xmm7 (src alpha)
-
-                    00 A4 00 A3 00 A2 00 A1 00 A4 00 A3 00 A2 00 A1
-            xmm8 (src alpha)
-
-                    00 A8 00 A7 00 A6 00 A5 00 A8 00 A7 00 A6 00 A5
-        */
-
-        "pshuflw        $0x50, %%xmm7, %%xmm7     \n\t"
-        "pshuflw        $0x50, %%xmm8, %%xmm8     \n\t"
-        "pshufhw        $0xFA, %%xmm7, %%xmm7     \n\t"
-        "pshufhw        $0xFA, %%xmm8, %%xmm8     \n\t"
-
-        /*
-            xmm7 (src alpha lower)
-
-                    00 A4 00 A4 00 A3 00 A3 00 A2 00 A2 00 A1 00 A1
-
-            xmm8 (src alpha upper)
-                    00 A8 00 A8 00 A7 00 A7 00 A6 00 A6 00 A5 00 A5
-        */
-
-
-        /*
-            DST = SRC * ALPHA + DST * (0xFF - ALPHA)
-                SRC * ALPHA + DST * 0xFF - DST * ALPHA
-                (SRC - DST) * ALPHA + DST * 0xFF
-
-        */
-        "psubw          %%xmm4, %%xmm3          \n\t"   /* src = src - dst */
-        "psubw          %%xmm6, %%xmm5          \n\t"
-        "pmullw         %%xmm8, %%xmm3          \n\t"   /* src = src * alpha */
-        "pmullw         %%xmm7, %%xmm5          \n\t"
-        "pmullw         %%xmm9, %%xmm4          \n\t"   /* dst = dst * 0xFF */
-        "pmullw         %%xmm9, %%xmm6          \n\t"
-        "paddw          %%xmm3, %%xmm4          \n\t"   /* dst = dst + src */
-        "paddw          %%xmm5, %%xmm6          \n\t"
-        "movdqa         %%xmm4, %%xmm3          \n\t"   /* dst = ((dst >> 8) + dst + 128) >> 8 */
-        "movdqa         %%xmm6, %%xmm5          \n\t"
-        "psrlw          $8, %%xmm4              \n\t"
-        "psrlw          $8, %%xmm6              \n\t"
-        "paddw          %%xmm3, %%xmm4          \n\t"
-        "paddw          %%xmm5, %%xmm6          \n\t"
-        "paddw          %%xmm10, %%xmm4         \n\t"
-        "paddw          %%xmm10, %%xmm6         \n\t"
-        "psrlw          $8, %%xmm4              \n\t"
-        "psrlw          $8, %%xmm6              \n\t"
-//        "pminsw         %%xmm9, %%xmm4          \n\t"   /* clamp values */
-//        "pminsw         %%xmm9, %%xmm6          \n\t"
-
-        /*
-            xmm6 (dst_l)
-
-                    00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
-
-            xmm4 (dst_u)
-
-                    00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
-        */
-        "packuswb       %%xmm4, %%xmm6          \n\t"
-
-        /*
-            xmm6 (dst)
-
-                    U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
-        */
+#define DST_PIX_CALC \
+        "movdqu         (%[src]), %%xmm3        \n\t"   /* load src */          \
+        "movdqu         (%[dest]), %%xmm4       \n\t"   /* load dst */          \
+        "movdqa         %%xmm3, %%xmm5          \n\t"   /* dub src */           \
+        "movdqa         %%xmm4, %%xmm6          \n\t"   /* dub dst */           \
+        "punpcklbw      %%xmm0, %%xmm5          \n\t"   /* unpack src low */    \
+        "punpcklbw      %%xmm0, %%xmm6          \n\t"   /* unpack dst low */    \
+        "punpckhbw      %%xmm0, %%xmm3          \n\t"   /* unpack src high */   \
+        "punpckhbw      %%xmm0, %%xmm4          \n\t"   /* unpack dst high */   \
+        "movdqa         %%xmm2, %%xmm7          \n\t"   /* dub alpha */         \
+        "movdqa         %%xmm2, %%xmm8          \n\t"   /* dub alpha */         \
+        "movlhps        %%xmm7, %%xmm7          \n\t"   /* dub low */           \
+        "movhlps        %%xmm8, %%xmm8          \n\t"   /* dub high */          \
+        "pshuflw        $0x50, %%xmm7, %%xmm7   \n\t"                           \
+        "pshuflw        $0x50, %%xmm8, %%xmm8   \n\t"                           \
+        "pshufhw        $0xFA, %%xmm7, %%xmm7   \n\t"                           \
+        "pshufhw        $0xFA, %%xmm8, %%xmm8   \n\t"                           \
+        "psubw          %%xmm4, %%xmm3          \n\t"   /* src = src - dst */   \
+        "psubw          %%xmm6, %%xmm5          \n\t"                           \
+        "pmullw         %%xmm8, %%xmm3          \n\t"   /* src = src * alpha */ \
+        "pmullw         %%xmm7, %%xmm5          \n\t"                           \
+        "pmullw         %%xmm9, %%xmm4          \n\t"   /* dst = dst * 0xFF */  \
+        "pmullw         %%xmm9, %%xmm6          \n\t"                           \
+        "paddw          %%xmm3, %%xmm4          \n\t"   /* dst = dst + src */   \
+        "paddw          %%xmm5, %%xmm6          \n\t"                           \
+        "movdqa         %%xmm4, %%xmm3          \n\t"   /* dst = ((dst >> 8) + dst + 128) >> 8 */ \
+        "movdqa         %%xmm6, %%xmm5          \n\t"                           \
+        "psrlw          $8, %%xmm4              \n\t"                           \
+        "psrlw          $8, %%xmm6              \n\t"                           \
+        "paddw          %%xmm3, %%xmm4          \n\t"                           \
+        "paddw          %%xmm5, %%xmm6          \n\t"                           \
+        "paddw          %%xmm10, %%xmm4         \n\t"                           \
+        "paddw          %%xmm10, %%xmm6         \n\t"                           \
+        "psrlw          $8, %%xmm4              \n\t"                           \
+        "psrlw          $8, %%xmm6              \n\t"                           \
+        "packuswb       %%xmm4, %%xmm6          \n\t"                           \
         "movdqu         %%xmm6, (%[dest])       \n\t"   /* store dst */
 
-        /*
-            increment pointers
-        */
+#define PIX_POINTER_INC \
+        "add            $0x10, %[src]           \n\t"   \
+        "add            $0x10, %[dest]          \n\t"   \
+        "dec            %[width]                \n\t"
+
+void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a, int weight)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        LOAD_WEIGHT
+        "loop_start:                            \n\t"
+        LOAD_SRC_A
+        SRC_A_PREMUL
+        DST_A_CALC
+        DST_PIX_CALC
         "add            $0x08, %[src_a]         \n\t"
         "add            $0x08, %[dest_a]        \n\t"
-        "add            $0x10, %[src]           \n\t"
-        "add            $0x10, %[dest]          \n\t"
-
-        "dec            %[width]                \n\t"
+        PIX_POINTER_INC
         "jnz            loop_start              \n\t"
-
         :
         : [weight] "r" (weight >> 8), [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
         //: "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9", "memory"
-- 
1.7.7.6

From 4380069332abb9e2067238844bfa7573b0565bdb Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Tue, 3 Feb 2015 12:36:54 +0200
Subject: [PATCH 3/5] implement 8 variants of compositing function

---
 src/modules/core/composite_line_yuv_sse2_simple.c |  138 ++++++++++++++++++++-
 1 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
index 9a097d6..85310e6 100644
--- a/src/modules/core/composite_line_yuv_sse2_simple.c
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -108,13 +108,13 @@ const static unsigned char const2[] =
         "add            $0x10, %[dest]          \n\t"   \
         "dec            %[width]                \n\t"
 
-void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a, int weight)
+static void blend_case7(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a, int weight)
 {
     __asm__ volatile
     (
         LOAD_CONSTS
         LOAD_WEIGHT
-        "loop_start:                            \n\t"
+        "loop_start7:                           \n\t"
         LOAD_SRC_A
         SRC_A_PREMUL
         DST_A_CALC
@@ -122,9 +122,137 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
         "add            $0x08, %[src_a]         \n\t"
         "add            $0x08, %[dest_a]        \n\t"
         PIX_POINTER_INC
-        "jnz            loop_start              \n\t"
+        "jnz            loop_start7             \n\t"
+        :
+        : [weight] "r" (weight), [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
+    );
+};
+
+//  | 3   | dest_a == NULL | src_a != NULL | weight != 256 | blend: premultiply src alpha
+static void blend_case3(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, int weight)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        LOAD_WEIGHT
+        "loop_start3:                           \n\t"
+        LOAD_SRC_A
+        SRC_A_PREMUL
+        DST_PIX_CALC
+        "add            $0x08, %[src_a]         \n\t"
+        PIX_POINTER_INC
+        "jnz            loop_start3             \n\t"
+        :
+        : [weight] "r" (weight), [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1), [width] "r" (width / 8), [const2] "r" (const2)
+    );
+};
+
+//  | 2   | dest_a == NULL | src_a != NULL | weight == 255 | blend: only src alpha
+static void blend_case2(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        "loop_start2:                           \n\t"
+        LOAD_SRC_A
+        DST_PIX_CALC
+        "add            $0x08, %[src_a]         \n\t"
+        PIX_POINTER_INC
+        "jnz            loop_start2             \n\t"
         :
-        : [weight] "r" (weight >> 8), [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
-        //: "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9", "memory"
+        : [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [width] "r" (width / 8), [const2] "r" (const2)
     );
 };
+
+
+//  | 1   | dest_a == NULL | src_a == NULL | weight != 256 | blend: with given alpha
+static void blend_case1(uint8_t *dest, uint8_t *src, int width, int weight)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        LOAD_WEIGHT
+        "loop_start1:                           \n\t"
+        "movdqa         %%xmm1, %%xmm2          \n\t"   /* src alpha cames from weight */
+        DST_PIX_CALC
+        PIX_POINTER_INC
+        "jnz            loop_start1             \n\t"
+        :
+        : [weight] "r" (weight), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1), [width] "r" (width / 8), [const2] "r" (const2)
+    );
+};
+
+//  | 5   | dest_a != NULL | src_a == NULL | weight != 256 | blend: with given alpha
+static void blend_case5(uint8_t *dest, uint8_t *src, int width, uint8_t *dest_a, int weight)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        LOAD_WEIGHT
+        "loop_start5:                           \n\t"
+        "movdqa         %%xmm1, %%xmm2          \n\t"   /* source alpha comes from weight */
+        DST_A_CALC
+        DST_PIX_CALC
+        "add            $0x08, %[dest_a]        \n\t"
+        PIX_POINTER_INC
+        "jnz            loop_start5             \n\t"
+
+        :
+        : [weight] "r" (weight), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
+    );
+};
+
+//  | 6   | dest_a != NULL | src_a != NULL | weight == 256 | blend: full blend without src alpha premutiply
+static void blend_case6(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        "loop_start6:                           \n\t"
+        LOAD_SRC_A
+        DST_A_CALC
+        DST_PIX_CALC
+        "add            $0x08, %[src_a]         \n\t"
+        "add            $0x08, %[dest_a]        \n\t"
+        PIX_POINTER_INC
+        "jnz            loop_start6             \n\t"
+        :
+        : [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
+    );
+};
+
+
+void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a, int weight)
+{
+    weight >>= 8;
+
+    /*
+        | 0   | dest_a == NULL | src_a == NULL | weight == 256 | blit
+        | 1   | dest_a == NULL | src_a == NULL | weight != 256 | blend: with given alpha
+        | 2   | dest_a == NULL | src_a != NULL | weight == 256 | blend: only src alpha
+        | 3   | dest_a == NULL | src_a != NULL | weight != 256 | blend: premultiply src alpha
+        | 4   | dest_a != NULL | src_a == NULL | weight == 256 | blit: blit and set dst alpha to FF
+        | 5   | dest_a != NULL | src_a == NULL | weight != 256 | blend: with given alpha
+        | 6   | dest_a != NULL | src_a != NULL | weight == 256 | blend: full blend without src alpha premutiply
+        | 7   | dest_a != NULL | src_a != NULL | weight != 256 | blend: full (origin version)
+    */
+
+    int cond = ((dest_a != NULL)?4:0) + ((src_a != NULL)?2:0) + ((weight != 256)?1:0);
+
+    switch(cond)
+    {
+        case 0:
+            memcpy(dest, src, 2 * width);
+            break;
+        case 1: blend_case1(dest, src, width, weight); break;
+        case 2: blend_case2(dest, src, width, src_a); break;
+        case 3: blend_case3(dest, src, width, src_a, weight); break;
+        case 4:
+            memcpy(dest, src, 2 * width);
+            memset(dest_a, 0xFF, width);
+            break;
+        case 5: blend_case5(dest, src, width, dest_a, weight); break;
+        case 6: blend_case6(dest, src, width, src_a, dest_a); break;
+        case 7: blend_case7(dest, src, width, src_a, dest_a, weight); break;
+    };
+};
-- 
1.7.7.6

From bf84fdc387f2ad0e586b9b42602636207c937610 Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Tue, 3 Feb 2015 12:39:07 +0200
Subject: [PATCH 4/5] implement mlt_frame_get_alpha

---
 src/framework/mlt.vers    |    1 +
 src/framework/mlt_frame.c |   20 ++++++++++++++++++++
 src/framework/mlt_frame.h |    1 +
 3 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/src/framework/mlt.vers b/src/framework/mlt.vers
index 34254f4..82e7f7c 100644
--- a/src/framework/mlt.vers
+++ b/src/framework/mlt.vers
@@ -458,4 +458,5 @@ MLT_0.9.2 {
 
 MLT_0.9.4 {
     mlt_pool_stat;
+    mlt_frame_get_alpha;
 } MLT_0.9.2;
diff --git a/src/framework/mlt_frame.c b/src/framework/mlt_frame.c
index a762969..5fc469e 100644
--- a/src/framework/mlt_frame.c
+++ b/src/framework/mlt_frame.c
@@ -637,6 +637,26 @@ uint8_t *mlt_frame_get_alpha_mask( mlt_frame self )
 	return alpha;
 }
 
+/** Get the alpha channel associated to the frame (without creating if it has not).
+ *
+ * \public \memberof mlt_frame_s
+ * \param self a frame
+ * \return the alpha channel
+ */
+
+uint8_t *mlt_frame_get_alpha( mlt_frame self )
+{
+	uint8_t *alpha = NULL;
+	if ( self != NULL )
+	{
+		if ( self->get_alpha_mask != NULL )
+			alpha = self->get_alpha_mask( self );
+		if ( alpha == NULL )
+			alpha = mlt_properties_get_data( &self->parent, "alpha", NULL );
+	}
+	return alpha;
+}
+
 /** Get the short name for an audio format.
  *
  * You do not need to deallocate the returned string.
diff --git a/src/framework/mlt_frame.h b/src/framework/mlt_frame.h
index ff832d5..2b16c4a 100644
--- a/src/framework/mlt_frame.h
+++ b/src/framework/mlt_frame.h
@@ -122,6 +122,7 @@ extern int mlt_frame_set_alpha( mlt_frame self, uint8_t *alpha, int size, mlt_de
 extern void mlt_frame_replace_image( mlt_frame self, uint8_t *image, mlt_image_format format, int width, int height );
 extern int mlt_frame_get_image( mlt_frame self, uint8_t **buffer, mlt_image_format *format, int *width, int *height, int writable );
 extern uint8_t *mlt_frame_get_alpha_mask( mlt_frame self );
+extern uint8_t *mlt_frame_get_alpha( mlt_frame self );
 extern int mlt_frame_get_audio( mlt_frame self, void **buffer, mlt_audio_format *format, int *frequency, int *channels, int *samples );
 extern int mlt_frame_set_audio( mlt_frame self, void *buffer, mlt_audio_format, int size, mlt_destructor );
 extern unsigned char *mlt_frame_get_waveform( mlt_frame self, int w, int h );
-- 
1.7.7.6

From 0755c5f6d90abee17f10d42e3d630231bad0771b Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Tue, 3 Feb 2015 13:33:32 +0200
Subject: [PATCH 5/5] use mlt_frame_get_alpha instead of original and allow
 alpha channels to be NULL

---
 src/framework/mlt_tractor.c             |    9 ++++--
 src/modules/core/filter_resize.c        |    2 +-
 src/modules/core/transition_composite.c |   46 +++++++++++++++++++-----------
 src/modules/gtk2/producer_pango.c       |    2 +-
 src/modules/gtk2/producer_pixbuf.c      |    2 +-
 5 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/src/framework/mlt_tractor.c b/src/framework/mlt_tractor.c
index 1691875..67cb78f 100644
--- a/src/framework/mlt_tractor.c
+++ b/src/framework/mlt_tractor.c
@@ -285,9 +285,12 @@ static int producer_get_image( mlt_frame self, uint8_t **buffer, mlt_image_forma
 	mlt_properties_set_data( properties, "movit.convert.fence",
 		mlt_properties_get_data( frame_properties, "movit.convert.fence", NULL ),
 		0, NULL, NULL );
-	data = mlt_frame_get_alpha_mask( frame );
-	mlt_properties_get_data( frame_properties, "alpha", &size );
-	mlt_frame_set_alpha( self, data, size, NULL );
+	data = mlt_frame_get_alpha( frame );
+	if ( data )
+	{
+		mlt_properties_get_data( frame_properties, "alpha", &size );
+		mlt_frame_set_alpha( self, data, size, NULL );
+	};
 	self->convert_image = frame->convert_image;
 	self->convert_audio = frame->convert_audio;
 	return 0;
diff --git a/src/modules/core/filter_resize.c b/src/modules/core/filter_resize.c
index 1ec7e33..bddd25c 100644
--- a/src/modules/core/filter_resize.c
+++ b/src/modules/core/filter_resize.c
@@ -129,7 +129,7 @@ static uint8_t *frame_resize_image( mlt_frame frame, int owidth, int oheight, in
 
 	// Get the input image, width and height
 	uint8_t *input = mlt_properties_get_data( properties, "image", NULL );
-	uint8_t *alpha = mlt_frame_get_alpha_mask( frame );
+	uint8_t *alpha = mlt_frame_get_alpha( frame );
 	int alpha_size = 0;
 	mlt_properties_get_data( properties, "alpha", &alpha_size );
 
diff --git a/src/modules/core/transition_composite.c b/src/modules/core/transition_composite.c
index 1385401..18eb4a4 100644
--- a/src/modules/core/transition_composite.c
+++ b/src/modules/core/transition_composite.c
@@ -380,20 +380,25 @@ void composite_line_yuv( uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_
 		j = width - width % 8;
 		dest += j * 2;
 		src += j * 2;
-		alpha_a += j;
-		alpha_b += j;
+		if ( alpha_a )
+			alpha_a += j;
+		if ( alpha_b )
+			alpha_b += j;
 	}
 #endif
 
 	for ( ; j < width; j ++ )
 	{
-		mix = calculate_mix( luma, j, soft, weight, *alpha_b ++, step );
+		mix = calculate_mix( luma, j, soft, weight, (!alpha_b)?0x255:(*alpha_b ++), step );
 		*dest = sample_mix( *dest, *src++, mix );
 		dest++;
 		*dest = sample_mix( *dest, *src++, mix );
 		dest++;
-		*alpha_a = ( mix >> 8 ) | *alpha_a;
-		alpha_a ++;
+		if ( alpha_a )
+		{
+			*alpha_a = ( mix >> 8 ) | *alpha_a;
+			alpha_a ++;
+		};
 	}
 }
 
@@ -523,8 +528,10 @@ static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, uint
 	p_dest += x * bpp + y * stride_dest;
 
 	// offset pointer into alpha channel based upon cropping
-	alpha_b += x_src + y_src * stride_src / bpp;
-	alpha_a += x + y * stride_dest / bpp;
+	if ( alpha_b )
+		alpha_b += x_src + y_src * stride_src / bpp;
+	if ( alpha_a )
+		alpha_a += x + y * stride_dest / bpp;
 
 	// offset pointer into luma channel based upon cropping
 	if ( p_luma )
@@ -546,8 +553,10 @@ static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, uint
 	if ( field == 1 )
 	{
 		p_src += stride_src;
-		alpha_b += stride_src / bpp;
-		alpha_a += stride_dest / bpp;
+		if ( alpha_b )
+			alpha_b += stride_src / bpp;
+		if ( alpha_a )
+			alpha_a += stride_dest / bpp;
 		height_src--;
 	}
 
@@ -560,7 +569,8 @@ static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, uint
 	if ( uneven_x != uneven_x_src )
 	{
 		p_src += 2;
-		alpha_b += 1;
+		if ( alpha_b )
+			alpha_b += 1;
 	}
 
 	// now do the compositing only to cropped extents
@@ -570,8 +580,10 @@ static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, uint
 
 		p_src += stride_src;
 		p_dest += stride_dest;
-		alpha_b += alpha_b_stride;
-		alpha_a += alpha_a_stride;
+		if ( alpha_b )
+			alpha_b += alpha_b_stride;
+		if ( alpha_a )
+			alpha_a += alpha_a_stride;
 		if ( p_luma )
 			p_luma += alpha_b_stride;
 	}
@@ -1176,13 +1188,13 @@ static int transition_get_image( mlt_frame a_frame, uint8_t **image, mlt_image_f
 		{
 			double aspect_ratio = mlt_frame_get_aspect_ratio( b_frame );
 			get_b_frame_image( self, b_frame, &image_b, &width_b, &height_b, &result );
-			alpha_b = mlt_frame_get_alpha_mask( b_frame );
+			alpha_b = mlt_frame_get_alpha( b_frame );
 			mlt_properties_set_double( a_props, "aspect_ratio", aspect_ratio );
 		}
 
 		// Get the image from the a frame
 		mlt_frame_get_image( a_frame, invert ? &image_b : image, format, width, height, 1 );
-		alpha_a = mlt_frame_get_alpha_mask( a_frame );
+		alpha_a = mlt_frame_get_alpha( a_frame );
 
 		// Optimisation - no compositing required
 		if ( result.item.mix == 0 || ( result.item.w == 0 && result.item.h == 0 ) )
@@ -1225,7 +1237,7 @@ static int transition_get_image( mlt_frame a_frame, uint8_t **image, mlt_image_f
 			mlt_service_unlock( MLT_TRANSITION_SERVICE( self ) );
 			char *operator = mlt_properties_get( properties, "operator" );
 
-			alpha_b = alpha_b == NULL ? mlt_frame_get_alpha_mask( b_frame ) : alpha_b;
+			alpha_b = alpha_b == NULL ? mlt_frame_get_alpha( b_frame ) : alpha_b;
 
 			composite_line_fn line_fn = composite_line_yuv;
 
@@ -1241,10 +1253,10 @@ static int transition_get_image( mlt_frame a_frame, uint8_t **image, mlt_image_f
 			}
 
 			// Allow the user to completely obliterate the alpha channels from both frames
-			if ( mlt_properties_get( properties, "alpha_a" ) )
+			if ( mlt_properties_get( properties, "alpha_a" ) && alpha_a )
 				memset( alpha_a, mlt_properties_get_int( properties, "alpha_a" ), *width * *height );
 
-			if ( mlt_properties_get( properties, "alpha_b" ) )
+			if ( mlt_properties_get( properties, "alpha_b" ) && alpha_b )
 				memset( alpha_b, mlt_properties_get_int( properties, "alpha_b" ), width_b * height_b );
 
 			for ( field = 0; field < ( progressive ? 1 : 2 ); field++ )
diff --git a/src/modules/gtk2/producer_pango.c b/src/modules/gtk2/producer_pango.c
index 9163a49..48d5bc9 100644
--- a/src/modules/gtk2/producer_pango.c
+++ b/src/modules/gtk2/producer_pango.c
@@ -608,7 +608,7 @@ static int producer_get_image( mlt_frame frame, uint8_t **buffer, mlt_image_form
 			cached->image = mlt_pool_alloc( size );
 			memcpy( cached->image, buf, size );
 
-			if ( ( buf = mlt_frame_get_alpha_mask( frame ) ) )
+			if ( ( buf = mlt_frame_get_alpha( frame ) ) )
 			{
 				size = cached->width * cached->height;
 				cached->alpha = mlt_pool_alloc( size );
diff --git a/src/modules/gtk2/producer_pixbuf.c b/src/modules/gtk2/producer_pixbuf.c
index abfa9ab..2e48470 100644
--- a/src/modules/gtk2/producer_pixbuf.c
+++ b/src/modules/gtk2/producer_pixbuf.c
@@ -509,7 +509,7 @@ static void refresh_image( producer_pixbuf self, mlt_frame frame, mlt_image_form
 				self->image = mlt_pool_alloc( image_size );
 				memcpy( self->image, buffer, image_size );
 			}
-			if ( ( buffer = mlt_frame_get_alpha_mask( frame ) ) )
+			if ( ( buffer = mlt_frame_get_alpha( frame ) ) )
 			{
 				self->alpha = mlt_pool_alloc( width * height );
 				memcpy( self->alpha, buffer, width * height );
-- 
1.7.7.6
