On 10.02.12 07:41, Dan Dennedy wrote:
2012/2/2 Maksym Veremeyenko <ve...@m1stereo.tv>:
Hi,

The attached patch performs line compositing for the SSE2+ARCH_X86_64 build.
It works for the case where luma is not defined...

Hi Maksym, I did some more testing and ran into a couple of image
quality problems. First, alpha blending seems poor, most noticeable
with text in a curvy typeface over video:

melt clip1.dv -filter dynamictext:Hello size=200 outline=2
olcolour=white family=elegante bgcolour=0x00000020

The first time you run that, you will see that the alpha of bgcolour
(black with 12.5% opacity: alpha byte 0x20 = 32, and 32/256 = 12.5%)
is not honored and the background is black. Set bgcolour=0 to make it
completely transparent, and look along the curved edges to see the
poor blending.

The second problem is that key-framing opacity causes a repeating
cycle of a 100% A frame, an A+B blend, and a 100% B frame. The command
below reproduces it:

melt color:red -track color:blue -transition composite out=99
geometry="0=0/0:100%x100%:0; 99=0/0:100%x100%:100"


I wrongly assumed the weight range was 0..255 - an updated patch is attached...
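
For the record, the updated patch passes weight >> 8 into the asm, on the
assumption that composite's weight arrives as a 16-bit value (0..65535);
treating it as 0..255 overflowed the 8-bit multiply and wrapped the
opacity, which matches the A / blend / B cycling you saw. A minimal sketch
of the scaling, under that assumption:

    #include <stdint.h>

    /* illustration only, not part of the patch: scale an 8-bit alpha
     * by an assumed 16-bit (0..65535) weight; 65535 >> 8 == 255, so
     * full weight keeps the alpha (nearly) unchanged */
    static inline uint8_t scale_alpha( uint8_t alpha, int weight )
    {
        return ( uint8_t )( ( alpha * ( weight >> 8 ) ) >> 8 );
    }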


--
________________________________________
Maksym Veremeyenko
From e8c8a1dde7883f203f609f364a27ea6c1a77104f Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1stereo.tv>
Date: Tue, 14 Feb 2012 13:34:12 +0200
Subject: [PATCH] use sse2 instruction for line compositing

---
 src/modules/core/composite_line_yuv_sse2_simple.c |  164 +++++++++++++++++++++
 src/modules/core/transition_composite.c           |   17 ++-
 2 files changed, 179 insertions(+), 2 deletions(-)
 create mode 100644 src/modules/core/composite_line_yuv_sse2_simple.c

diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
new file mode 100644
index 0000000..f202828
--- /dev/null
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -0,0 +1,164 @@
+    static const unsigned char const1[] =
+    {
+        0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00
+    };
+
+    __asm__ volatile
+    (
+        "pxor           %%xmm0, %%xmm0          \n\t"   /* clear zero register */
+        "movdqu         (%4), %%xmm9            \n\t"   /* load const1 */
+        "movd           %0, %%xmm1              \n\t"   /* load weight and decompose */
+        "movlhps        %%xmm1, %%xmm1          \n\t"
+        "pshuflw        $0, %%xmm1, %%xmm1      \n\t"
+        "pshufhw        $0, %%xmm1, %%xmm1      \n\t"
+
+        /*
+            xmm1 (weight)
+
+                    00  W 00  W 00  W 00  W 00  W 00  W 00  W 00  W
+        */
+        "loop_start:                            \n\t"
+        "movq           (%1), %%xmm2            \n\t"   /* load source alpha */
+        "punpcklbw      %%xmm0, %%xmm2          \n\t"   /* unpack alpha 8 8-bits alphas to 8 16-bits values */
+
+        /*
+            xmm2 (src alpha)
+            xmm3 (dst alpha)
+
+                    00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
+        */
+        "pmullw         %%xmm1, %%xmm2          \n\t"   /* premultiply source alpha */
+        "psrlw          $8, %%xmm2              \n\t"
+
+        /*
+            xmm2 (premultiplied)
+
+                    00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
+        */
+
+
+        /*
+            DSTa = DSTa + ((SRCa * (0xFF - DSTa)) >> 8)
+        */
+        "movq           (%5), %%xmm3            \n\t"   /* load dst alpha */
+        "punpcklbw      %%xmm0, %%xmm3          \n\t"   /* unpack dst 8 8-bits alphas to 8 16-bits values */
+        "movdqa         %%xmm9, %%xmm4          \n\t"
+        "psubw          %%xmm3, %%xmm4          \n\t"
+        "pmullw         %%xmm2, %%xmm4          \n\t"
+        "psrlw          $8, %%xmm4              \n\t"
+        "paddw          %%xmm4, %%xmm3          \n\t"
+        "packuswb       %%xmm0, %%xmm3          \n\t"
+        "movq           %%xmm3, (%5)            \n\t"   /* save dst alpha */
+
+        "movdqu         (%2), %%xmm3            \n\t"   /* load src */
+        "movdqu         (%3), %%xmm4            \n\t"   /* load dst */
+        "movdqa         %%xmm3, %%xmm5          \n\t"   /* dub src */
+        "movdqa         %%xmm4, %%xmm6          \n\t"   /* dub dst */
+
+        /*
+            xmm3 (src)
+            xmm4 (dst)
+            xmm5 (src)
+            xmm6 (dst)
+
+                    U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
+        */
+
+        "punpcklbw      %%xmm0, %%xmm5          \n\t"   /* unpack src low */
+        "punpcklbw      %%xmm0, %%xmm6          \n\t"   /* unpack dst low */
+        "punpckhbw      %%xmm0, %%xmm3          \n\t"   /* unpack src high */
+        "punpckhbw      %%xmm0, %%xmm4          \n\t"   /* unpack dst high */
+
+        /*
+            xmm5 (src_l)
+            xmm6 (dst_l)
+
+                    00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
+
+            xmm3 (src_u)
+            xmm4 (dst_u)
+
+                    00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
+        */
+
+        "movdqa         %%xmm2, %%xmm7          \n\t"   /* dub alpha */
+        "movdqa         %%xmm2, %%xmm8          \n\t"   /* dub alpha */
+        "movlhps        %%xmm7, %%xmm7          \n\t"   /* dub low */
+        "movhlps        %%xmm8, %%xmm8          \n\t"   /* dub high */
+
+        /*
+            xmm7 (src alpha)
+
+                    00 A4 00 A3 00 A2 00 A1 00 A4 00 A3 00 A2 00 A1
+            xmm8 (src alpha)
+
+                    00 A8 00 A7 00 A6 00 A5 00 A8 00 A7 00 A6 00 A5
+        */
+
+        "pshuflw        $0x50, %%xmm7, %%xmm7     \n\t"
+        "pshuflw        $0x50, %%xmm8, %%xmm8     \n\t"
+        "pshufhw        $0xFA, %%xmm7, %%xmm7     \n\t"
+        "pshufhw        $0xFA, %%xmm8, %%xmm8     \n\t"
+
+        /*
+            xmm7 (src alpha lower)
+
+                    00 A4 00 A4 00 A3 00 A3 00 A2 00 A2 00 A1 00 A1
+
+            xmm8 (src alpha upper)
+                    00 A8 00 A8 00 A7 00 A7 00 A6 00 A6 00 A5 00 A5
+        */
+
+
+        /*
+            DST = (SRC * ALPHA + DST * (0xFF - ALPHA)) >> 8
+                = (SRC * ALPHA + DST * 0xFF - DST * ALPHA) >> 8
+                = ((SRC - DST) * ALPHA + DST * 0xFF) >> 8
+
+        */
+        "psubw          %%xmm4, %%xmm3          \n\t"   /* src = src - dst */
+        "psubw          %%xmm6, %%xmm5          \n\t"
+        "pmullw         %%xmm8, %%xmm3          \n\t"   /* src = src * alpha */
+        "pmullw         %%xmm7, %%xmm5          \n\t"
+        "pmullw         %%xmm9, %%xmm4          \n\t"   /* dst = dst * 0xFF */
+        "pmullw         %%xmm9, %%xmm6          \n\t"
+        "paddw          %%xmm3, %%xmm4          \n\t"   /* dst = dst + src */
+        "paddw          %%xmm5, %%xmm6          \n\t"
+        "psrlw          $8, %%xmm4              \n\t"   /* dst = dst >> 8 */
+        "psrlw          $8, %%xmm6              \n\t"
+//        "pminsw         %%xmm9, %%xmm4          \n\t"   /* clamp values */
+//        "pminsw         %%xmm9, %%xmm6          \n\t"
+
+        /*
+            xmm6 (dst_l)
+
+                    00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
+
+            xmm4 (dst_u)
+
+                    00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
+        */
+        "packuswb       %%xmm4, %%xmm6          \n\t"
+
+        /*
+            xmm6 (dst)
+
+                    U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
+        */
+        "movdqu         %%xmm6, (%3)            \n\t"   /* store dst */
+
+        /*
+            increment pointers
+        */
+        "add            $0x08, %1               \n\t"
+        "add            $0x08, %5               \n\t"
+        "add            $0x10, %2               \n\t"
+        "add            $0x10, %3               \n\t"
+
+        "dec            %6                      \n\t"
+        "jnz            loop_start              \n\t"
+
+        :
+        : "r" (weight >> 8), "r" (alpha_b), "r" (src), "r" (dest), "r" (const1) , "r" (alpha_a), "r" (width / 8)
+        : "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9"
+    );
diff --git a/src/modules/core/transition_composite.c b/src/modules/core/transition_composite.c
index f7054ac..e5d2154 100644
--- a/src/modules/core/transition_composite.c
+++ b/src/modules/core/transition_composite.c
@@ -368,10 +368,23 @@ static inline uint8_t sample_mix( uint8_t dest, uint8_t src, int mix )
 
 static void composite_line_yuv( uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight, uint16_t *luma, int soft, uint32_t step )
 {
-	register int j;
+	register int j = 0;
 	register int mix;
 
-	for ( j = 0; j < width; j ++ )
+#if defined(USE_SSE) && defined(ARCH_X86_64)
+	if ( !luma && width > 7 )
+	{
+#include "composite_line_yuv_sse2_simple.c"
+		/* advance past the pixels consumed by the asm; it leaves the C pointers untouched */
+		j = width - width % 8;
+		dest += j * 2;
+		src += j * 2;
+		alpha_a += j;
+		alpha_b += j;
+	}
+#endif
+
+	for ( ; j < width; j ++ )
 	{
 		mix = calculate_mix( luma, j, soft, weight, *alpha_b ++, step );
 		*dest = sample_mix( *dest, *src++, mix );
-- 
1.7.7.6
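
For reference, a plain-C sketch of the per-pixel math the asm implements
(illustration only, not part of the patch). It follows the two formulas
from the code comments; 'a' is the 8-bit source alpha already scaled by
weight:

    #include <stdint.h>

    /* hypothetical scalar equivalent of one 2-byte pixel step */
    static inline void blend_pixel_yuv( uint8_t *dest, const uint8_t *src,
            uint8_t *alpha_a, unsigned a )
    {
        /* DSTa = DSTa + ((SRCa * (0xFF - DSTa)) >> 8) */
        *alpha_a = *alpha_a + ( ( a * ( 0xFF - *alpha_a ) ) >> 8 );

        /* DST = ((SRC - DST) * ALPHA + DST * 0xFF) >> 8, for both bytes */
        dest[ 0 ] = ( ( src[ 0 ] - dest[ 0 ] ) * (int) a + dest[ 0 ] * 0xFF ) >> 8;
        dest[ 1 ] = ( ( src[ 1 ] - dest[ 1 ] ) * (int) a + dest[ 1 ] * 0xFF ) >> 8;
    }

The SSE2 loop does this for 8 pixels per iteration, which is why
composite_line_yuv advances the pointers and lets the scalar loop handle
the width % 8 remainder.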
