10.02.12 07:41, Dan Dennedy написав(ла):
2012/2/2 Maksym Veremeyenko<ve...@m1stereo.tv>:
Hi,
the attached patch performs line compositing for the SSE2+ARCH_X86_64 build. It works
for the case where luma is not defined...
Hi Maksym, did some more testing and ran into a couple of image
quality problems. First, alpha blending seems poor, mostly noticeable
with text in a curvy typeface over video:
melt clip1.dv -filter dynamictext:Hello size=200 outline=2
olcolour=white family=elegante bgcolour=0x00000020
The first time you run that you will see that the alpha of bgcolour
(black with 12.5% opacity) is not honored and the background is black.
Set bgcolour=0 to make it completely transparent and look along curved
edges to see the poor blending.
The second problem is that key-framing opacity causes a repeating
cycle of 100% A frame, A+B blended, and 100% B frame. The below
reproduces it:
melt color:red -track color:blue -transition composite out=99
geometry="0=0/0:100%x100%:0; 99=0/0:100%x100%:100"
I wrongly assumed the weight range was 0..255 — an updated patch is attached...
--
________________________________________
Maksym Veremeyenko
>From e8c8a1dde7883f203f609f364a27ea6c1a77104f Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1stereo.tv>
Date: Tue, 14 Feb 2012 13:34:12 +0200
Subject: [PATCH] use sse2 instruction for line compositing
---
src/modules/core/composite_line_yuv_sse2_simple.c | 164 +++++++++++++++++++++
src/modules/core/transition_composite.c | 12 ++-
2 files changed, 174 insertions(+), 2 deletions(-)
create mode 100644 src/modules/core/composite_line_yuv_sse2_simple.c
diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
new file mode 100644
index 0000000..f202828
--- /dev/null
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -0,0 +1,164 @@
+ /*
+  * SSE2 fast path for composite_line_yuv(). This file is #included into
+  * the function body, so dest, src, alpha_b, alpha_a, weight and width
+  * are in scope. It blends 8 YUV422 pixels (16 bytes) per iteration for
+  * width / 8 iterations; the caller composites the remaining
+  * (width % 8) pixels in plain C.
+  */
+ static const unsigned char const1[] =
+ {
+     0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00
+ };
+
+ /*
+  * The asm advances the pointers and decrements the counter, so it must
+  * receive them as read-write ("+r") operands on local copies: writing
+  * to an input-only ("r") operand is undefined behaviour in GCC
+  * extended asm and miscompiles under optimization.
+  */
+ uint8_t *alpha_b_ptr = alpha_b;
+ uint8_t *alpha_a_ptr = alpha_a;
+ uint8_t *src_ptr = src;
+ uint8_t *dest_ptr = dest;
+ int count = width / 8;
+
+ __asm__ volatile
+ (
+     "pxor %%xmm0, %%xmm0 \n\t" /* clear zero register */
+     "movdqu (%6), %%xmm9 \n\t" /* load const1: eight 16-bit 0x00FF words */
+     "movd %5, %%xmm1 \n\t" /* load weight and broadcast to all 8 words */
+     "movlhps %%xmm1, %%xmm1 \n\t"
+     "pshuflw $0, %%xmm1, %%xmm1 \n\t"
+     "pshufhw $0, %%xmm1, %%xmm1 \n\t"
+
+     /*
+         xmm1 (weight)
+
+         00 W 00 W 00 W 00 W 00 W 00 W 00 W 00 W
+     */
+
+     /* "1:" is an assembler local label: unlike the former "loop_start:"
+        it cannot collide if the compiler emits this block more than once
+        (e.g. when the enclosing function is inlined or cloned). */
+     "1: \n\t"
+     "movq (%0), %%xmm2 \n\t" /* load source alpha */
+     "punpcklbw %%xmm0, %%xmm2 \n\t" /* unpack 8 8-bit alphas to 8 16-bit values */
+
+     /*
+         xmm2 (src alpha)
+
+         00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
+     */
+     "pmullw %%xmm1, %%xmm2 \n\t" /* premultiply source alpha by weight */
+     "psrlw $8, %%xmm2 \n\t"
+
+     /*
+         xmm2 (premultiplied)
+
+         00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
+     */
+
+     /*
+         DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8
+     */
+     "movq (%3), %%xmm3 \n\t" /* load dst alpha */
+     "punpcklbw %%xmm0, %%xmm3 \n\t" /* unpack 8 8-bit alphas to 8 16-bit values */
+     "movdqa %%xmm9, %%xmm4 \n\t"
+     "psubw %%xmm3, %%xmm4 \n\t"
+     "pmullw %%xmm2, %%xmm4 \n\t"
+     "psrlw $8, %%xmm4 \n\t"
+     "paddw %%xmm4, %%xmm3 \n\t"
+     "packuswb %%xmm0, %%xmm3 \n\t"
+     "movq %%xmm3, (%3) \n\t" /* save dst alpha */
+
+     "movdqu (%1), %%xmm3 \n\t" /* load src */
+     "movdqu (%2), %%xmm4 \n\t" /* load dst */
+     "movdqa %%xmm3, %%xmm5 \n\t" /* dup src */
+     "movdqa %%xmm4, %%xmm6 \n\t" /* dup dst */
+
+     /*
+         xmm3 (src), xmm4 (dst), xmm5 (src), xmm6 (dst)
+
+         U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
+     */
+
+     "punpcklbw %%xmm0, %%xmm5 \n\t" /* unpack src low */
+     "punpcklbw %%xmm0, %%xmm6 \n\t" /* unpack dst low */
+     "punpckhbw %%xmm0, %%xmm3 \n\t" /* unpack src high */
+     "punpckhbw %%xmm0, %%xmm4 \n\t" /* unpack dst high */
+
+     /*
+         xmm5 (src_l), xmm6 (dst_l)
+
+         00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
+
+         xmm3 (src_u), xmm4 (dst_u)
+
+         00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
+     */
+
+     "movdqa %%xmm2, %%xmm7 \n\t" /* dup alpha */
+     "movdqa %%xmm2, %%xmm8 \n\t" /* dup alpha */
+     "movlhps %%xmm7, %%xmm7 \n\t" /* dup low */
+     "movhlps %%xmm8, %%xmm8 \n\t" /* dup high */
+
+     /*
+         xmm7 (src alpha)
+
+         00 A4 00 A3 00 A2 00 A1 00 A4 00 A3 00 A2 00 A1
+
+         xmm8 (src alpha)
+
+         00 A8 00 A7 00 A6 00 A5 00 A8 00 A7 00 A6 00 A5
+     */
+
+     "pshuflw $0x50, %%xmm7, %%xmm7 \n\t"
+     "pshuflw $0x50, %%xmm8, %%xmm8 \n\t"
+     "pshufhw $0xFA, %%xmm7, %%xmm7 \n\t"
+     "pshufhw $0xFA, %%xmm8, %%xmm8 \n\t"
+
+     /*
+         xmm7 (src alpha lower)
+
+         00 A4 00 A4 00 A3 00 A3 00 A2 00 A2 00 A1 00 A1
+
+         xmm8 (src alpha upper)
+
+         00 A8 00 A8 00 A7 00 A7 00 A6 00 A6 00 A5 00 A5
+     */
+
+     /*
+         DST = SRC * ALPHA + DST * (0xFF - ALPHA)
+             = SRC * ALPHA + DST * 0xFF - DST * ALPHA
+             = (SRC - DST) * ALPHA + DST * 0xFF
+     */
+     "psubw %%xmm4, %%xmm3 \n\t" /* src = src - dst */
+     "psubw %%xmm6, %%xmm5 \n\t"
+     "pmullw %%xmm8, %%xmm3 \n\t" /* src = src * alpha */
+     "pmullw %%xmm7, %%xmm5 \n\t"
+     "pmullw %%xmm9, %%xmm4 \n\t" /* dst = dst * 0xFF */
+     "pmullw %%xmm9, %%xmm6 \n\t"
+     "paddw %%xmm3, %%xmm4 \n\t" /* dst = dst + src */
+     "paddw %%xmm5, %%xmm6 \n\t"
+     "psrlw $8, %%xmm4 \n\t" /* dst = dst >> 8 */
+     "psrlw $8, %%xmm6 \n\t"
+
+     /*
+         xmm6 (dst_l)
+
+         00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
+
+         xmm4 (dst_u)
+
+         00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
+     */
+     "packuswb %%xmm4, %%xmm6 \n\t" /* repack; saturates to 0..255, so no
+                                        separate clamping step is needed */
+
+     /*
+         xmm6 (dst)
+
+         U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
+     */
+     "movdqu %%xmm6, (%2) \n\t" /* store dst */
+
+     /*
+         increment pointers: 8 alpha bytes, 16 image bytes per iteration
+     */
+     "add $0x08, %0 \n\t"
+     "add $0x08, %3 \n\t"
+     "add $0x10, %1 \n\t"
+     "add $0x10, %2 \n\t"
+
+     "dec %4 \n\t"
+     "jnz 1b \n\t"
+
+     : "+r" (alpha_b_ptr), "+r" (src_ptr), "+r" (dest_ptr), "+r" (alpha_a_ptr), "+r" (count)
+     : "r" (weight >> 8), "r" (const1)
+     /* "memory": the asm reads/writes through the pointer operands;
+        "cc": dec/sub update EFLAGS. Without these clobbers the compiler
+        may cache values across the asm or reuse stale flags. */
+     : "cc", "memory",
+       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9"
+ );
diff --git a/src/modules/core/transition_composite.c b/src/modules/core/transition_composite.c
index f7054ac..e5d2154 100644
--- a/src/modules/core/transition_composite.c
+++ b/src/modules/core/transition_composite.c
@@ -368,10 +368,18 @@ static inline uint8_t sample_mix( uint8_t dest, uint8_t src, int mix )
static void composite_line_yuv( uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight, uint16_t *luma, int soft, uint32_t step )
{
- register int j;
+ register int j = 0;
register int mix;
- for ( j = 0; j < width; j ++ )
+#if defined(USE_SSE) && defined(ARCH_X86_64)
+ if ( !luma && width > 7)
+ {
+#include "composite_line_yuv_sse2_simple.c"
+ j = width - width % 8;
+ }
+#endif
+
+ for ( ; j < width; j ++ )
{
mix = calculate_mix( luma, j, soft, weight, *alpha_b ++, step );
*dest = sample_mix( *dest, *src++, mix );
--
1.7.7.6
------------------------------------------------------------------------------
Keep Your Developer Skills Current with LearnDevNow!
The most comprehensive online learning library for Microsoft developers
is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3,
Metro Style Apps, more. Free future releases when you subscribe now!
http://p.sf.net/sfu/learndevnow-d2d
_______________________________________________
Mlt-devel mailing list
Mlt-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mlt-devel