Hi,

I was trying to reach realtime performance for some simple CG operations and found that, during a composite transition operation, MLT creates an alpha plane for a frame that does not have one and does not even require it for further display. I.e. if you put a small CG over an HD frame, MLT creates a full-frame alpha channel, and the further blending operations use that memory for no reason, IMHO.

So I implemented the function mlt_frame_get_alpha_mask_nc, which does the same as mlt_frame_get_alpha_mask but does not create an alpha channel if one does not exist — it just returns NULL. Next, I replaced some code parts to use mlt_frame_get_alpha_mask_nc and to handle the returned NULL value.

finally composite_line_yuv_sse2_simple function was split into 8 variants:

|0| dest_a == NULL | src_a == NULL | weight == 256 | blit
|1| dest_a == NULL | src_a == NULL | weight != 256 | blend: with given alpha
|2| dest_a == NULL | src_a != NULL | weight == 256 | blend: only src alpha
|3| dest_a == NULL | src_a != NULL | weight != 256 | blend: premultiply src alpha
|4| dest_a != NULL | src_a == NULL | weight == 256 | blit: blit and set dst alpha to FF
|5| dest_a != NULL | src_a == NULL | weight != 256 | blend: with given alpha
|6| dest_a != NULL | src_a != NULL | weight == 256 | blend: full blend without src alpha premultiply
|7| dest_a != NULL | src_a != NULL | weight != 256 | blend: full (origin version)

From my tests I did not find any visible regression. Maybe somebody else could also review/test the proposed code.

--
________________________________________
Maksym Veremeyenko


>From 2e973085a151bd43762b17bf37e802cdcb130167 Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Fri, 27 Jun 2014 18:02:16 +0300
Subject: [PATCH 1/6] rename arguments indexes to literal names

---
 src/modules/core/composite_line_yuv_sse2_simple.c |   30 ++++++++++----------
 1 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
index 04eb1ca..049ed9e 100644
--- a/src/modules/core/composite_line_yuv_sse2_simple.c
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -33,9 +33,9 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
     __asm__ volatile
     (
         "pxor           %%xmm0, %%xmm0          \n\t"   /* clear zero register */
-        "movdqu         (%4), %%xmm9            \n\t"   /* load const1 */
-        "movdqu         (%7), %%xmm10           \n\t"   /* load const2 */
-        "movd           %0, %%xmm1              \n\t"   /* load weight and decompose */
+        "movdqu         (%[const1]), %%xmm9     \n\t"   /* load const1 */
+        "movdqu         (%[const2]), %%xmm10    \n\t"   /* load const2 */
+        "movd           %[weight], %%xmm1       \n\t"   /* load weight and decompose */
         "movlhps        %%xmm1, %%xmm1          \n\t"
         "pshuflw        $0, %%xmm1, %%xmm1      \n\t"
         "pshufhw        $0, %%xmm1, %%xmm1      \n\t"
@@ -46,7 +46,7 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
                     00  W 00  W 00  W 00  W 00  W 00  W 00  W 00  W
         */
         "loop_start:                            \n\t"
-        "movq           (%1), %%xmm2            \n\t"   /* load source alpha */
+        "movq           (%[alpha_b]), %%xmm2    \n\t"   /* load source alpha */
         "punpcklbw      %%xmm0, %%xmm2          \n\t"   /* unpack alpha 8 8-bits alphas to 8 16-bits values */
 
         /*
@@ -68,7 +68,7 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
         /*
             DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8
         */
-        "movq           (%5), %%xmm3            \n\t"   /* load dst alpha */
+        "movq           (%[alpha_a]), %%xmm3    \n\t"   /* load dst alpha */
         "punpcklbw      %%xmm0, %%xmm3          \n\t"   /* unpack dst 8 8-bits alphas to 8 16-bits values */
         "movdqa         %%xmm9, %%xmm4          \n\t"
         "psubw          %%xmm3, %%xmm4          \n\t"
@@ -80,10 +80,10 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
         "psrlw          $8, %%xmm4              \n\t"
         "paddw          %%xmm4, %%xmm3          \n\t"
         "packuswb       %%xmm0, %%xmm3          \n\t"
-        "movq           %%xmm3, (%5)            \n\t"   /* save dst alpha */
+        "movq           %%xmm3, (%[alpha_a])    \n\t"   /* save dst alpha */
 
-        "movdqu         (%2), %%xmm3            \n\t"   /* load src */
-        "movdqu         (%3), %%xmm4            \n\t"   /* load dst */
+        "movdqu         (%[src]), %%xmm3        \n\t"   /* load src */
+        "movdqu         (%[dest]), %%xmm4       \n\t"   /* load dst */
         "movdqa         %%xmm3, %%xmm5          \n\t"   /* dub src */
         "movdqa         %%xmm4, %%xmm6          \n\t"   /* dub dst */
 
@@ -185,21 +185,21 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
 
                     U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
         */
-        "movdqu         %%xmm6, (%3)            \n\t"   /* store dst */
+        "movdqu         %%xmm6, (%[dest])       \n\t"   /* store dst */
 
         /*
             increment pointers
         */
-        "add            $0x08, %1               \n\t"
-        "add            $0x08, %5               \n\t"
-        "add            $0x10, %2               \n\t"
-        "add            $0x10, %3               \n\t"
+        "add            $0x08, %[alpha_b]       \n\t"
+        "add            $0x08, %[alpha_a]       \n\t"
+        "add            $0x10, %[src]           \n\t"
+        "add            $0x10, %[dest]          \n\t"
 
-        "dec            %6                      \n\t"
+        "dec            %[width]                \n\t"
         "jnz            loop_start              \n\t"
 
         :
-        : "r" (weight >> 8), "r" (alpha_b), "r" (src), "r" (dest), "r" (const1) , "r" (alpha_a), "r" (width / 8), "r" (const2)
+        : [weight] "r" (weight >> 8), [alpha_b] "r" (alpha_b), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [alpha_a] "r" (alpha_a), [width] "r" (width / 8), [const2] "r" (const2)
         //: "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9", "memory"
     );
 };
-- 
1.7.7.6

>From 91072a4854e50f92fe5dcaa266916751571e1873 Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Fri, 27 Jun 2014 18:03:25 +0300
Subject: [PATCH 2/6] implement mlt_frame_get_alpha_mask_nc

---
 src/framework/mlt.vers    |    1 +
 src/framework/mlt_frame.c |   20 ++++++++++++++++++++
 src/framework/mlt_frame.h |    1 +
 3 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/src/framework/mlt.vers b/src/framework/mlt.vers
index 7c4a22c..7a6dbf7 100644
--- a/src/framework/mlt.vers
+++ b/src/framework/mlt.vers
@@ -105,6 +105,7 @@ MLT_0.8.8 {
     mlt_frame_clone;
     mlt_frame_close;
     mlt_frame_get_alpha_mask;
+    mlt_frame_get_alpha_mask_nc;
     mlt_frame_get_aspect_ratio;
     mlt_frame_get_audio;
     mlt_frame_get_image;
diff --git a/src/framework/mlt_frame.c b/src/framework/mlt_frame.c
index c75327f..d78bd28 100644
--- a/src/framework/mlt_frame.c
+++ b/src/framework/mlt_frame.c
@@ -639,6 +639,26 @@ uint8_t *mlt_frame_get_alpha_mask( mlt_frame self )
 	return alpha;
 }
 
+/** Get the alpha channel associated to the frame (without creating if it has not).
+ *
+ * \public \memberof mlt_frame_s
+ * \param self a frame
+ * \return the alpha channel
+ */
+
+uint8_t *mlt_frame_get_alpha_mask_nc( mlt_frame self )
+{
+	uint8_t *alpha = NULL;
+	if ( self != NULL )
+	{
+		if ( self->get_alpha_mask != NULL )
+			alpha = self->get_alpha_mask( self );
+		if ( alpha == NULL )
+			alpha = mlt_properties_get_data( &self->parent, "alpha", NULL );
+	}
+	return alpha;
+}
+
 /** Get the short name for an audio format.
  *
  * You do not need to deallocate the returned string.
diff --git a/src/framework/mlt_frame.h b/src/framework/mlt_frame.h
index 9d00602..85cb5a6 100644
--- a/src/framework/mlt_frame.h
+++ b/src/framework/mlt_frame.h
@@ -123,6 +123,7 @@ extern int mlt_frame_set_alpha( mlt_frame self, uint8_t *alpha, int size, mlt_de
 extern void mlt_frame_replace_image( mlt_frame self, uint8_t *image, mlt_image_format format, int width, int height );
 extern int mlt_frame_get_image( mlt_frame self, uint8_t **buffer, mlt_image_format *format, int *width, int *height, int writable );
 extern uint8_t *mlt_frame_get_alpha_mask( mlt_frame self );
+extern uint8_t *mlt_frame_get_alpha_mask_nc( mlt_frame self );
 extern int mlt_frame_get_audio( mlt_frame self, void **buffer, mlt_audio_format *format, int *frequency, int *channels, int *samples );
 extern int mlt_frame_set_audio( mlt_frame self, void *buffer, mlt_audio_format, int size, mlt_destructor );
 extern unsigned char *mlt_frame_get_waveform( mlt_frame self, int w, int h );
-- 
1.7.7.6

>From 37ee2a657acf4dc28fcf9013f70f365bc48a40f7 Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Fri, 27 Jun 2014 18:05:31 +0300
Subject: [PATCH 3/6] use mlt_frame_get_alpha_mask_nc instead of original and
 allow alpha channels to be NULL

---
 src/modules/core/transition_composite.c |   31 +++++++++++++++++++------------
 1 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/modules/core/transition_composite.c b/src/modules/core/transition_composite.c
index aa5b196..ce25333 100644
--- a/src/modules/core/transition_composite.c
+++ b/src/modules/core/transition_composite.c
@@ -524,8 +524,10 @@ static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, uint
 	p_dest += x * bpp + y * stride_dest;
 
 	// offset pointer into alpha channel based upon cropping
-	alpha_b += x_src + y_src * stride_src / bpp;
-	alpha_a += x + y * stride_dest / bpp;
+	if ( alpha_b )
+		alpha_b += x_src + y_src * stride_src / bpp;
+	if ( alpha_a )
+		alpha_a += x + y * stride_dest / bpp;
 
 	// offset pointer into luma channel based upon cropping
 	if ( p_luma )
@@ -547,8 +549,10 @@ static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, uint
 	if ( field == 1 )
 	{
 		p_src += stride_src;
-		alpha_b += stride_src / bpp;
-		alpha_a += stride_dest / bpp;
+		if ( alpha_b )
+			alpha_b += stride_src / bpp;
+		if ( alpha_a )
+			alpha_a += stride_dest / bpp;
 		height_src--;
 	}
 
@@ -561,7 +565,8 @@ static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, uint
 	if ( uneven_x != uneven_x_src )
 	{
 		p_src += 2;
-		alpha_b += 1;
+		if ( alpha_b )
+			alpha_b += 1;
 	}
 
 	// now do the compositing only to cropped extents
@@ -571,8 +576,10 @@ static int composite_yuv( uint8_t *p_dest, int width_dest, int height_dest, uint
 
 		p_src += stride_src;
 		p_dest += stride_dest;
-		alpha_b += alpha_b_stride;
-		alpha_a += alpha_a_stride;
+		if ( alpha_b )
+			alpha_b += alpha_b_stride;
+		if ( alpha_a )
+			alpha_a += alpha_a_stride;
 		if ( p_luma )
 			p_luma += alpha_b_stride;
 	}
@@ -1176,13 +1183,13 @@ static int transition_get_image( mlt_frame a_frame, uint8_t **image, mlt_image_f
 		{
 			double aspect_ratio = mlt_frame_get_aspect_ratio( b_frame );
 			get_b_frame_image( self, b_frame, &image_b, &width_b, &height_b, &result );
-			alpha_b = mlt_frame_get_alpha_mask( b_frame );
+			alpha_b = mlt_frame_get_alpha_mask_nc( b_frame );
 			mlt_properties_set_double( a_props, "aspect_ratio", aspect_ratio );
 		}
 
 		// Get the image from the a frame
 		mlt_frame_get_image( a_frame, invert ? &image_b : image, format, width, height, 1 );
-		alpha_a = mlt_frame_get_alpha_mask( a_frame );
+		alpha_a = mlt_frame_get_alpha_mask_nc( a_frame );
 
 		// Optimisation - no compositing required
 		if ( result.item.mix == 0 || ( result.item.w == 0 && result.item.h == 0 ) )
@@ -1226,7 +1233,7 @@ static int transition_get_image( mlt_frame a_frame, uint8_t **image, mlt_image_f
 			mlt_service_unlock( MLT_TRANSITION_SERVICE( self ) );
 			char *operator = mlt_properties_get( properties, "operator" );
 
-			alpha_b = alpha_b == NULL ? mlt_frame_get_alpha_mask( b_frame ) : alpha_b;
+			alpha_b = alpha_b == NULL ? mlt_frame_get_alpha_mask_nc( b_frame ) : alpha_b;
 
 			composite_line_fn line_fn = composite_line_yuv;
 
@@ -1242,10 +1249,10 @@ static int transition_get_image( mlt_frame a_frame, uint8_t **image, mlt_image_f
 			}
 
 			// Allow the user to completely obliterate the alpha channels from both frames
-			if ( mlt_properties_get( properties, "alpha_a" ) )
+			if ( mlt_properties_get( properties, "alpha_a" ) && alpha_a )
 				memset( alpha_a, mlt_properties_get_int( properties, "alpha_a" ), *width * *height );
 
-			if ( mlt_properties_get( properties, "alpha_b" ) )
+			if ( mlt_properties_get( properties, "alpha_b" ) && alpha_b )
 				memset( alpha_b, mlt_properties_get_int( properties, "alpha_b" ), width_b * height_b );
 
 			for ( field = 0; field < ( progressive ? 1 : 2 ); field++ )
-- 
1.7.7.6

>From 61be601d21a4d371f868d57cfeb09a36de6e494e Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Fri, 27 Jun 2014 18:39:05 +0300
Subject: [PATCH 4/6] change alpha_X to more meaningful names

---
 src/modules/core/composite_line_yuv_sse2_simple.c |   14 +++++++-------
 1 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
index 049ed9e..db3a36e 100644
--- a/src/modules/core/composite_line_yuv_sse2_simple.c
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -19,7 +19,7 @@
  */
 
 #include <inttypes.h>
-void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight)
+void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a, int weight)
 {
     const static unsigned char const1[] =
     {
@@ -46,7 +46,7 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
                     00  W 00  W 00  W 00  W 00  W 00  W 00  W 00  W
         */
         "loop_start:                            \n\t"
-        "movq           (%[alpha_b]), %%xmm2    \n\t"   /* load source alpha */
+        "movq           (%[src_a]), %%xmm2      \n\t"   /* load source alpha */
         "punpcklbw      %%xmm0, %%xmm2          \n\t"   /* unpack alpha 8 8-bits alphas to 8 16-bits values */
 
         /*
@@ -68,7 +68,7 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
         /*
             DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8
         */
-        "movq           (%[alpha_a]), %%xmm3    \n\t"   /* load dst alpha */
+        "movq           (%[dest_a]), %%xmm3     \n\t"   /* load dst alpha */
         "punpcklbw      %%xmm0, %%xmm3          \n\t"   /* unpack dst 8 8-bits alphas to 8 16-bits values */
         "movdqa         %%xmm9, %%xmm4          \n\t"
         "psubw          %%xmm3, %%xmm4          \n\t"
@@ -80,7 +80,7 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
         "psrlw          $8, %%xmm4              \n\t"
         "paddw          %%xmm4, %%xmm3          \n\t"
         "packuswb       %%xmm0, %%xmm3          \n\t"
-        "movq           %%xmm3, (%[alpha_a])    \n\t"   /* save dst alpha */
+        "movq           %%xmm3, (%[dest_a])     \n\t"   /* save dst alpha */
 
         "movdqu         (%[src]), %%xmm3        \n\t"   /* load src */
         "movdqu         (%[dest]), %%xmm4       \n\t"   /* load dst */
@@ -190,8 +190,8 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
         /*
             increment pointers
         */
-        "add            $0x08, %[alpha_b]       \n\t"
-        "add            $0x08, %[alpha_a]       \n\t"
+        "add            $0x08, %[src_a]         \n\t"
+        "add            $0x08, %[dest_a]        \n\t"
         "add            $0x10, %[src]           \n\t"
         "add            $0x10, %[dest]          \n\t"
 
@@ -199,7 +199,7 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
         "jnz            loop_start              \n\t"
 
         :
-        : [weight] "r" (weight >> 8), [alpha_b] "r" (alpha_b), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [alpha_a] "r" (alpha_a), [width] "r" (width / 8), [const2] "r" (const2)
+        : [weight] "r" (weight >> 8), [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
         //: "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9", "memory"
     );
 };
-- 
1.7.7.6

>From 9dbf37d103a578ecf5ffc1096798119b2b264a44 Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Sun, 29 Jun 2014 16:26:53 +0300
Subject: [PATCH 5/6] use mlt_frame_get_alpha_mask_nc instead of original and
 allow alpha channels to be NULL

---
 src/framework/mlt_tractor.c             |    2 +-
 src/modules/core/filter_resize.c        |    2 +-
 src/modules/core/transition_composite.c |   15 ++++++++++-----
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/framework/mlt_tractor.c b/src/framework/mlt_tractor.c
index 458a533..47aa72a 100644
--- a/src/framework/mlt_tractor.c
+++ b/src/framework/mlt_tractor.c
@@ -284,7 +284,7 @@ static int producer_get_image( mlt_frame self, uint8_t **buffer, mlt_image_forma
 	mlt_properties_set_data( properties, "movit.convert.fence",
 		mlt_properties_get_data( frame_properties, "movit.convert.fence", NULL ),
 		0, NULL, NULL );
-	data = mlt_frame_get_alpha_mask( frame );
+	data = mlt_frame_get_alpha_mask_nc( frame );
 	mlt_properties_get_data( frame_properties, "alpha", &size );
 	mlt_frame_set_alpha( self, data, size, NULL );
 	self->convert_image = frame->convert_image;
diff --git a/src/modules/core/filter_resize.c b/src/modules/core/filter_resize.c
index 06e5b9a..9beb603 100644
--- a/src/modules/core/filter_resize.c
+++ b/src/modules/core/filter_resize.c
@@ -130,7 +130,7 @@ static uint8_t *frame_resize_image( mlt_frame frame, int owidth, int oheight, in
 
 	// Get the input image, width and height
 	uint8_t *input = mlt_properties_get_data( properties, "image", NULL );
-	uint8_t *alpha = mlt_frame_get_alpha_mask( frame );
+	uint8_t *alpha = mlt_frame_get_alpha_mask_nc( frame );
 	int alpha_size = 0;
 	mlt_properties_get_data( properties, "alpha", &alpha_size );
 
diff --git a/src/modules/core/transition_composite.c b/src/modules/core/transition_composite.c
index ce25333..cedc891 100644
--- a/src/modules/core/transition_composite.c
+++ b/src/modules/core/transition_composite.c
@@ -381,20 +381,25 @@ void composite_line_yuv( uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_
 		j = width - width % 8;
 		dest += j * 2;
 		src += j * 2;
-		alpha_a += j;
-		alpha_b += j;
+		if ( alpha_a )
+			alpha_a += j;
+		if ( alpha_b )
+			alpha_b += j;
 	}
 #endif
 
 	for ( ; j < width; j ++ )
 	{
-		mix = calculate_mix( luma, j, soft, weight, *alpha_b ++, step );
+		mix = calculate_mix( luma, j, soft, weight, (!alpha_b)?0xff:(*alpha_b ++), step );
 		*dest = sample_mix( *dest, *src++, mix );
 		dest++;
 		*dest = sample_mix( *dest, *src++, mix );
 		dest++;
-		*alpha_a = ( mix >> 8 ) | *alpha_a;
-		alpha_a ++;
+		if ( alpha_a )
+		{
+			*alpha_a = ( mix >> 8 ) | *alpha_a;
+			alpha_a ++;
+		};
 	}
 }
 
-- 
1.7.7.6

>From e39f069779b918fed0a14e5ec2a4aa0b11725b56 Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Sun, 29 Jun 2014 16:27:46 +0300
Subject: [PATCH 6/6] implement 8 variants of blending function

---
 src/modules/core/composite_line_yuv_sse2_simple.c |  385 ++++++++++++---------
 1 files changed, 221 insertions(+), 164 deletions(-)

diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
index db3a36e..6434b32 100644
--- a/src/modules/core/composite_line_yuv_sse2_simple.c
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -18,188 +18,245 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <stdio.h>
+#include <string.h>
 #include <inttypes.h>
-void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a, int weight)
+
+const static unsigned char const1[] =
 {
-    const static unsigned char const1[] =
-    {
-        0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00
-    };
-    const static unsigned char const2[] =
-    {
-        0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00
-    };
+    0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00
+};
+const static unsigned char const2[] =
+{
+    0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00
+};
 
-    __asm__ volatile
-    (
-        "pxor           %%xmm0, %%xmm0          \n\t"   /* clear zero register */
-        "movdqu         (%[const1]), %%xmm9     \n\t"   /* load const1 */
+#define LOAD_CONSTS     \
+        "pxor           %%xmm0, %%xmm0          \n\t"   /* clear zero register */       \
+        "movdqu         (%[const1]), %%xmm9     \n\t"   /* load const1 */               \
         "movdqu         (%[const2]), %%xmm10    \n\t"   /* load const2 */
-        "movd           %[weight], %%xmm1       \n\t"   /* load weight and decompose */
-        "movlhps        %%xmm1, %%xmm1          \n\t"
-        "pshuflw        $0, %%xmm1, %%xmm1      \n\t"
-        "pshufhw        $0, %%xmm1, %%xmm1      \n\t"
 
-        /*
-            xmm1 (weight)
+#define LOAD_WEIGHT     \
+        "movd           %[weight], %%xmm1       \n\t"   /* load weight and decompose */ \
+        "movlhps        %%xmm1, %%xmm1          \n\t"                                   \
+        "pshuflw        $0, %%xmm1, %%xmm1      \n\t"                                   \
+        "pshufhw        $0, %%xmm1, %%xmm1      \n\t"
 
-                    00  W 00  W 00  W 00  W 00  W 00  W 00  W 00  W
-        */
-        "loop_start:                            \n\t"
-        "movq           (%[src_a]), %%xmm2      \n\t"   /* load source alpha */
+#define LOAD_SRC_A      \
+        "movq           (%[src_a]), %%xmm2      \n\t"   /* load source alpha */         \
         "punpcklbw      %%xmm0, %%xmm2          \n\t"   /* unpack alpha 8 8-bits alphas to 8 16-bits values */
 
-        /*
-            xmm2 (src alpha)
-            xmm3 (dst alpha)
-
-                    00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
-        */
-        "pmullw         %%xmm1, %%xmm2          \n\t"   /* premultiply source alpha */
+#define SRC_A_PREMUL    \
+        "pmullw         %%xmm1, %%xmm2          \n\t"   /* premultiply source alpha */  \
         "psrlw          $8, %%xmm2              \n\t"
 
-        /*
-            xmm2 (premultiplied)
-
-                    00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
-        */
-
-
-        /*
-            DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8
-        */
-        "movq           (%[dest_a]), %%xmm3     \n\t"   /* load dst alpha */
-        "punpcklbw      %%xmm0, %%xmm3          \n\t"   /* unpack dst 8 8-bits alphas to 8 16-bits values */
-        "movdqa         %%xmm9, %%xmm4          \n\t"
-        "psubw          %%xmm3, %%xmm4          \n\t"
-        "pmullw         %%xmm2, %%xmm4          \n\t"
-        "movdqa         %%xmm4, %%xmm5          \n\t"
-        "psrlw          $8, %%xmm4              \n\t"
-        "paddw          %%xmm5, %%xmm4          \n\t"
-        "paddw          %%xmm10, %%xmm4         \n\t"
-        "psrlw          $8, %%xmm4              \n\t"
-        "paddw          %%xmm4, %%xmm3          \n\t"
-        "packuswb       %%xmm0, %%xmm3          \n\t"
+# define DST_A_CALC \
+        /* DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8  */ \
+        "movq           (%[dest_a]), %%xmm3     \n\t"   /* load dst alpha */    \
+        "punpcklbw      %%xmm0, %%xmm3          \n\t"   /* unpack dst 8 8-bits alphas to 8 16-bits values */    \
+        "movdqa         %%xmm9, %%xmm4          \n\t"   \
+        "psubw          %%xmm3, %%xmm4          \n\t"   \
+        "pmullw         %%xmm2, %%xmm4          \n\t"   \
+        "movdqa         %%xmm4, %%xmm5          \n\t"   \
+        "psrlw          $8, %%xmm4              \n\t"   \
+        "paddw          %%xmm5, %%xmm4          \n\t"   \
+        "paddw          %%xmm10, %%xmm4         \n\t"   \
+        "psrlw          $8, %%xmm4              \n\t"   \
+        "paddw          %%xmm4, %%xmm3          \n\t"   \
+        "packuswb       %%xmm0, %%xmm3          \n\t"   \
         "movq           %%xmm3, (%[dest_a])     \n\t"   /* save dst alpha */
 
-        "movdqu         (%[src]), %%xmm3        \n\t"   /* load src */
-        "movdqu         (%[dest]), %%xmm4       \n\t"   /* load dst */
-        "movdqa         %%xmm3, %%xmm5          \n\t"   /* dub src */
-        "movdqa         %%xmm4, %%xmm6          \n\t"   /* dub dst */
-
-        /*
-            xmm3 (src)
-            xmm4 (dst)
-            xmm5 (src)
-            xmm6 (dst)
-
-                    U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
-        */
-
-        "punpcklbw      %%xmm0, %%xmm5          \n\t"   /* unpack src low */
-        "punpcklbw      %%xmm0, %%xmm6          \n\t"   /* unpack dst low */
-        "punpckhbw      %%xmm0, %%xmm3          \n\t"   /* unpack src high */
-        "punpckhbw      %%xmm0, %%xmm4          \n\t"   /* unpack dst high */
-
-        /*
-            xmm5 (src_l)
-            xmm6 (dst_l)
-
-                    00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
-
-            xmm3 (src_u)
-            xmm4 (dst_u)
-
-                    00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
-        */
-
-        "movdqa         %%xmm2, %%xmm7          \n\t"   /* dub alpha */
-        "movdqa         %%xmm2, %%xmm8          \n\t"   /* dub alpha */
-        "movlhps        %%xmm7, %%xmm7          \n\t"   /* dub low */
-        "movhlps        %%xmm8, %%xmm8          \n\t"   /* dub high */
-
-        /*
-            xmm7 (src alpha)
-
-                    00 A4 00 A3 00 A2 00 A1 00 A4 00 A3 00 A2 00 A1
-            xmm8 (src alpha)
-
-                    00 A8 00 A7 00 A6 00 A5 00 A8 00 A7 00 A6 00 A5
-        */
-
-        "pshuflw        $0x50, %%xmm7, %%xmm7     \n\t"
-        "pshuflw        $0x50, %%xmm8, %%xmm8     \n\t"
-        "pshufhw        $0xFA, %%xmm7, %%xmm7     \n\t"
-        "pshufhw        $0xFA, %%xmm8, %%xmm8     \n\t"
-
-        /*
-            xmm7 (src alpha lower)
-
-                    00 A4 00 A4 00 A3 00 A3 00 A2 00 A2 00 A1 00 A1
-
-            xmm8 (src alpha upper)
-                    00 A8 00 A8 00 A7 00 A7 00 A6 00 A6 00 A5 00 A5
-        */
-
-
-        /*
-            DST = SRC * ALPHA + DST * (0xFF - ALPHA)
-                SRC * ALPHA + DST * 0xFF - DST * ALPHA
-                (SRC - DST) * ALPHA + DST * 0xFF
-
-        */
-        "psubw          %%xmm4, %%xmm3          \n\t"   /* src = src - dst */
-        "psubw          %%xmm6, %%xmm5          \n\t"
-        "pmullw         %%xmm8, %%xmm3          \n\t"   /* src = src * alpha */
-        "pmullw         %%xmm7, %%xmm5          \n\t"
-        "pmullw         %%xmm9, %%xmm4          \n\t"   /* dst = dst * 0xFF */
-        "pmullw         %%xmm9, %%xmm6          \n\t"
-        "paddw          %%xmm3, %%xmm4          \n\t"   /* dst = dst + src */
-        "paddw          %%xmm5, %%xmm6          \n\t"
-        "movdqa         %%xmm4, %%xmm3          \n\t"   /* dst = ((dst >> 8) + dst + 128) >> 8 */
-        "movdqa         %%xmm6, %%xmm5          \n\t"
-        "psrlw          $8, %%xmm4              \n\t"
-        "psrlw          $8, %%xmm6              \n\t"
-        "paddw          %%xmm3, %%xmm4          \n\t"
-        "paddw          %%xmm5, %%xmm6          \n\t"
-        "paddw          %%xmm10, %%xmm4         \n\t"
-        "paddw          %%xmm10, %%xmm6         \n\t"
-        "psrlw          $8, %%xmm4              \n\t"
-        "psrlw          $8, %%xmm6              \n\t"
-//        "pminsw         %%xmm9, %%xmm4          \n\t"   /* clamp values */
-//        "pminsw         %%xmm9, %%xmm6          \n\t"
-
-        /*
-            xmm6 (dst_l)
-
-                    00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
-
-            xmm4 (dst_u)
-
-                    00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
-        */
-        "packuswb       %%xmm4, %%xmm6          \n\t"
-
-        /*
-            xmm6 (dst)
-
-                    U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
-        */
+#define DST_PIX_CALC \
+        "movdqu         (%[src]), %%xmm3        \n\t"   /* load src */          \
+        "movdqu         (%[dest]), %%xmm4       \n\t"   /* load dst */          \
+        "movdqa         %%xmm3, %%xmm5          \n\t"   /* dub src */           \
+        "movdqa         %%xmm4, %%xmm6          \n\t"   /* dub dst */           \
+        "punpcklbw      %%xmm0, %%xmm5          \n\t"   /* unpack src low */    \
+        "punpcklbw      %%xmm0, %%xmm6          \n\t"   /* unpack dst low */    \
+        "punpckhbw      %%xmm0, %%xmm3          \n\t"   /* unpack src high */   \
+        "punpckhbw      %%xmm0, %%xmm4          \n\t"   /* unpack dst high */   \
+        "movdqa         %%xmm2, %%xmm7          \n\t"   /* dub alpha */         \
+        "movdqa         %%xmm2, %%xmm8          \n\t"   /* dub alpha */         \
+        "movlhps        %%xmm7, %%xmm7          \n\t"   /* dub low */           \
+        "movhlps        %%xmm8, %%xmm8          \n\t"   /* dub high */          \
+        "pshuflw        $0x50, %%xmm7, %%xmm7   \n\t"                           \
+        "pshuflw        $0x50, %%xmm8, %%xmm8   \n\t"                           \
+        "pshufhw        $0xFA, %%xmm7, %%xmm7   \n\t"                           \
+        "pshufhw        $0xFA, %%xmm8, %%xmm8   \n\t"                           \
+        "psubw          %%xmm4, %%xmm3          \n\t"   /* src = src - dst */   \
+        "psubw          %%xmm6, %%xmm5          \n\t"                           \
+        "pmullw         %%xmm8, %%xmm3          \n\t"   /* src = src * alpha */ \
+        "pmullw         %%xmm7, %%xmm5          \n\t"                           \
+        "pmullw         %%xmm9, %%xmm4          \n\t"   /* dst = dst * 0xFF */  \
+        "pmullw         %%xmm9, %%xmm6          \n\t"                           \
+        "paddw          %%xmm3, %%xmm4          \n\t"   /* dst = dst + src */   \
+        "paddw          %%xmm5, %%xmm6          \n\t"                           \
+        "movdqa         %%xmm4, %%xmm3          \n\t"   /* dst = ((dst >> 8) + dst + 128) >> 8 */ \
+        "movdqa         %%xmm6, %%xmm5          \n\t"                           \
+        "psrlw          $8, %%xmm4              \n\t"                           \
+        "psrlw          $8, %%xmm6              \n\t"                           \
+        "paddw          %%xmm3, %%xmm4          \n\t"                           \
+        "paddw          %%xmm5, %%xmm6          \n\t"                           \
+        "paddw          %%xmm10, %%xmm4         \n\t"                           \
+        "paddw          %%xmm10, %%xmm6         \n\t"                           \
+        "psrlw          $8, %%xmm4              \n\t"                           \
+        "psrlw          $8, %%xmm6              \n\t"                           \
+        "packuswb       %%xmm4, %%xmm6          \n\t"                           \
         "movdqu         %%xmm6, (%[dest])       \n\t"   /* store dst */
 
-        /*
-            increment pointers
-        */
+/* Advance both pixel pointers by 16 bytes (one chunk of 8 packed yuv422
+   pixels, 2 bytes each) and decrement the remaining-chunk counter.
+   Leaves ZF set when the counter reaches zero, for the caller's jnz loop. */
+#define PIX_POINTER_INC \
+        "add            $0x10, %[src]           \n\t"   \
+        "add            $0x10, %[dest]          \n\t"   \
+        "dec            %[width]                \n\t"
+
+//  | 7   | dest_a != NULL | src_a != NULL | weight != 256 | blend: full (origin version)
+// NOTE(review): the loop runs width / 8 times via dec/jnz, so width must be a
+// non-zero multiple of 8 — TODO confirm all callers guarantee this (width/8 == 0
+// would wrap the counter and overrun).
+// NOTE(review): the asm advances the pointer/counter operands, but they are
+// declared as plain inputs with no clobber list (see the commented-out clobbers
+// in the original) — "+r" in/out constraints would be safer; verify before
+// enabling higher optimization levels.
+static void blend_case7(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a, int weight)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        LOAD_WEIGHT
+        "loop_start7:                           \n\t"
+        LOAD_SRC_A
+        SRC_A_PREMUL
+        DST_A_CALC
+        DST_PIX_CALC
         "add            $0x08, %[src_a]         \n\t"
         "add            $0x08, %[dest_a]        \n\t"
-        "add            $0x10, %[src]           \n\t"
-        "add            $0x10, %[dest]          \n\t"
+        PIX_POINTER_INC
+        "jnz            loop_start7             \n\t"
+        :
+        : [weight] "r" (weight), [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
+    );
+};
 
-        "dec            %[width]                \n\t"
-        "jnz            loop_start              \n\t"
+//  | 3   | dest_a == NULL | src_a != NULL | weight != 256 | blend: premultiply src alpha
+// src alpha is scaled by the weight (SRC_A_PREMUL) before blending; no dest
+// alpha plane is read or written.
+// NOTE(review): loops width / 8 times via dec/jnz — width must be a non-zero
+// multiple of 8, TODO confirm callers guarantee this.
+static void blend_case3(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, int weight)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        LOAD_WEIGHT
+        "loop_start3:                           \n\t"
+        LOAD_SRC_A
+        SRC_A_PREMUL
+        DST_PIX_CALC
+        "add            $0x08, %[src_a]         \n\t"
+        PIX_POINTER_INC
+        "jnz            loop_start3             \n\t"
+        :
+        : [weight] "r" (weight), [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1), [width] "r" (width / 8), [const2] "r" (const2)
+    );
+};
 
+//  | 2   | dest_a == NULL | src_a != NULL | weight == 256 | blend: only src alpha
+// (dispatch table and the weight != 256 test in the dispatcher use 256, not
+// 255 — comment fixed accordingly.)  Full-weight blend driven purely by the
+// source alpha plane; no weight scaling, no dest alpha plane.
+// NOTE(review): loops width / 8 times via dec/jnz — width must be a non-zero
+// multiple of 8, TODO confirm callers guarantee this.
+static void blend_case2(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        "loop_start2:                           \n\t"
+        LOAD_SRC_A
+        DST_PIX_CALC
+        "add            $0x08, %[src_a]         \n\t"
+        PIX_POINTER_INC
+        "jnz            loop_start2             \n\t"
         :
-        : [weight] "r" (weight >> 8), [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
-        //: "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9", "memory"
+        : [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [width] "r" (width / 8), [const2] "r" (const2)
+    );
+};
+
+
+//  | 1   | dest_a == NULL | src_a == NULL | weight != 256 | blend: with given alpha
+// No alpha planes at all: the broadcast weight (loaded into xmm1 by
+// LOAD_WEIGHT) acts as a constant alpha for every pixel.
+// NOTE(review): loops width / 8 times via dec/jnz — width must be a non-zero
+// multiple of 8, TODO confirm callers guarantee this.
+static void blend_case1(uint8_t *dest, uint8_t *src, int width, int weight)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        LOAD_WEIGHT
+        "loop_start1:                           \n\t"
+        "movdqa         %%xmm1, %%xmm2          \n\t"   /* src alpha comes from weight */
+        DST_PIX_CALC
+        PIX_POINTER_INC
+        "jnz            loop_start1             \n\t"
+        :
+        : [weight] "r" (weight), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1), [width] "r" (width / 8), [const2] "r" (const2)
+    );
+};
+
+//  | 5   | dest_a != NULL | src_a == NULL | weight != 256 | blend: with given alpha
+// Like case 1, but a dest alpha plane exists and is updated (DST_A_CALC);
+// the constant weight in xmm1 supplies the per-pixel source alpha.
+// NOTE(review): loops width / 8 times via dec/jnz — width must be a non-zero
+// multiple of 8, TODO confirm callers guarantee this.
+static void blend_case5(uint8_t *dest, uint8_t *src, int width, uint8_t *dest_a, int weight)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        LOAD_WEIGHT
+        "loop_start5:                           \n\t"
+        "movdqa         %%xmm1, %%xmm2          \n\t"   /* source alpha comes from weight */
+        DST_A_CALC
+        DST_PIX_CALC
+        "add            $0x08, %[dest_a]        \n\t"
+        PIX_POINTER_INC
+        "jnz            loop_start5             \n\t"
+
+        :
+        : [weight] "r" (weight), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
+    );
+};
+
+//  | 6   | dest_a != NULL | src_a != NULL | weight == 256 | blend: full blend without src alpha premultiply
+// Full-weight blend using the source alpha plane, updating the dest alpha
+// plane; identical to case 7 minus the SRC_A_PREMUL weight scaling.
+// NOTE(review): loops width / 8 times via dec/jnz — width must be a non-zero
+// multiple of 8, TODO confirm callers guarantee this.
+static void blend_case6(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a)
+{
+    __asm__ volatile
+    (
+        LOAD_CONSTS
+        "loop_start6:                           \n\t"
+        LOAD_SRC_A
+        DST_A_CALC
+        DST_PIX_CALC
+        "add            $0x08, %[src_a]         \n\t"
+        "add            $0x08, %[dest_a]        \n\t"
+        PIX_POINTER_INC
+        /* BUGFIX: was "jnz loop_start7" — that label belongs to blend_case7's
+           asm block, so after the first chunk this loop would branch into the
+           wrong function's loop body (or fail to assemble if case 7 were
+           absent). Branch back to this function's own label. */
+        "jnz            loop_start6             \n\t"
+        :
+        : [src_a] "r" (src_a), [src] "r" (src), [dest] "r" (dest), [const1] "r" (const1) , [dest_a] "r" (dest_a), [width] "r" (width / 8), [const2] "r" (const2)
+    );
+}
+
+
+/**
+ * Composite one line of packed yuv422 pixels from src over dest.
+ *
+ * dest/src  - packed yuv422 pixel lines (2 bytes per pixel)
+ * width     - pixel count; looped in chunks of 8 by the blend helpers
+ * src_a     - source alpha plane, or NULL if the source is fully opaque
+ * dest_a    - dest alpha plane to read/update, or NULL if none is kept
+ * weight    - opacity in 0..65536 fixed point; scaled below to 0..256
+ *
+ * Dispatches to the cheapest specialized routine for the combination of
+ * "has src alpha", "has dest alpha" and "full weight".
+ */
+void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *src_a, uint8_t *dest_a, int weight)
+{
+    /* scale incoming fixed-point weight to 0..256 (256 == fully opaque) */
+    weight >>= 8;
+
+    /*
+        | 0   | dest_a == NULL | src_a == NULL | weight == 256 | blit
+        | 1   | dest_a == NULL | src_a == NULL | weight != 256 | blend: with given alpha
+        | 2   | dest_a == NULL | src_a != NULL | weight == 256 | blend: only src alpha
+        | 3   | dest_a == NULL | src_a != NULL | weight != 256 | blend: premultiply src alpha
+        | 4   | dest_a != NULL | src_a == NULL | weight == 256 | blit: blit and set dst alpha to FF
+        | 5   | dest_a != NULL | src_a == NULL | weight != 256 | blend: with given alpha
+        | 6   | dest_a != NULL | src_a != NULL | weight == 256 | blend: full blend without src alpha premultiply
+        | 7   | dest_a != NULL | src_a != NULL | weight != 256 | blend: full (origin version)
+    */
+
+    int cond = ((dest_a != NULL) ? 4 : 0) + ((src_a != NULL) ? 2 : 0) + ((weight != 256) ? 1 : 0);
+
+    switch (cond)
+    {
+        case 0: /* opaque, no alpha planes: plain blit (2 bytes/pixel) */
+            memcpy(dest, src, 2 * width);
+            break;
+        case 1: blend_case1(dest, src, width, weight); break;
+        case 2: blend_case2(dest, src, width, src_a); break;
+        case 3: blend_case3(dest, src, width, src_a, weight); break;
+        case 4: /* opaque blit; mark every dest pixel fully opaque */
+            memcpy(dest, src, 2 * width);
+            memset(dest_a, 0xFF, width);
+            break;
+        case 5: blend_case5(dest, src, width, dest_a, weight); break;
+        case 6: blend_case6(dest, src, width, src_a, dest_a); break;
+        case 7: blend_case7(dest, src, width, src_a, dest_a, weight); break;
+        default:
+            /* unreachable: cond is always 0..7; kept as a defensive trap.
+               (message typo "UNHANDLE" fixed; stray ';' after the braces
+               removed — a file-scope empty declaration is not valid C) */
+            fprintf(stderr, "%s: unhandled case %d\n", __FUNCTION__, cond);
+            break;
+    }
+}
-- 
1.7.7.6

------------------------------------------------------------------------------
Open source business process management suite built on Java and Eclipse
Turn processes into business applications with Bonita BPM Community Edition
Quickly connect people, data, and systems into organized workflows
Winner of BOSSIE, CODIE, OW2 and Gartner awards
http://p.sf.net/sfu/Bonitasoft
_______________________________________________
Mlt-devel mailing list
Mlt-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mlt-devel

Reply via email to