Hi,
i was tried to reach realtime performance for some simple cg operation
and found that during composite transition operation MLT create an alpha
plane for frame that has not it and not even require for further
display. i.e. if you put a small CG over HD frame, MLT create a full
frame alpha channel and further blending operation use it memory for no
reason IMHO.
so i implement function mlt_frame_get_alpha_mask_nc that do the same as
mlt_frame_get_alpha_mask but do not create alpha channel if not exist -
it just return NULL. next i replaced some code parts for using
mlt_frame_get_alpha_mask_nc and handling returned NULL value.
finally composite_line_yuv_sse2_simple function was split into 8 variants:
|0| dest_a == NULL | src_a == NULL | weight == 256 | blit
|1| dest_a == NULL | src_a == NULL | weight != 256 | blend: with given alpha
|2| dest_a == NULL | src_a != NULL | weight == 256 | blend: only src alpha
|3| dest_a == NULL | src_a != NULL | weight != 256 | blend: premultiply
src alpha
|4| dest_a != NULL | src_a == NULL | weight == 256 | blit: blit and set
dst alpha to FF
|5| dest_a != NULL | src_a == NULL | weight != 256 | blend: with given alpha
|6| dest_a != NULL | src_a != NULL | weight == 256 | blend: full blend
without src alpha premutiply
|7| dest_a != NULL | src_a != NULL | weight != 256 | blend: full (origin
version)
from my tests i did not found visible regression. may be somebody else
could also review/test proposed code.
--
Maksym Veremeyenko
From 2e973085a151bd43762b17bf37e802cdcb130167 Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko ve...@m1.tv
Date: Fri, 27 Jun 2014 18:02:16 +0300
Subject: [PATCH 1/6] rename arguments indexes to literal names
---
src/modules/core/composite_line_yuv_sse2_simple.c | 30 ++--
1 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
index 04eb1ca..049ed9e 100644
--- a/src/modules/core/composite_line_yuv_sse2_simple.c
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -33,9 +33,9 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
__asm__ volatile
(
pxor %%xmm0, %%xmm0 \n\t /* clear zero register */
-movdqu (%4), %%xmm9\n\t /* load const1 */
-movdqu (%7), %%xmm10 \n\t /* load const2 */
-movd %0, %%xmm1 \n\t /* load weight and decompose */
+movdqu (%[const1]), %%xmm9 \n\t /* load const1 */
+movdqu (%[const2]), %%xmm10\n\t /* load const2 */
+movd %[weight], %%xmm1 \n\t /* load weight and decompose */
movlhps%%xmm1, %%xmm1 \n\t
pshuflw$0, %%xmm1, %%xmm1 \n\t
pshufhw$0, %%xmm1, %%xmm1 \n\t
@@ -46,7 +46,7 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
00 W 00 W 00 W 00 W 00 W 00 W 00 W 00 W
*/
loop_start:\n\t
-movq (%1), %%xmm2\n\t /* load source alpha */
+movq (%[alpha_b]), %%xmm2\n\t /* load source alpha */
punpcklbw %%xmm0, %%xmm2 \n\t /* unpack alpha 8 8-bits alphas to 8 16-bits values */
/*
@@ -68,7 +68,7 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
/*
DSTa = DSTa + (SRCa * (0xFF - DSTa)) 8
*/
-movq (%5), %%xmm3\n\t /* load dst alpha */
+movq (%[alpha_a]), %%xmm3\n\t /* load dst alpha */
punpcklbw %%xmm0, %%xmm3 \n\t /* unpack dst 8 8-bits alphas to 8 16-bits values */
movdqa %%xmm9, %%xmm4 \n\t
psubw %%xmm3, %%xmm4 \n\t
@@ -80,10 +80,10 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint
psrlw $8, %%xmm4 \n\t
paddw %%xmm4, %%xmm3 \n\t
packuswb %%xmm0, %%xmm3 \n\t
-movq %%xmm3, (%5)\n\t /* save dst alpha */
+movq %%xmm3, (%[alpha_a])\n\t /* save dst alpha */
-movdqu (%2), %%xmm3\n\t /* load src */
-movdqu (%3), %%xmm4\n\t /* load dst */
+movdqu (%[src]), %%xmm3\n\t /* load src */
+movdqu (%[dest]), %%xmm4 \n\t /* load dst */
movdqa %%xmm3, %%xmm5 \n\t /* dub src */
movdqa %%xmm4, %%xmm6 \n\t /* dub dst */
@@ -185,21 +185,21 @@ void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint