Hi,

The attached patch implements an SSE2-optimized swab function, which is actively used by the decklink producer/consumer. It does not give a large performance boost, but in my case it decreases thread CPU usage from 12% to 9% for 1080i25.

--
________________________________________
Maksym Veremeyenko

>From c1b1417026d5cc2ebf9b743e7d5f1778499514a6 Mon Sep 17 00:00:00 2001
From: Maksym Veremeyenko <ve...@m1.tv>
Date: Thu, 13 Feb 2014 16:27:23 +0200
Subject: [PATCH 2/4] implement SSE optimized swab function

---
 src/modules/decklink/common.cpp            |   44 ++++++++++++++++++++++++++++
 src/modules/decklink/common.h              |    1 +
 src/modules/decklink/consumer_decklink.cpp |    4 +-
 src/modules/decklink/producer_decklink.cpp |    4 +-
 4 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/src/modules/decklink/common.cpp b/src/modules/decklink/common.cpp
index 83f5e53..cafdda9 100644
--- a/src/modules/decklink/common.cpp
+++ b/src/modules/decklink/common.cpp
@@ -19,6 +19,7 @@
 
 #include "common.h"
 #include <stdlib.h>
+#include <unistd.h>
 
 #ifdef __DARWIN__
 
@@ -89,3 +90,46 @@ void freeDLString( DLString aDLString )
 
 #endif
 
+
+/* Byte-swap n bytes from 'from' into 'to' like swab(3); with USE_SSE, whole
+ * 16-byte chunks are swapped by an SSE2 loop and only the tail goes to swab(). */
+void swab2( const void *from, void *to, int n )
+{
+#if defined(USE_SSE)
+#define SWAB_STEP 16
+	int cnt = n / SWAB_STEP;
+	/* the loop tests cnt only after a chunk is done, so skip it when n < 16 */
+	if ( cnt > 0 )
+	{
+		__asm__ volatile
+		(
+			/* numeric local label: no clash if this code is inlined or cloned */
+			"1:                                     \n\t"
+			/* load (no alignment requirement) and duplicate to temp register */
+			"movdqu         0(%[from]), %%xmm0      \n\t"
+			"add            $0x10, %[from]          \n\t"
+			"movdqa         %%xmm0, %%xmm1          \n\t"
+
+			/* swap the bytes of each 16-bit word: (w >> 8) | (w << 8) */
+			"psrlw          $8, %%xmm1              \n\t"
+			"psllw          $8, %%xmm0              \n\t"
+			"por            %%xmm0, %%xmm1          \n\t"
+
+			/* save */
+			"movdqu         %%xmm1, 0(%[to])        \n\t"
+			"add            $0x10, %[to]            \n\t"
+
+			"dec            %[cnt]                  \n\t"
+			"jnz            1b                      \n\t"
+
+			/* the loop changes all three operands, so they are read-write */
+			: [from]"+r"(from), [to]"+r"(to), [cnt]"+r"(cnt)
+			:
+			: "xmm0", "xmm1", "memory", "cc"
+		);
+	}
+	/* from/to were advanced past the swapped chunks; only the tail is left */
+	n %= SWAB_STEP;
+#endif
+	swab(from, to, n);
+}
diff --git a/src/modules/decklink/common.h b/src/modules/decklink/common.h
index 3b48b9c..98a8536 100644
--- a/src/modules/decklink/common.h
+++ b/src/modules/decklink/common.h
@@ -38,5 +38,6 @@
 char* getCString( DLString aDLString );
 void freeCString( char* aCString );
 void freeDLString( DLString aDLString );
+void swab2( const void *from, void *to, int n );
 
 #endif // DECKLINK_COMMON_H
diff --git a/src/modules/decklink/consumer_decklink.cpp b/src/modules/decklink/consumer_decklink.cpp
index 8f01dea..ac6f5a9 100644
--- a/src/modules/decklink/consumer_decklink.cpp
+++ b/src/modules/decklink/consumer_decklink.cpp
@@ -416,9 +416,9 @@ public:
 					// Normal non-keyer playout - needs byte swapping
 					if ( !progressive && m_displayMode->GetFieldDominance() == bmdUpperFieldFirst )
 						// convert lower field first to top field first
-						swab( (char*) image, (char*) buffer + stride, stride * ( height - 1 ) );
+						swab2( (char*) image, (char*) buffer + stride, stride * ( height - 1 ) );
 					else
-						swab( (char*) image, (char*) buffer, stride * height );
+						swab2( (char*) image, (char*) buffer, stride * height );
 				}
 				else if ( !mlt_properties_get_int( MLT_FRAME_PROPERTIES( frame ), "test_image" ) )
 				{
diff --git a/src/modules/decklink/producer_decklink.cpp b/src/modules/decklink/producer_decklink.cpp
index 9434893..6801fad 100644
--- a/src/modules/decklink/producer_decklink.cpp
+++ b/src/modules/decklink/producer_decklink.cpp
@@ -432,7 +432,7 @@ public:
 						for ( int i = 1; i < m_vancLines + 1; i++ )
 						{
 							if ( vanc->GetBufferForVerticalBlankingLine( i, &buffer ) == S_OK )
-								swab( (char*) buffer, (char*) image + ( i - 1 ) * video->GetRowBytes(), video->GetRowBytes() );
+								swab2( (char*) buffer, (char*) image + ( i - 1 ) * video->GetRowBytes(), video->GetRowBytes() );
 							else
 								mlt_log_debug( getProducer(), "failed capture vanc line %d\n", i );
 						}
@@ -445,7 +445,7 @@ public:
 				if ( image && buffer )
 				{
 					size =  video->GetRowBytes() * video->GetHeight();
-					swab( (char*) buffer, (char*) image + m_vancLines * video->GetRowBytes(), size );
+					swab2( (char*) buffer, (char*) image + m_vancLines * video->GetRowBytes(), size );
 					mlt_frame_set_image( frame, (uint8_t*) image, size, mlt_pool_release );
 				}
 				else if ( image )
-- 
1.7.7.6

------------------------------------------------------------------------------
Android apps run on BlackBerry 10
Introducing the new BlackBerry 10.2.1 Runtime for Android apps.
Now with support for Jelly Bean, Bluetooth, Mapview and more.
Get your Android app in front of a whole new audience.  Start now.
http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk
_______________________________________________
Mlt-devel mailing list
Mlt-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mlt-devel

Reply via email to