Hi, the attached patch implements an SSE2-optimized swab function, which is actively used by the decklink producer/consumer. It does not give much of a performance boost, but in my case it decreases the thread's CPU usage from 12% to 9% for 1080i25.
-- ________________________________________ Maksym Veremeyenko
>From c1b1417026d5cc2ebf9b743e7d5f1778499514a6 Mon Sep 17 00:00:00 2001 From: Maksym Veremeyenko <ve...@m1.tv> Date: Thu, 13 Feb 2014 16:27:23 +0200 Subject: [PATCH 2/4] implement SSE optimized swab function --- src/modules/decklink/common.cpp | 44 ++++++++++++++++++++++++++++ src/modules/decklink/common.h | 1 + src/modules/decklink/consumer_decklink.cpp | 4 +- src/modules/decklink/producer_decklink.cpp | 4 +- 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/src/modules/decklink/common.cpp b/src/modules/decklink/common.cpp index 83f5e53..cafdda9 100644 --- a/src/modules/decklink/common.cpp +++ b/src/modules/decklink/common.cpp @@ -19,6 +19,7 @@ #include "common.h" #include <stdlib.h> +#include <unistd.h> #ifdef __DARWIN__ @@ -89,3 +90,46 @@ void freeDLString( DLString aDLString ) #endif + +void swab2( const void *from, void *to, int n ) +{ +#if defined(USE_SSE) +#define SWAB_STEP 16 + __asm__ volatile + ( + "loop_start: \n\t" + + /* load */ + "movdqa 0(%[from]), %%xmm0 \n\t" + "add $0x10, %[from] \n\t" + + /* duplicate to temp registers */ + "movdqa %%xmm0, %%xmm1 \n\t" + + /* shift right temp register */ + "psrlw $8, %%xmm1 \n\t" + + /* shift left main register */ + "psllw $8, %%xmm0 \n\t" + + /* compose them back */ + "por %%xmm0, %%xmm1 \n\t" + + /* save */ + "movdqa %%xmm1, 0(%[to]) \n\t" + "add $0x10, %[to] \n\t" + + "dec %[cnt] \n\t" + "jnz loop_start \n\t" + + : + : [from]"r"(from), [to]"r"(to), [cnt]"r"(n / SWAB_STEP) + : "xmm0", "xmm1" + ); + + from = (unsigned char*) from + n - (n % SWAB_STEP); + to = (unsigned char*) to + n - (n % SWAB_STEP); + n = (n % SWAB_STEP); +#endif + swab(from, to, n); +}; diff --git a/src/modules/decklink/common.h b/src/modules/decklink/common.h index 3b48b9c..98a8536 100644 --- a/src/modules/decklink/common.h +++ b/src/modules/decklink/common.h @@ -38,5 +38,6 @@ char* getCString( DLString aDLString ); void freeCString( char* aCString ); void freeDLString( DLString aDLString ); +void swab2( const void *from, void 
*to, int n ); #endif // DECKLINK_COMMON_H diff --git a/src/modules/decklink/consumer_decklink.cpp b/src/modules/decklink/consumer_decklink.cpp index 8f01dea..ac6f5a9 100644 --- a/src/modules/decklink/consumer_decklink.cpp +++ b/src/modules/decklink/consumer_decklink.cpp @@ -416,9 +416,9 @@ public: // Normal non-keyer playout - needs byte swapping if ( !progressive && m_displayMode->GetFieldDominance() == bmdUpperFieldFirst ) // convert lower field first to top field first - swab( (char*) image, (char*) buffer + stride, stride * ( height - 1 ) ); + swab2( (char*) image, (char*) buffer + stride, stride * ( height - 1 ) ); else - swab( (char*) image, (char*) buffer, stride * height ); + swab2( (char*) image, (char*) buffer, stride * height ); } else if ( !mlt_properties_get_int( MLT_FRAME_PROPERTIES( frame ), "test_image" ) ) { diff --git a/src/modules/decklink/producer_decklink.cpp b/src/modules/decklink/producer_decklink.cpp index 9434893..6801fad 100644 --- a/src/modules/decklink/producer_decklink.cpp +++ b/src/modules/decklink/producer_decklink.cpp @@ -432,7 +432,7 @@ public: for ( int i = 1; i < m_vancLines + 1; i++ ) { if ( vanc->GetBufferForVerticalBlankingLine( i, &buffer ) == S_OK ) - swab( (char*) buffer, (char*) image + ( i - 1 ) * video->GetRowBytes(), video->GetRowBytes() ); + swab2( (char*) buffer, (char*) image + ( i - 1 ) * video->GetRowBytes(), video->GetRowBytes() ); else mlt_log_debug( getProducer(), "failed capture vanc line %d\n", i ); } @@ -445,7 +445,7 @@ public: if ( image && buffer ) { size = video->GetRowBytes() * video->GetHeight(); - swab( (char*) buffer, (char*) image + m_vancLines * video->GetRowBytes(), size ); + swab2( (char*) buffer, (char*) image + m_vancLines * video->GetRowBytes(), size ); mlt_frame_set_image( frame, (uint8_t*) image, size, mlt_pool_release ); } else if ( image ) -- 1.7.7.6
------------------------------------------------------------------------------ Android apps run on BlackBerry 10 Introducing the new BlackBerry 10.2.1 Runtime for Android apps. Now with support for Jelly Bean, Bluetooth, Mapview and more. Get your Android app in front of a whole new audience. Start now. http://pubads.g.doubleclick.net/gampad/clk?id=124407151&iu=/4140/ostg.clktrk
_______________________________________________ Mlt-devel mailing list Mlt-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/mlt-devel