vlc | branch: master | Laurent Aimar <[email protected]> | Fri May 25 21:13:00 2012 +0200| [83f2312b574f5cbe289ec63867e584f05c52fff6] | committer: Laurent Aimar
Added support for SSE2 to 16 bit merge (deinterlace). > http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=83f2312b574f5cbe289ec63867e584f05c52fff6 --- modules/video_filter/deinterlace/deinterlace.c | 4 +-- modules/video_filter/deinterlace/merge.c | 32 ++++++++++++++++++++++-- modules/video_filter/deinterlace/merge.h | 11 +++++++- 3 files changed, 42 insertions(+), 5 deletions(-) diff --git a/modules/video_filter/deinterlace/deinterlace.c b/modules/video_filter/deinterlace/deinterlace.c index 414f87b..2762f04 100644 --- a/modules/video_filter/deinterlace/deinterlace.c +++ b/modules/video_filter/deinterlace/deinterlace.c @@ -636,9 +636,9 @@ int Open( vlc_object_t *p_this ) else #endif #if defined(CAN_COMPILE_SSE) - if( chroma->pixel_size == 1 && (vlc_CPU() & CPU_CAPABILITY_SSE2) ) + if( (vlc_CPU() & CPU_CAPABILITY_SSE2) ) { - p_sys->pf_merge = MergeSSE2; + p_sys->pf_merge = chroma->pixel_size == 1 ? Merge8BitSSE2 : Merge16BitSSE2; p_sys->pf_end_merge = EndMMX; } else diff --git a/modules/video_filter/deinterlace/merge.c b/modules/video_filter/deinterlace/merge.c index b6fb619..b462b21 100644 --- a/modules/video_filter/deinterlace/merge.c +++ b/modules/video_filter/deinterlace/merge.c @@ -118,8 +118,8 @@ void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2, #endif #if defined(CAN_COMPILE_SSE) -void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2, - size_t i_bytes ) +void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2, + size_t i_bytes ) { uint8_t *p_dest = _p_dest; const uint8_t *p_s1 = _p_s1; @@ -143,6 +143,34 @@ void MergeSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2, for( ; i_bytes > 0; i_bytes-- ) *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1; } + +void Merge16BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2, + size_t i_bytes ) +{ + uint16_t *p_dest = _p_dest; + const uint16_t *p_s1 = _p_s1; + const uint16_t *p_s2 = _p_s2; + + size_t i_words = i_bytes / 2; + for( ; i_words > 0 && ((uintptr_t)p_s1 & 15); i_words-- ) + *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1; + + for( ; i_words >= 8; i_words -= 8 ) + { + __asm__ __volatile__( "movdqu %2,%%xmm1;" + "pavgw %1, %%xmm1;" + "movdqu %%xmm1, %0" :"=m" (*p_dest): + "m" (*p_s1), + "m" (*p_s2) ); + p_dest += 8; + p_s1 += 8; + p_s2 += 8; + } + + for( ; i_words > 0; i_words-- ) + *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1; +} + #endif #ifdef CAN_COMPILE_C_ALTIVEC diff --git a/modules/video_filter/deinterlace/merge.h b/modules/video_filter/deinterlace/merge.h index 7f4af07..1117b37 100644 --- a/modules/video_filter/deinterlace/merge.h +++ b/modules/video_filter/deinterlace/merge.h @@ -141,7 +141,16 @@ void Merge3DNow ( void *, const void *, const void *, size_t ); * @param _p_s2 Source line B * @param i_bytes Number of bytes to merge */ -void MergeSSE2 ( void *, const void *, const void *, size_t ); +void Merge8BitSSE2( void *, const void *, const void *, size_t ); +/** + * SSE2 routine to blend pixels from two picture lines. + * + * @param _p_dest Target + * @param _p_s1 Source line A + * @param _p_s2 Source line B + * @param i_bytes Number of bytes to merge + */ +void Merge16BitSSE2( void *, const void *, const void *, size_t ); #endif #if defined __ARM_NEON__ _______________________________________________ vlc-commits mailing list [email protected] http://mailman.videolan.org/listinfo/vlc-commits
