---
Athlon64:
C: 161
SSE2: 73
Atom 330:
C: 432
SSE2: 265
Sandy Bridge:
C: 88
SSE2: 50
SSE4.1: 29
libavcodec/x86/ac3dsp.asm | 100 +++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/ac3dsp_mmx.c | 7 +++
2 files changed, 107 insertions(+), 0 deletions(-)
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index 18f9dc3..9430c51 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -27,6 +27,11 @@ SECTION_RODATA
; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000
+; used in ff_ac3_compute_mantissa_size(); note the pw_* tables below hold dwords (dd), not words
+cextern ac3_bap_bits
+pw_bap_mul1: dd 43691, 43691, 0, 65536 ; with a following >>17: x*43691 -> x/3 (43691*3 = 2^17+1), x*65536 -> x/2, for 16-bit x
+pw_bap_mul2: dd 5, 7, 0, 7 ; per-lane weights applied after the >>17; used as a pmulld memory operand, so it must be 16-byte aligned
+
SECTION .text
;-----------------------------------------------------------------------------
@@ -293,3 +298,98 @@ cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
%endif
ja .loop
REP_RET
+
+;------------------------------------------------------------------------------
+; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
+;------------------------------------------------------------------------------
+
+%macro PHADDD4 2 ; %1 = xmm src/dst: sum of its 4 dwords is left in dword 0; %2 = xmm tmp (clobbered)
+ movhlps %2, %1 ; tmp dwords 0-1 = src dwords 2-3
+ paddd %1, %2 ; src[0] += src[2], src[1] += src[3]
+ pshufd %2, %1, 0x1 ; tmp[0] = src[1]
+ paddd %1, %2 ; src[0] = total of all four; the other dwords are not meaningful
+%endmacro
+
+INIT_XMM
+cglobal ac3_compute_mantissa_size_sse2, 1,5,2, mant_cnt, blk, sum, tmp1, tmp2
+    xor         sumd, sumd              ; sum = bits from the grouped counts (words 1, 2 and 4 of each row)
+    mov         blkd, 6                 ; 6 rows of 16 words, per the C prototype
+    pxor          m0, m0                ; m0 = column sums of words 0-7
+    pxor          m1, m1                ; m1 = column sums of words 8-15
+    ALIGN 16
+.loop:
+    paddw         m0, [mant_cntq   ]    ; SSE memory operand: each row must be 16-byte aligned
+    paddw         m1, [mant_cntq+16]
+    movzx      tmp1d, word [mant_cntq+8]
+    shr        tmp1d, 1                 ; tmp1 = cnt[4] / 2
+    movzx      tmp2d, word [mant_cntq+4]
+    imul       tmp2d, 43691             ; (x * 43691) >> 17 == x / 3 for any 16-bit x
+    shr        tmp2d, 17                ; tmp2 = cnt[2] / 3
+    add        tmp2d, tmp1d
+    lea        tmp1d, [tmp2q*8]
+    sub        tmp1d, tmp2d             ; tmp1 = 7 * (cnt[2]/3 + cnt[4]/2)
+    movzx      tmp2d, word [mant_cntq+2]
+    imul       tmp2d, 43691
+    shr        tmp2d, 17                ; tmp2 = cnt[1] / 3
+    lea        tmp2d, [tmp2q+tmp2q*4]   ; tmp2 = 5 * (cnt[1] / 3)
+    add        tmp2d, tmp1d
+    add         sumd, tmp2d
+    add    mant_cntq, 32                ; advance to the next row (16 words)
+    dec         blkd
+    jnz .loop                           ; dec leaves CF untouched, so test ZF only (ja would read a stale carry)
+    pmaddwd       m0, [ac3_bap_bits   ] ; use the name declared by cextern so symbol mangling is applied
+    pmaddwd       m1, [ac3_bap_bits+16] ; weight the column sums by the external table
+    paddd         m0, m1
+    PHADDD4       m0, m1                ; horizontal sum into dword 0 of m0
+    movd         eax, m0
+    add          eax, sumd              ; return table-weighted part + grouped part
+    RET
+
+INIT_XMM
+cglobal ac3_compute_mantissa_size_sse41, 1,2,7, mant_cnt, sum
+    movdqa        m0, [mant_cntq      ] ; m0 = words 0-7 of row 0 (aligned load)
+    movdqa        m1, [mant_cntq+ 1*16] ; m1 = words 8-15 of row 0
+    paddw         m0, [mant_cntq+ 2*16] ; accumulate rows 1-5 column-wise
+    paddw         m1, [mant_cntq+ 3*16]
+    paddw         m0, [mant_cntq+ 4*16]
+    paddw         m1, [mant_cntq+ 5*16]
+    paddw         m0, [mant_cntq+ 6*16]
+    paddw         m1, [mant_cntq+ 7*16]
+    paddw         m0, [mant_cntq+ 8*16]
+    paddw         m1, [mant_cntq+ 9*16]
+    paddw         m0, [mant_cntq+10*16]
+    paddw         m1, [mant_cntq+11*16]
+    pmaddwd       m0, [ac3_bap_bits   ] ; use the name declared by cextern so symbol mangling is applied
+    pmaddwd       m1, [ac3_bap_bits+16] ; weight the column sums by the external table
+    paddd         m0, m1
+    PHADDD4       m0, m1                ; horizontal sum into dword 0 of m0
+    movd        sumd, m0                ; sum = table-weighted part
+    movdqa        m6, [pw_bap_mul1]     ; aligned load: same alignment as the pmulld memory operand below
+    pmovzxwd      m0, [mant_cntq+2]     ; words 1-4 of each row, zero-extended to dwords
+    pmovzxwd      m1, [mant_cntq+1*32+2]
+    pmovzxwd      m2, [mant_cntq+2*32+2]
+    pmovzxwd      m3, [mant_cntq+3*32+2]
+    pmovzxwd      m4, [mant_cntq+4*32+2]
+    pmovzxwd      m5, [mant_cntq+5*32+2]
+    pmulld        m0, m6                ; scale by {43691, 43691, 0, 65536} ...
+    pmulld        m1, m6
+    pmulld        m2, m6
+    pmulld        m3, m6
+    pmulld        m4, m6
+    pmulld        m5, m6
+    psrld         m0, 17                ; ... and >>17: per row {cnt[1]/3, cnt[2]/3, 0, cnt[4]/2}
+    psrld         m1, 17
+    psrld         m2, 17
+    psrld         m3, 17
+    psrld         m4, 17
+    psrld         m5, 17
+    paddd         m0, m1                ; add the per-row quotients of all 6 rows
+    paddd         m0, m2
+    paddd         m0, m3
+    paddd         m0, m4
+    paddd         m0, m5
+    pmulld        m0, [pw_bap_mul2]     ; weight lanes by {5, 7, 0, 7}
+    PHADDD4       m0, m1                ; horizontal sum into dword 0 of m0
+    movd         eax, m0
+    add          eax, sumd              ; return grouped part + table-weighted part
+    RET
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 4750423..961c6f2 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -42,6 +42,9 @@ extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned i
extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
+extern int ff_ac3_compute_mantissa_size_sse2 (uint16_t mant_cnt[6][16]);
+extern int ff_ac3_compute_mantissa_size_sse41(uint16_t mant_cnt[6][16]);
+
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
int mm_flags = av_get_cpu_flags();
@@ -69,6 +72,7 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
c->float_to_fixed24 = ff_float_to_fixed24_sse2;
+ c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
@@ -77,5 +81,8 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
}
+ if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
+ c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse41;
+ }
#endif
}
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel