---
This patch is completely untested. It would be nice to know whether it
works and, if so, whether it is any faster. I don't have a system with
SSE4.1 to test on.

 libavcodec/x86/ac3dsp.asm   |   42 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/ac3dsp_mmx.c |    4 ++++
 2 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index fc05f25..e978944 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -29,6 +29,9 @@ pf_1_24: times 4 dd 0x4B800000
 
 ; used in ff_ac3_compute_mantissa_size()
 cextern ac3_bap_bits
+ALIGN 16 ; both tables below are used directly as 16-byte memory operands of pmulld
+pw_bap_mul1: dd 43691, 43691, 0, 65536 ; dword lanes (despite the pw_ name); 43691*3 = 2^17+1, so with the >>17 that follows: x/3, x/3, 0, x>>1
+pw_bap_mul2: dd 5, 7, 0, 7 ; dword per-lane scale factors applied to the accumulated quotients
 
 SECTION .text
 
@@ -341,3 +344,59 @@ cglobal ac3_compute_mantissa_size_sse2, 1,5,2, mant_cnt, blk, sum, tmp1, tmp2
    movd       eax, m0
    add        eax, sumd
    RET
+
+;-----------------------------------------------------------------------------
+; int ff_ac3_compute_mantissa_size_sse41(uint16_t mant_cnt[6][16])
+;
+; In:   mant_cntq = mant_cnt; it is used as a 16-byte memory operand of
+;       paddw, so it must be 16-byte aligned
+; Out:  eax = sum over the 6 blocks of
+;           (mant_cnt[blk][1] / 3) * 5 + (mant_cnt[blk][2] / 3) * 7 +
+;           (mant_cnt[blk][4] >> 1) * 7
+;       plus the pmaddwd product of the per-index totals with ac3_bap_bits
+; Regs: m0/m1 = per-index word totals for indices 0-7 / 8-15
+;       m2    = zero, used to widen words to dwords
+;       m3    = scratch
+;       m4    = dword totals of the per-block quotients for indices 1-4
+; NOTE(review): adding the two parts is only correct if ac3_bap_bits holds 0
+;       at indices 1, 2 and 4 and is an array of 16-bit words - confirm
+;       against the table definition.
+;-----------------------------------------------------------------------------
+INIT_XMM
+cglobal ac3_compute_mantissa_size_sse41, 1,2,5, mant_cnt, blk
+    mov       blkd, 6                   ; first dimension of mant_cnt
+    pxor        m0, m0
+    pxor        m1, m1
+    pxor        m2, m2
+    pxor        m4, m4
+    ALIGN 16
+.loop:
+    paddw       m0, [mant_cntq   ]
+    paddw       m1, [mant_cntq+16]
+
+    ; quotients are computed per block and accumulated as dwords
+    movq        m3, [mant_cntq+2]       ; words at indices 1, 2, 3, 4
+    punpcklwd   m3, m2                  ; zero-extend to 4 dwords
+    pmulld      m3, [pw_bap_mul1]
+    psrld       m3, 17                  ; lanes: x/3, x/3, 0, x>>1
+    paddd       m4, m3
+
+    add  mant_cntq, 32                  ; next block: 16 words
+    dec       blkd
+    jnz .loop                           ; dec leaves CF alone, so test ZF only
+
+    pmulld      m4, [pw_bap_mul2]       ; scale lanes by 5, 7, 0, 7
+
+    pmaddwd     m0, [ac3_bap_bits   ]
+    pmaddwd     m1, [ac3_bap_bits+16]
+    paddd       m0, m1
+
+    ; horizontal sums: lane 0 of each register ends up holding its total
+    phaddd      m0, m0
+    phaddd      m0, m0
+    phaddd      m4, m4
+    phaddd      m4, m4
+    paddd       m0, m4
+
+    movd       eax, m0
+    RET
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 2664736..1835291 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -43,6 +43,7 @@ extern void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned i
 extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
 
 extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
+extern int ff_ac3_compute_mantissa_size_sse41(uint16_t mant_cnt[6][16]);
 
 av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
 {
@@ -80,5 +81,8 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
     if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
     }
+    if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSSE3 && HAVE_SSE) {
+        c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse41;
+    }
 #endif
 }
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to