---
This patch is completely untested. It would be nice to know if it
works, and if so, if it's any faster. I don't have a system with
SSE4.1.
libavcodec/x86/ac3dsp.asm | 42 ++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/ac3dsp_mmx.c | 4 ++++
2 files changed, 46 insertions(+), 0 deletions(-)
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index fc05f25..e978944 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -29,6 +29,9 @@ pf_1_24: times 4 dd 0x4B800000
; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
+ALIGN 16 ; both tables are used as 16-byte pmulld memory operands
+pw_bap_mul1: dd 43691, 43691, 0, 65536 ; dword multipliers; with the later psrld 17: x/3, x/3, 0, x/2
+pw_bap_mul2: dd 5, 7, 0, 7 ; per-lane weights applied to the accumulated quotients
SECTION .text
@@ -341,3 +344,42 @@ cglobal ac3_compute_mantissa_size_sse2, 1,5,2, mant_cnt, blk, sum, tmp1, tmp2
movd eax, m0
add eax, sumd
RET
+
+INIT_XMM                                      ; m0-m4 below are XMM registers
+cglobal ac3_compute_mantissa_size_sse41, 1,5,5, mant_cnt, blk, sum, tmp1, tmp2 ; in: mant_cnt (6 rows x 16 words), out: eax
+    xor       sumq, sumq                      ; sum is only zeroed here and added back before RET
+    mov       blkd, 6                         ; loop counter: six 32-byte rows are processed
+    pxor        m0, m0                        ; m0 = running word sums of row entries 0-7
+    pxor        m1, m1                        ; m1 = running word sums of row entries 8-15
+    pxor        m2, m2                        ; m2 = zero, used to widen words to dwords
+    pxor        m4, m4                        ; m4 = running dword sums of the scaled entries
+    ALIGN 16
+.loop:
+    paddw       m0, [mant_cntq   ]
+    paddw       m1, [mant_cntq+16]
+    ; entries 1..4 of this row -> dword lanes {c1, c2, c3, c4}
+    movq        m3, [mant_cntq+2]
+    punpcklwd   m3, m2
+    pmulld      m3, [pw_bap_mul1]             ; lanes * {43691, 43691, 0, 65536}
+    psrld       m3, 17                        ; -> {c1/3, c2/3, 0, c4/2} for 16-bit inputs
+    paddd       m4, m3
+    ; next row; dec does not write CF, so the loop branch tests ZF only
+    add  mant_cntq, 32
+    dec       blkd
+    jnz .loop
+    ; weight the accumulated quotients: lanes * {5, 7, 0, 7}
+    pmulld      m4, [pw_bap_mul2]
+    ; multiply the summed counts by the 16-word table and add adjacent pairs
+    pmaddwd     m0, [ff_ac3_bap_bits   ]
+    pmaddwd     m1, [ff_ac3_bap_bits+16]
+    paddd       m0, m1
+    ; two horizontal adds leave the four-lane total in every lane
+    phaddd      m0, m0
+    phaddd      m0, m0
+    phaddd      m4, m4
+    phaddd      m4, m4
+    paddd       m0, m4
+    ; return value = low dword of the combined total (+ sum, which is still zero)
+    movd       eax, m0
+    add        eax, sumd
+    RET
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 2664736..1835291 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -43,6 +43,7 @@ extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned i
extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
+extern int ff_ac3_compute_mantissa_size_sse41(uint16_t mant_cnt[6][16]);
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
@@ -80,5 +81,8 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
if (mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSSE3) {
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
}
+ if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSSE3 && HAVE_SSE) { /* NOTE(review): runtime flag is SSE4 but the build guards are HAVE_SSSE3/HAVE_SSE - confirm this is the intended guard */
+ c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse41; /* route compute_mantissa_size to the SSE4.1 routine */
+ }
#endif
}
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel