This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit cd886bf0a5dd8984dab002e40e396b7c96d38781 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Tue Mar 24 15:55:45 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Sat Mar 28 11:25:38 2026 +0100 avcodec/x86/sbcdsp: Port ff_sbc_analyze_[48]_mmx to SSE2 Halves the number of pmaddwd instructions and improves performance a lot: sbc_analyze_4_c: 55.7 ( 1.00x) sbc_analyze_4_mmx: 7.0 ( 7.94x) sbc_analyze_4_sse2: 4.3 (12.93x) sbc_analyze_8_c: 131.1 ( 1.00x) sbc_analyze_8_mmx: 22.4 ( 5.84x) sbc_analyze_8_sse2: 10.7 (12.25x) It also saves 224B of .text and allows removing the emms_c() call from sbcenc.c (notice that ff_sbc_calc_scalefactors_mmx() issues emms on its own, so it already abides by the ABI). Hint: A pshufd could be avoided per function if the constants were reordered. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/sbcenc.c | 2 - libavcodec/x86/sbcdsp.asm | 126 +++++++++++++++++++++++-------------------- libavcodec/x86/sbcdsp_init.c | 12 +++-- tests/checkasm/sbcdsp.c | 2 +- 4 files changed, 75 insertions(+), 67 deletions(-) diff --git a/libavcodec/sbcenc.c b/libavcodec/sbcenc.c index bc2f844789..7e047cd5ab 100644 --- a/libavcodec/sbcenc.c +++ b/libavcodec/sbcenc.c @@ -31,7 +31,6 @@ */ #include "libavutil/channel_layout.h" -#include "libavutil/emms.h" #include "libavutil/opt.h" #include "avcodec.h" #include "codec_internal.h" @@ -322,7 +321,6 @@ static int sbc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt, frame->blocks, frame->channels, frame->subbands); - emms_c(); sbc_pack_frame(avpkt, frame, j, sbc->msbc); *got_packet_ptr = 1; diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm index ddc1237d8f..3351e2aadf 100644 --- a/libavcodec/x86/sbcdsp.asm +++ b/libavcodec/x86/sbcdsp.asm @@ -38,43 +38,44 @@ SECTION .text %endif %endmacro -%macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset - NIDN movq, %5, %3 - NIDN movq, %6, %4 - pmaddwd %5, [constsq+%9] - pmaddwd %6, [constsq+%9+8] - NIDN 
paddd, %1, %7 - NIDN paddd, %2, %8 -%endmacro - -%macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset - ANALYZE_MAC %1, %2, [inq+%7], [inq+%7+8], %3, %4, %5, %6, %7 -%endmacro - -%macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack -%ifidn %7, pack - psrad %3, 16 ; SBC_PROTO_FIXED_SCALE - packssdw %3, %3 +%macro ANALYZE_MAC 6 ; out1, out2, tmp1, tmp2, offset, aligned + mov%6 %3, [inq+%5] + mov%6 %4, [inq+%5+mmsize] +%if %5 == 0 + pcmpeqd m0, m0 + psrld m0, 31 +%endif + pmaddwd %3, [constsq+%5] + pmaddwd %4, [constsq+%5+mmsize] +%if %5 == 0 + pslld m0, 15 ; 1 << (SBC_PROTO_FIXED_SCALE - 1) as dword %endif - ANALYZE_MAC %1, %2, %3, %3, %4, %5, %4, %5, %6 + NIDN paddd, %1, %3 + NIDN paddd, %2, %4 %endmacro ;******************************************************************* ;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts); ;******************************************************************* -INIT_MMX mmx -cglobal sbc_analyze_4, 3, 3, 4, in, out, consts - ANALYZE_MAC_IN m0, m1, m0, m1, [scale_mask], [scale_mask], 0 - ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 16 - ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 32 - ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 48 - ANALYZE_MAC_IN m0, m1, m2, m3, m2, m3, 64 - - ANALYZE_MAC_REG m0, m2, m0, m0, m2, 80, pack - ANALYZE_MAC_REG m0, m2, m1, m1, m3, 96, pack - - movq [outq ], m0 - movq [outq+8], m2 +INIT_XMM sse2 +cglobal sbc_analyze_4, 3, 3, 5, in, out, consts + ANALYZE_MAC m1, m2, m1, m2, 0, u + ANALYZE_MAC m1, m2, m3, m4, 32, u + movu m3, [inq+64] + paddd m1, m0 + pmaddwd m3, [constsq+64] + paddd m1, m2 + paddd m1, m3 + + psrad m1, 16 + packssdw m1, m1 + pshufd m2, m1, q0000 + pmaddwd m2, [constsq+80] + pshufd m1, m1, q1111 + pmaddwd m1, [constsq+96] + paddd m1, m2 + + mova [outq], m1 RET @@ -82,34 +83,41 @@ cglobal sbc_analyze_4, 3, 3, 4, in, out, consts ;******************************************************************* ;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, 
const int16_t *consts); ;******************************************************************* -INIT_MMX mmx -cglobal sbc_analyze_8, 3, 3, 4, in, out, consts - ANALYZE_MAC_IN m0, m1, m0, m1, [scale_mask], [scale_mask], 0 - ANALYZE_MAC_IN m2, m3, m2, m3, [scale_mask], [scale_mask], 16 - ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 32 - ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 48 - ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 64 - ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 80 - ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 96 - ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 112 - ANALYZE_MAC_IN m0, m1, m4, m5, m4, m5, 128 - ANALYZE_MAC_IN m2, m3, m6, m7, m6, m7, 144 - - ANALYZE_MAC_REG m4, m5, m0, m4, m5, 160, pack - ANALYZE_MAC_REG m4, m5, m1, m6, m7, 192, pack - ANALYZE_MAC_REG m4, m5, m2, m6, m7, 224, pack - ANALYZE_MAC_REG m4, m5, m3, m6, m7, 256, pack - - movq [outq ], m4 - movq [outq+8], m5 - - ANALYZE_MAC_REG m0, m5, m0, m0, m5, 176, no - ANALYZE_MAC_REG m0, m5, m1, m1, m7, 208, no - ANALYZE_MAC_REG m0, m5, m2, m2, m7, 240, no - ANALYZE_MAC_REG m0, m5, m3, m3, m7, 272, no - - movq [outq+16], m0 - movq [outq+24], m5 +INIT_XMM sse2 +cglobal sbc_analyze_8, 3, 3, 6, in, out, consts + ANALYZE_MAC m1, m2, m1, m2, 0, a + ANALYZE_MAC m1, m2, m3, m4, 32, a + paddd m1, m0 + ANALYZE_MAC m1, m2, m3, m4, 64, a + ANALYZE_MAC m1, m2, m3, m4, 96, a + paddd m2, m0 + ANALYZE_MAC m1, m2, m3, m4, 128, a + + psrad m1, 16 + psrad m2, 16 + packssdw m1, m2 + + pshufd m2, m1, q0000 + pmaddwd m0, m2, [constsq+160] + pshufd m3, m1, q1111 + pmaddwd m2, [constsq+176] + pmaddwd m4, m3, [constsq+192] + pshufd m5, m1, q2222 + pmaddwd m3, [constsq+208] + paddd m0, m4 + pmaddwd m4, m5, [constsq+224] + pshufd m1, m1, q3333 + pmaddwd m5, [constsq+240] + paddd m2, m3 + pmaddwd m3, m1, [constsq+256] + paddd m0, m4 + pmaddwd m1, [constsq+272] + paddd m0, m3 + paddd m2, m5 + + mova [outq], m0 + paddd m2, m1 + mova [outq+16], m2 RET diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c index d959f76f8c..acca0bbdc9 
100644 --- a/libavcodec/x86/sbcdsp_init.c +++ b/libavcodec/x86/sbcdsp_init.c @@ -26,7 +26,7 @@ /** * @file - * SBC MMX optimization for some basic "building bricks" + * SBC DSP optimization for some basic "building bricks" */ #include "libavutil/attributes.h" @@ -34,8 +34,8 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/sbcdsp.h" -void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts); -void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts); +void ff_sbc_analyze_4_sse2(const int16_t *in, int32_t *out, const int16_t *consts); +void ff_sbc_analyze_8_sse2(const int16_t *in, int32_t *out, const int16_t *consts); void ff_sbc_calc_scalefactors_mmx(const int32_t sb_sample_f[16][2][8], uint32_t scale_factor[2][8], int blocks, int channels, int subbands); @@ -45,8 +45,10 @@ av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s) int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMX(cpu_flags)) { - s->sbc_analyze_4 = ff_sbc_analyze_4_mmx; - s->sbc_analyze_8 = ff_sbc_analyze_8_mmx; s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx; } + if (EXTERNAL_SSE2(cpu_flags)) { + s->sbc_analyze_4 = ff_sbc_analyze_4_sse2; + s->sbc_analyze_8 = ff_sbc_analyze_8_sse2; + } } diff --git a/tests/checkasm/sbcdsp.c b/tests/checkasm/sbcdsp.c index aefe066fe2..3bef11a5e7 100644 --- a/tests/checkasm/sbcdsp.c +++ b/tests/checkasm/sbcdsp.c @@ -41,7 +41,7 @@ static void check_sbc_analyze(SBCDSPContext *sbcdsp) DECLARE_ALIGNED(SBC_ALIGN, int32_t, out_ref)[SBC_MAX_SUBBANDS]; DECLARE_ALIGNED(SBC_ALIGN, int32_t, out_new)[SBC_MAX_SUBBANDS]; - declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *in, int32_t *out, const int16_t *consts); + declare_func(void, const int16_t *in, int32_t *out, const int16_t *consts); for (int i = 0; i < 2; ++i) { if (check_func(i ? sbcdsp->sbc_analyze_8 : sbcdsp->sbc_analyze_4, "sbc_analyze_%u", i ? 
8 : 4)) { _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
